COMPMID-3776: Indirect GEMM

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I51a1b0f098bc3a8c408c50c92221e4df3061e12c
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4343
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Sang-Hoon Park <sang-hoon.park@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
author: Georgios Pinitas <georgios.pinitas@arm.com> 2020-11-02 01:37:17 +0000
committer: Georgios Pinitas <georgios.pinitas@arm.com> 2020-11-12 15:59:25 +0000
commit: c0b6f76561580414f08633a804fc548ccad65659 (patch)
tree: 4d46b7f479de04f799e29095392948aeb370c029
parent: 824061d9910ebb42cbe46b677c0b843db212c9a2 (diff)
download: ComputeLibrary-c0b6f76561580414f08633a804fc548ccad65659.tar.gz
235 files changed, 79188 insertions, 53560 deletions
diff --git a/Android.bp b/Android.bp
index 8d931c23c8..98b00cf5ba 100644
--- a/Android.bp
+++ b/Android.bp
@@ -367,10 +367,12 @@ cc_library_static {
         "src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp",
         "src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp",
         "src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp",
+        "src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp",
         "src/core/NEON/kernels/arm_gemm/mergeresults.cpp",
         "src/core/NEON/kernels/arm_gemm/misc.cpp",
         "src/core/NEON/kernels/arm_gemm/quantized.cpp",
-        "src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp",
+        "src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp",
+        "src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp",
         "src/core/NEON/kernels/convolution/common/padding.cpp",
         "src/core/NEON/kernels/convolution/common/qasymm8.cpp",
         "src/core/NEON/kernels/convolution/common/qsymm8.cpp",
@@ -669,9 +671,9 @@ cc_library_static {
         "src/runtime/NEON/functions/NEFuseBatchNormalization.cpp",
         "src/runtime/NEON/functions/NEGEMM.cpp",
         "src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp",
+        "src/runtime/NEON/functions/NEGEMMConv2d.cpp",
         "src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp",
         "src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp",
-        "src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp",
         "src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp",
         "src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp",
         "src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp",
@@ -727,7 +729,6 @@ cc_library_static {
         "src/runtime/NEON/functions/NEScale.cpp",
         "src/runtime/NEON/functions/NEScharr3x3.cpp",
         "src/runtime/NEON/functions/NESelect.cpp",
-        "src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp",
         "src/runtime/NEON/functions/NESlice.cpp",
         "src/runtime/NEON/functions/NESobel3x3.cpp",
         "src/runtime/NEON/functions/NESobel5x5.cpp",
@@ -779,69 +780,71 @@ cc_library_static {
         },
         arm64: {
             srcs: [
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/x1.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55r1.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/x1.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/a55.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/a55.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/a55.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/a55.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/a55.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/a55.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/a55.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp",
                 
             ],
         },
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 306bdc6706..2e639c4be4 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -137,10 +137,11 @@ enum class DataLayoutDimension
 /** Available ConvolutionMethod*/
 enum class ConvolutionMethod
 {
-    GEMM,     /**< Convolution using GEMM */
-    DIRECT,   /**< Direct convolution */
-    WINOGRAD, /**< Convolution using Winograd */
-    FFT       /**< Convolution using FFT */
+    GEMM,        /**< Convolution using GEMM */
+    GEMM_CONV2D, /**< Direct 2D GEMM convolution */
+    DIRECT,      /**< Direct convolution */
+    WINOGRAD,    /**< Convolution using Winograd */
+    FFT          /**< Convolution using FFT */
 };
 
 /** Available DepthwiseConvolutionFunction*/
diff --git a/arm_compute/runtime/FunctionDescriptors.h b/arm_compute/runtime/FunctionDescriptors.h
index 16d6c345e2..1f4216eb21 100644
--- a/arm_compute/runtime/FunctionDescriptors.h
+++ b/arm_compute/runtime/FunctionDescriptors.h
@@ -23,6 +23,9 @@
  */
 #ifndef ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H
 #define ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H
+
+#include "arm_compute/core/Types.h"
+
 #include <utility>
 
 namespace arm_compute
@@ -48,5 +51,26 @@ struct FFT2DInfo
     unsigned int axis1{ 1 };                         /**< Axis to run second pass on. If same, multiple transforms are performed on single axis*/
     FFTDirection direction{ FFTDirection::Forward }; /**< Direction of the FFT. */
 };
+
+/** Descriptor holding the parameters of a 2D convolution function */
+struct Conv2dInfo
+{
+    Conv2dInfo() = default;
+
+    Conv2dInfo(const PadStrideInfo       &conv_info,
+               const Size2D              &dilation,
+               const ActivationLayerInfo &act_info,
+               bool                       enable_fast_math,
+               unsigned int               num_groups)
+        : conv_info(conv_info), dilation(dilation), act_info(act_info), enable_fast_math(enable_fast_math), num_groups(num_groups)
+    {
+    }
+
+    PadStrideInfo       conv_info{};               /**< Padding and stride information */
+    Size2D              dilation{ 1U, 1U };        /**< Dilation; defaults to 1x1 */
+    ActivationLayerInfo act_info{};                /**< Activation layer information */
+    bool                enable_fast_math{ false }; /**< Fast-math flag; disabled by default */
+    unsigned int        num_groups{ 1 };           /**< Number of groups; defaults to 1 */
+};
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H */
diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h
index a97fa3b81a..e7d59e1608 100644
--- a/arm_compute/runtime/NEON/NEFunctions.h
+++ b/arm_compute/runtime/NEON/NEFunctions.h
@@ -78,9 +78,9 @@
 #include "arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMM.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h"
diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
index 54dae57752..a061dc7b04 100644
--- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
@@ -26,16 +26,15 @@
 
 #include "arm_compute/runtime/IFunction.h"
 
+#include "arm_compute/core/ITensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
+
 #include <memory>
 
 namespace arm_compute
 {
+// Forward declarations
 class ITensor;
 
 /** Basic function to simulate a convolution layer. This function calls one of the following NEON functions:
@@ -158,5 +157,5 @@ private:
     std::shared_ptr<IMemoryManager> _memory_manager;
     std::unique_ptr<IFunction>      _function; /**< Function to run */
 };
-}
+} // namespace arm_compute
 #endif /* ARM_COMPUTE_NECONVOLUTIONLAYER_H */
 \ No newline at end of file
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h b/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h
index ac77acf69d..8f9498d0f5 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h
@@ -32,6 +32,28 @@
 
 namespace arm_compute
 {
+/** Convolution methods supported by the assembly gemm interface */
+enum class AsmConvMethod
+{
+    Im2Col, /**< Default method selected by AsmGemmInfo */
+    Indirect,
+    Conv
+};
+
+struct AsmGemmInfo
+{
+    AsmConvMethod           method{ AsmConvMethod::Im2Col };  /**< Convolution method; defaults to Im2Col */
+    PadStrideInfo           ps_info{};                        /**< Padding and stride information */
+    ActivationLayerInfo     activation_info{};                /**< Activation layer information */
+    GEMMLowpOutputStageInfo output_stage{};                   /**< GEMMLowp output stage information */
+    bool                    negated_offsets{ true };          /**< Defaults to true. NOTE(review): meaning of the offset negation is not visible here - confirm */
+    bool                    reinterpret_input_as_3d{ false }; /**< Defaults to false. NOTE(review): presumably reinterprets the input as 3D - confirm */
+    bool                    depth_output_gemm3d{ false };     /**< Defaults to false. NOTE(review): name suggests a depth but the type is bool - confirm intent */
+    int64_t                 padding_top{ 0 };                 /**< Top padding; defaults to 0 */
+    int64_t                 padding_left{ 0 };                /**< Left padding; defaults to 0 */
+    float                   padding_value{ 0.f };             /**< Padding value; defaults to 0 */
+};
+
 /** Assembly kernel glue */
 class NEGEMMAssemblyDispatch : public IFunction
 {
@@ -55,33 +77,28 @@ public:
         virtual ~IFallback()               = default;
     };
 
-private:
-    /** Interface for the arm_gemm fallback */
-    std::unique_ptr<IFallback> _arm_gemm;
-    MemoryGroup                _memory_group;    /**< Function memory group */
-    IWeightsManager           *_weights_manager; /**< Pointer to the weights manager */
 public:
     /** If supported create a Compute Library function else fallback to the arm_gemm function.
      *
-     * @param[in]  a         Input tensor (Matrix A)
-     * @param[in]  b         Input tensor (Matrix B)
-     * @param[in]  c         Input tensor (Matrix C) used to pass the bias for quantized calculations
-     * @param[out] d         Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
-     * @param[in]  gemm_info GEMM meta-data
+     * @param[in]  a    Input tensor (Matrix A)
+     * @param[in]  b    Input tensor (Matrix B)
+     * @param[in]  c    Input tensor (Matrix C) used to pass the bias for quantized calculations
+     * @param[out] d    Output tensor to store the result of matrix multiplication. Data type supported: same as @p a.
+     * @param[in]  info GEMM meta-data
      */
-    void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, const GEMMInfo &gemm_info);
+    void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, const AsmGemmInfo &info);
 
     /** Indicates whether or not this function can be used to process the given parameters.
      *
-     * @param[in] a         Input tensor info (Matrix A)
-     * @param[in] b         Input tensor info (Matrix B)
-     * @param[in] c         Input tensor info (Matrix C) used to pass the bias for quantized calculations
-     * @param[in] d         Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
-     * @param[in] gemm_info GEMM meta-data
+     * @param[in] a    Input tensor info (Matrix A)
+     * @param[in] b    Input tensor info (Matrix B)
+     * @param[in] c    Input tensor info (Matrix C) used to pass the bias for quantized calculations
+     * @param[in] d    Output tensor info (destination of the matrix multiplication result). Data type supported: same as @p a.
+     * @param[in] info GEMM meta-data
      *
      * @return a status.
      */
-    static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const GEMMInfo &gemm_info);
+    static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info);
     /** Checks if activation is supported by the gemm assembly dispatcher
      *
      * @param[in] activation Activation to check
@@ -94,10 +111,15 @@ public:
      * @return True if the function is configured and ready to run
      */
     bool is_configured() const;
+
     // Inherited methods overridden:
-    /** Runs a preparation step, usually for pre-transposing matrix b */
     void prepare() override;
     void run() override;
+
+private:
+    std::unique_ptr<IFallback> _arm_gemm;        /**< Interface for the arm_gemm fallback */
+    MemoryGroup                _memory_group;    /**< Function memory group */
+    IWeightsManager           *_weights_manager; /**< Pointer to the weights manager */
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEGEMMASSEMBLYDISPATCH_H */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
new file mode 100644
index 0000000000..7cae39397f
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEGEMMCONV2D_H
+#define ARM_COMPUTE_NEGEMMCONV2D_H
+
+#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
+#include "arm_compute/runtime/NEON/functions/NEPermute.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <memory>
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+/** Basic function to compute the convolution layer. Supports only NHWC data layout.
+ *
+ * This function calls the following NEON kernels/functions:
+ *
+ * -# @ref NEGEMMAssemblyDispatch
+ * -# @ref NEActivationLayer, in case activation cannot be fused in the assembly dispatch
+ *
+ * Weights are transformed from OHWI to HWIO format using the following kernels:
+ * -# @ref NEPermute
+ */
+class NEGEMMConv2d : public IFunction
+{
+public:
+    /** Constructor taking an optional memory manager (defaults to nullptr) */
+    NEGEMMConv2d(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMConv2d(const NEGEMMConv2d &) = delete;
+    /** Default move constructor */
+    NEGEMMConv2d(NEGEMMConv2d &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMConv2d &operator=(const NEGEMMConv2d &) = delete;
+    /** Default move assignment operator */
+    NEGEMMConv2d &operator=(NEGEMMConv2d &&) = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input   Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+     *                     while every optional dimension from 4 and above represent a batch of inputs.
+     *                     Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+     * @param[in]  weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+     *                     Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+     * @param[in]  biases  Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+     *                     Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+     * @param[out] output  Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+     *                     Data types supported: Same as @p input.
+     * @param[in]  info    Convolution layer descriptor
+     */
+    void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConv2d
+     *
+     * @param[in] input   Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+     *                    while every optional dimension from 4 and above represent a batch of inputs.
+     *                    Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+     * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+     *                    Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+     * @param[in] biases  Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+     *                    Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+     * @param[in] output  Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+     *                    Data types supported: Same as @p input.
+     * @param[in] info    Convolution layer descriptor
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &info);
+
+    // Inherited methods overridden:
+    void run() override;
+    void prepare() override;
+
+private:
+    NEGEMMAssemblyDispatch _gemm_asm_func;        /**< Assembly GEMM dispatch function */
+    NEActivationLayer      _activation_func;      /**< Activation function, for when activation cannot be fused in the assembly dispatch */
+    NEPermute              _weights_permute_func; /**< Permute function transforming the weights from OHWI to HWIO */
+    const ITensor         *_original_weights;
+    Tensor                 _permuted_weights;
+    bool                   _is_prepared;
+    bool                   _run_activation;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEGEMMCONV2D_H */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h
deleted file mode 100644
index 961b1901e7..0000000000
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWPASSEMBLYMATRIXMULTIPLYCORE_H
-#define ARM_COMPUTE_NEGEMMLOWPASSEMBLYMATRIXMULTIPLYCORE_H
-
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-class NEGEMMInterleave4x4Kernel;
-class NEGEMMTranspose1xWKernel;
-class NEGEMMLowpMatrixMultiplyKernel;
-
-/** Basic function to execute matrix multiply assembly kernels. */
-class NEGEMMLowpAssemblyMatrixMultiplyCore : public IFunction
-{
-public:
-    /** Constructor */
-    NEGEMMLowpAssemblyMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-    /** Destructor */
-    ~NEGEMMLowpAssemblyMatrixMultiplyCore();
-
-    /** Initialise the kernel's inputs, output
-     *
-     * @param[in]  a      First input tensor  (Matrix A). Data type supported: U8, S8.
-     * @param[in]  b      Second input tensor (Matrix B). Data type supported: same as @p a
-     * @param[in]  c      Third input tensor (Matrix C). Data type supported: same as @p a
-     * @param[out] output Output tensor. Data type supported: Data type supported: U32, S32
-     */
-    void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output);
-
-    // Inherited methods overridden:
-    void run() override;
-
-private:
-    MemoryGroup                                     _memory_group;
-    NEGEMMAssemblyDispatch                          _asm_glue;
-    std::unique_ptr<NEGEMMLowpMatrixMultiplyKernel> _mm_kernel;
-    std::unique_ptr<NEGEMMInterleave4x4Kernel>      _mtx_a_reshape_kernel;
-    std::unique_ptr<NEGEMMTranspose1xWKernel>       _mtx_b_reshape_kernel;
-    Tensor                                          _tmp_a;
-    Tensor                                          _tmp_b;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMLOWPASSEMBLYMATRIXMULTIPLYCORE_H */
diff --git a/docs/06_functions_list.dox b/docs/06_functions_list.dox
index ac944610dc..e6924211e2 100644
--- a/docs/06_functions_list.dox
+++ b/docs/06_functions_list.dox
@@ -141,8 +141,8 @@ namespace arm_compute
         - @ref NEGaussianPyramidOrb
     - @ref NEGEMM
     - @ref NEGEMMAssemblyDispatch
+    - @ref NEGEMMConv2d
     - @ref NEGEMMConvolutionLayer
-    - @ref NEGEMMLowpAssemblyMatrixMultiplyCore
     - @ref NEGEMMLowpMatrixMultiplyCore
     - @ref NEGenerateProposalsLayer
     - @ref NEHarrisCorners
@@ -173,7 +173,6 @@ namespace arm_compute
     - @ref NERNNLayer
     - @ref NEROIPoolingLayer
     - @ref NEScale
-    - @ref NESimpleAssemblyFunction
     - @ref NESobel5x5
     - @ref NESobel7x7
     - @ref NESoftmaxLayerGeneric &lt;IS_LOG&gt;
diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h
index 67562933d4..79c4bcea25 100644
--- a/src/core/NEON/NEKernels.h
+++ b/src/core/NEON/NEKernels.h
@@ -72,7 +72,6 @@
 #include "src/core/NEON/kernels/NEFlattenLayerKernel.h"
 #include "src/core/NEON/kernels/NEFloorKernel.h"
 #include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h"
-#include "src/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
 #include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
 #include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
 #include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
diff --git a/src/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h b/src/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h
deleted file mode 100644
index 775a2c06ab..0000000000
--- a/src/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMASSEMBLYBASE_H
-#define ARM_COMPUTE_NEGEMMASSEMBLYBASE_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Base class for GEMM NEON kernels implemented in Assembly. */
-class NEGEMMAssemblyBaseKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGEMMAssemblyBaseKernel";
-    }
-    /** Constructor */
-    NEGEMMAssemblyBaseKernel()
-        : _input0(nullptr), _input1(nullptr), _output(nullptr), _workspace(nullptr), _alpha(1.f), _beta(0.f), _is_transposed_0(false), _is_transposed_1(false)
-    {
-    }
-
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGEMMAssemblyBaseKernel(const NEGEMMAssemblyBaseKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGEMMAssemblyBaseKernel &operator=(const NEGEMMAssemblyBaseKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEGEMMAssemblyBaseKernel(NEGEMMAssemblyBaseKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEGEMMAssemblyBaseKernel &operator=(NEGEMMAssemblyBaseKernel &&) = default;
-
-    virtual ~NEGEMMAssemblyBaseKernel() = default;
-
-    /** Initialise the kernel's input and output.
-     *
-     * The computed function is C = a * AxB + b * C.
-     *
-     * @param[in]     input0          Input tensor containing the Matrix A. Data types supported: F32
-     * @param[in]     input1          Input tensor containing the Matrix B. Data types supported: same as @p input0
-     * @param[in,out] output          Output tensor to store the result of matrix multiplication. If @p beta is not zero the values are multiplied by @p beta before the result is accumulated. Otherwise the values are overwritten by the result. Data types supported: same as @p input0.
-     * @param[out]    workspace       Space for intermediate results.
-     * @param[in]     alpha           Weight of the matrix product
-     * @param[in]     beta            Weight of the accumulation.
-     * @param[in]     is_transposed_0 (Optional)True if @p input0 is transposed else false. (Defaults to false)
-     * @param[in]     is_transposed_1 (Optional)True if @p input1 is transposed else false. (Defaults to false)
-     */
-    void configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha = 1.f, float beta = 0.f, bool is_transposed_0 = false, bool is_transposed_1 = false)
-    {
-        internal_configure(input0, input1, output, workspace, alpha, beta, is_transposed_0, is_transposed_1);
-    }
-
-protected:
-    virtual void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool _is_transposed_0, bool _is_transposed_1) = 0;
-
-    const ITensor *_input0;
-    const ITensor *_input1;
-    ITensor       *_output;
-    ITensor       *_workspace;
-    float          _alpha;
-    float          _beta;
-    bool           _is_transposed_0;
-    bool           _is_transposed_1;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMASSEMBLYBASE_H*/
diff --git a/src/core/NEON/kernels/arm_gemm/convolver.hpp b/src/core/NEON/kernels/arm_gemm/convolver.hpp
new file mode 100644
index 0000000000..1cd959523f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/convolver.hpp
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include "convolution_parameters.hpp"
+
+#include <algorithm>
+#include <cstddef>
+#include <tuple>
+#include <vector>
+
+namespace arm_gemm {
+
+// Class to assist with convolution calculations.
+//
+// This is framed as a hierarchy of objects:
+//
+//  - Top level object which depends only on convolution parameters.  This sets up std::vectors for the padding and
+//    kernel offset arrays.  From this you can request:
+//
+//  - Mid level object (e.g. instantiated at start of 'ConvolutionInterleave').  This holds specifics about the
+//    input tensor, and the desired column range.  Calculations specific to this can be done once when this is set
+//    up.  From this you can request:
+//
+//  - Low level object (instantiated for each range of rows).  This contains methods to actually populate a row
+//    pointer array.
+
+
+template<typename T>
+class convolver {
+private:
+    const ConvolutionParameters  m_params;
+
+    // Vector of padding data
+    const std::vector<T>         m_pad_row;
+
+    // X/Y offsets for each kernel position
+    std::vector<int>             m_kernel_y;
+    std::vector<int>             m_kernel_x;
+
+    class column_handler {
+    private:
+        const convolver<T>          &m_parent;
+
+        // Base/stride of input image
+        const T * const              m_input_base;
+        const size_t                 m_input_stride;
+
+        // Starting kernel point and channel offset within that point
+        const unsigned int           m_start_pos;
+        const unsigned int           m_start_offset;
+
+        // Total length to process, rounded length of each input channel block.
+        const unsigned int           m_length;
+        const unsigned int           m_rounded_stringlen;
+
+        class row_handler {
+        private:
+            const convolver<T>          &m_convolver;
+            const column_handler        &m_parent;
+
+            // These variables track progress through the current block of rows
+            unsigned int                 m_start_output_y=0;
+            unsigned int                 m_start_output_x=0;
+
+            unsigned int                 m_length_remaining=0;
+            unsigned int                 m_current_pos=0;
+
+            unsigned int                 m_active_height=0;
+
+        public:
+            row_handler(const column_handler &parent, unsigned int start_row, unsigned int active_height) :
+                m_convolver(parent.m_parent),
+                m_parent(parent),
+                m_start_output_y(start_row / m_convolver.m_params.output_width),
+                m_start_output_x(start_row % m_convolver.m_params.output_width),
+                m_length_remaining(m_parent.m_length),
+                m_current_pos(m_parent.m_start_pos),
+                m_active_height(active_height) { }
+
+            bool finished() const {
+                return (m_length_remaining == 0);
+            }
+
+            std::tuple<unsigned int, unsigned int> next_block(const T ** const row_ptr) {
+                if (finished()) {
+                    return std::make_tuple(0U, 0U); // nothing left to process
+                }
+
+                // "in_width" is the amount of data that will be read in (copied)
+                // "out_width" is the total amount of data that will be produced (including padding)
+                unsigned int offset = (m_current_pos == m_parent.m_start_pos) ? m_parent.m_start_offset : 0;
+                unsigned int in_width = std::min(m_length_remaining, static_cast<unsigned int>(m_convolver.m_params.input_channels) - offset);
+                unsigned int out_width = std::min(m_length_remaining, m_parent.m_rounded_stringlen - offset);
+
+                unsigned int output_y = m_start_output_y;
+                unsigned int output_x = m_start_output_x;
+
+                for (unsigned int row=0; row<m_active_height; row++) {
+                    int input_y = (output_y * m_convolver.m_params.output_stride_h) + m_convolver.m_kernel_y[m_current_pos];
+                    int input_x = (output_x * m_convolver.m_params.output_stride_w) + m_convolver.m_kernel_x[m_current_pos];
+
+                    // Out-of-bounds points will read the padding data,
+                    // otherwise find the correct address in the input image.
+                    if (input_y < 0 || input_y >= m_convolver.m_params.input_height || input_x < 0 || input_x >= m_convolver.m_params.input_width) {
+                        row_ptr[row] = m_convolver.m_pad_row.data();
+                    } else {
+                        row_ptr[row] = m_parent.m_input_base + ((input_y * m_convolver.m_params.input_width) + input_x) * m_parent.m_input_stride;
+                    }
+
+                    output_x++; // step to the next output point, wrapping to the next output row
+                    if (output_x == m_convolver.m_params.output_width) {
+                        output_y++;
+                        output_x=0;
+                    }
+                }
+
+                m_current_pos++; // the next call handles the next kernel point
+                m_length_remaining-=out_width;
+
+                return std::make_tuple(in_width, offset); // make_tuple: a braced return needs a non-explicit tuple constructor
+            }
+        }; // end of "row handler" class
+
+    public:
+        column_handler(const convolver<T> &parent, const T *input_base, size_t input_stride,
+                       unsigned int k_start, unsigned int k_end, unsigned int rounded_stringlen)
+                     : m_parent(parent), m_input_base(input_base), m_input_stride(input_stride),
+                       m_start_pos(k_start / rounded_stringlen),
+                       m_start_offset(k_start % rounded_stringlen),
+                       m_length(k_end - k_start),
+                       m_rounded_stringlen(rounded_stringlen) { }
+
+        row_handler process_rows(unsigned int start_row, unsigned int active_height) const {
+            return row_handler(*this, start_row, active_height);
+        }
+    }; // end of "column handler" class
+
+public:
+    convolver(ConvolutionParameters params) :
+        m_params (params), m_pad_row(params.input_channels, static_cast<T>(params.padding_value)),
+        m_kernel_y(params.kernel_width * params.kernel_height, 0),
+        m_kernel_x(params.kernel_width * params.kernel_height, 0) {
+
+        // Kernel points are addressed across, then down (assumed weight layout is WHIO)
+        for (unsigned int ky=0; ky<params.kernel_height; ky++) {
+            for (unsigned int kx=0; kx<params.kernel_width; kx++) {
+                unsigned int n = (ky * params.kernel_width) + kx;
+                m_kernel_y[n] = ky - params.padding_top;
+                m_kernel_x[n] = kx - params.padding_left;
+            }
+        }
+    }
+
+    column_handler process_columns(const T *input_base, size_t input_stride,
+                                   unsigned int k_start, unsigned int k_end, unsigned int rounded_stringlen) const {
+        return column_handler(*this, input_base, input_stride, k_start, k_end, rounded_stringlen);
+    }
+};
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
index f3b66528a4..96b9734221 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
@@ -25,93 +25,78 @@
 #include "bfloat.hpp"
 #include "gemm_common.hpp"
 #include "gemm_hybrid.hpp"
+#include "gemm_hybrid_indirect.hpp"
 #include "gemm_implementation.hpp"
 #include "gemm_interleaved.hpp"
 #include "gemv_batched.hpp"
 #include "gemv_pretransposed.hpp"
 
-#include "kernels/a64_interleaved_bf16fp32_dot_12x8.hpp"
-#include "kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp"
-#include "kernels/a64_sgemm_12x8.hpp"
+#include "kernels/a64_hybrid_bf16fp32_dot_6x16.hpp"
+#include "kernels/a64_interleaved_bf16fp32_dot_8x12.hpp"
+#include "kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp"
+#include "kernels/a64_sgemm_8x12.hpp"
 #include "kernels/a32_sgemm_8x6.hpp"
-#include "kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp"
-#include "kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp"
-#include "kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp"
-#include "kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp"
-#include "kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp"
-#include "kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp"
+#include "kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp"
+#include "kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp"
+#include "kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp"
 
 namespace arm_gemm {
 
 static const GemmImplementation<bfloat16, float> gemm_bf16_methods[] =
 {
 #ifdef V8P6_BF
-# ifdef __ARM_FEATURE_SVE
-{
-    GemmMethod::GEMM_HYBRID,
-    "hybrid_bf16fp32_mmla_6VLx2",
-    [](const GemmArgs &args) { return (args._Ksize>=8); },
-    [](const GemmArgs &args) { return ((args._Msize <= 4) && (args._Nsize <= hybrid_bf16fp32_mmla_6VLx2::out_width())); },
-    [](const GemmArgs &args) { return new GemmHybrid<hybrid_bf16fp32_mmla_6VLx2, bfloat16, float>(args); }
-},
-{
-    GemmMethod::GEMM_HYBRID,
-    "hybrid_bf16fp32_mmla_8VLx2",
-    [](const GemmArgs &args) { return (args._Ksize>=8); },
-    [](const GemmArgs &args) { return (args._Msize <= 4); },
-    [](const GemmArgs &args) { return new GemmHybrid<hybrid_bf16fp32_mmla_8VLx2, bfloat16, float>(args); }
-},
-{
-    GemmMethod::GEMM_HYBRID,
-    "hybrid_bf16fp32_mmla_4VLx4",
-    [](const GemmArgs &args) { return (args._Ksize>=8); },
-    [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)); },
-    [](const GemmArgs &args) { return new GemmHybrid<hybrid_bf16fp32_mmla_4VLx4, bfloat16, float>(args); }
-},
-{
-    GemmMethod::GEMM_HYBRID,
-    "hybrid_bf16fp32_dot_4VLx4",
-    [](const GemmArgs &args) { return (args._Ksize>=8); },
-    [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)); },
-    [](const GemmArgs &args) { return new GemmHybrid<hybrid_bf16fp32_dot_4VLx4, bfloat16, float>(args); }
-},
+#ifdef __ARM_FEATURE_SVE
 { // gemm_bf16_interleaved
     GemmMethod::GEMM_INTERLEAVED,
-    "interleaved_bf16fp32_mmla_3VLx8",
+    "sve_interleaved_bf16fp32_mmla_8x3VL",
     [](const GemmArgs &args) { return (args._Ksize>4); },
     nullptr,
-    [](const GemmArgs &args) { return new GemmInterleaved<interleaved_bf16fp32_mmla_3VLx8, bfloat16, float>(args); }
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_bf16fp32_mmla_8x3VL, bfloat16, float>(args); }
+},
+{
+    GemmMethod::GEMM_HYBRID,
+    "sve_hybrid_bf16fp32_dot_6x4VL",
+    nullptr,
+    [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)); },
+    [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_bf16fp32_dot_6x4VL, bfloat16, float>(args); }
 },
 { // gemm_bf16_interleaved
     GemmMethod::GEMM_INTERLEAVED,
-    "interleaved_bf16fp32_dot_3VLx8",
+    "sve_interleaved_bf16fp32_dot_8x3VL",
     [](const GemmArgs &args) { return (args._Ksize>2); },
     nullptr,
-    [](const GemmArgs &args) { return new GemmInterleaved<interleaved_bf16fp32_dot_3VLx8, bfloat16, float>(args); }
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_bf16fp32_dot_8x3VL, bfloat16, float>(args); }
 },
 # endif // SVE
 { // gemm_bf16_interleaved
     GemmMethod::GEMM_INTERLEAVED,
-    "interleaved_bf16fp32_mmla_12x8",
+    "a64_interleaved_bf16fp32_mmla_8x12",
     [](const GemmArgs &args) { return (args._Ksize>4); },
     nullptr,
-    [](const GemmArgs &args) { return new GemmInterleaved<interleaved_bf16fp32_mmla_12x8, bfloat16, float>(args); }
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_bf16fp32_mmla_8x12, bfloat16, float>(args); }
+},
+{
+    GemmMethod::GEMM_HYBRID,
+    "a64_hybrid_bf16fp32_dot_6x16",
+    nullptr,
+    nullptr,
+    [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_bf16fp32_dot_6x16, bfloat16, float>(args); }
 },
 { // gemm_bf16_interleaved
     GemmMethod::GEMM_INTERLEAVED,
-    "interleaved_bf16fp32_dot_12x8",
+    "a64_interleaved_bf16fp32_dot_8x12",
     [](const GemmArgs &args) { return (args._Ksize>2); },
     nullptr,
-    [](const GemmArgs &args) { return new GemmInterleaved<interleaved_bf16fp32_dot_12x8, bfloat16, float>(args); }
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_bf16fp32_dot_8x12, bfloat16, float>(args); }
 },
 #endif // V8P6_BF
 #ifdef __aarch64__
 {
     GemmMethod::GEMM_INTERLEAVED,
-    "sgemm_12x8",
+    "a64_sgemm_8x12",
     nullptr,
     nullptr,
-    [](const GemmArgs &args) { return new GemmInterleaved<sgemm_12x8, bfloat16, float>(args); }
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_sgemm_8x12, bfloat16, float>(args); }
 },
 #elif defined(__arm__)
 {
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
index 91012218e5..de2e4f2c2b 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
@@ -29,15 +29,17 @@
 
 #include "gemm_common.hpp"
 #include "gemm_hybrid.hpp"
+#include "gemm_hybrid_indirect.hpp"
 #include "gemm_implementation.hpp"
 #include "gemm_interleaved.hpp"
 #include "gemm_interleaved_pretransposed_2d.hpp"
 
 #include "kernels/a32_sgemm_8x6.hpp"
-#include "kernels/a64_hgemm_24x8.hpp"
-#include "kernels/a64_sgemm_12x8.hpp"
-#include "kernels/sve_hybrid_fp16_mla_4VLx4.hpp"
-#include "kernels/sve_interleaved_fp16_mla_3VLx8.hpp"
+#include "kernels/a64_hgemm_8x24.hpp"
+#include "kernels/a64_hybrid_fp16_mla_6x32.hpp"
+#include "kernels/a64_sgemm_8x12.hpp"
+#include "kernels/sve_hybrid_fp16_mla_6x4VL.hpp"
+#include "kernels/sve_interleaved_fp16_mla_8x3VL.hpp"
 
 namespace arm_gemm {
 
@@ -45,61 +47,51 @@ static const GemmImplementation<__fp16, __fp16> gemm_fp16_methods[] = {
 #if defined(__ARM_FEATURE_SVE)
 {
     GemmMethod::GEMM_HYBRID,
-    "hybrid_fp16_mla_4VLx4",
-    [](const GemmArgs &args) { return (args._Ksize >= 8); },
+    "sve_hybrid_fp16_mla_6x4VL",
+    nullptr,
     [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
-    [](const GemmArgs &args) { return new GemmHybrid<hybrid_fp16_mla_4VLx4, __fp16, __fp16>(args); }
+    [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp16_mla_6x4VL, __fp16, __fp16>(args); }
 },
 {
     GemmMethod::GEMM_INTERLEAVED,
-    "interleaved_fp16_mla_3VLx8",
+    "sve_interleaved_fp16_mla_8x3VL",
     [](const GemmArgs &args) { return (args._Ksize > 4); },
     nullptr,
-    [](const GemmArgs &args) { return new GemmInterleaved<interleaved_fp16_mla_3VLx8, __fp16, __fp16>(args); }
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_fp16_mla_8x3VL, __fp16, __fp16>(args); }
 },
 #endif
 
 #if defined(__aarch64__) && (defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(FP16_KERNELS))
 {
-    GemmMethod::GEMM_INTERLEAVED_2D,
-    "hgemm_24x8_2d",
+    GemmMethod::GEMM_HYBRID,
+    "a64_hybrid_fp16_mla_6x32",
 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     [](const GemmArgs &args) { return args._ci->has_fp16(); },
 #else
     nullptr,
 #endif
-    [](const GemmArgs &args) { return args._maxthreads >= 8; },
-    [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<hgemm_24x8, __fp16, __fp16>(args); }
+    [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+    [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp16_mla_6x32, __fp16, __fp16>(args); }
 },
 {
     GemmMethod::GEMM_INTERLEAVED,
-    "hgemm_24x8_1d",
+    "a64_hgemm_8x24",
 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     [](const GemmArgs &args) { return args._ci->has_fp16(); },
 #else
     nullptr,
 #endif
     nullptr,
-    [](const GemmArgs &args) { return new GemmInterleaved<hgemm_24x8, __fp16, __fp16>(args); }
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_hgemm_8x24, __fp16, __fp16>(args); }
 },
-
 #endif // aarch64 && FP16
 #ifdef __aarch64__
-//Pretranpose, 2D split
-{
-    GemmMethod::GEMM_INTERLEAVED_2D,
-    "sgemm_12x8_2d",
-    nullptr,
-    [](const GemmArgs &args) { return args._maxthreads >= 8; },
-    [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<sgemm_12x8, __fp16, __fp16>(args); }
-},
-//Tranpose, 1D split, with blockmanager
 {
     GemmMethod::GEMM_INTERLEAVED,
-    "sgemm_12x8_1d",
+    "a64_sgemm_8x12",
     nullptr,
     nullptr,
-    [](const GemmArgs &args) { return new GemmInterleaved<sgemm_12x8, __fp16, __fp16>(args); }
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_sgemm_8x12, __fp16, __fp16>(args); }
 },
 #elif defined(__arm__)
 {
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
index ddb438f06c..e9e335f500 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -24,6 +24,7 @@
 #include "arm_gemm.hpp"
 #include "gemm_common.hpp"
 #include "gemm_hybrid.hpp"
+#include "gemm_hybrid_indirect.hpp"
 #include "gemm_implementation.hpp"
 #include "gemm_interleaved.hpp"
 #include "gemm_interleaved_pretransposed_2d.hpp"
@@ -31,127 +32,130 @@
 #include "gemv_pretransposed.hpp"
 
 #include "kernels/a32_sgemm_8x6.hpp"
-#include "kernels/a64_hybrid_fp32_mla_16x4.hpp"
-#include "kernels/a64_hybrid_fp32_mla_4x8.hpp"
-#include "kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp"
-#include "kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp"
-#include "kernels/a64_sgemm_12x8.hpp"
-#include "kernels/a64_sgemv_pretransposed.hpp"
+#include "kernels/a64_gemv_fp32_mla_32.hpp"
+#include "kernels/a64_hybrid_fp32_mla_6x16.hpp"
+#include "kernels/a64_hybrid_fp32_mla_8x4.hpp"
+#include "kernels/a64_sgemm_8x12.hpp"
+#include "kernels/a64_smallK_hybrid_fp32_mla_6x4.hpp"
+#include "kernels/a64_smallK_hybrid_fp32_mla_8x4.hpp"
 
-#include "kernels/sve_hybrid_fp32_mla_4VLx4.hpp"
-#include "kernels/sve_hybrid_fp32_mmla_4VLx4.hpp"
-#include "kernels/sve_interleaved_fp32_mla_3VLx8.hpp"
-#include "kernels/sve_interleaved_fp32_mmla_3VLx8.hpp"
-#include "kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp"
+#include "kernels/sve_gemv_fp32_mla_8VL.hpp"
+#include "kernels/sve_hybrid_fp32_mla_6x4VL.hpp"
+#include "kernels/sve_hybrid_fp32_mla_8x1VL.hpp"
+#include "kernels/sve_interleaved_fp32_mla_8x3VL.hpp"
+#include "kernels/sve_interleaved_fp32_mmla_8x3VL.hpp"
+#include "kernels/sve_smallK_hybrid_fp32_mla_8x1VL.hpp"
 
 namespace arm_gemm {
 
 static const GemmImplementation<float, float> gemm_fp32_methods[] =
 {
+// GEMV cases - starting with 'gemv_batched' wrapper to turn batched GEMV into GEMM.
 {
     GemmMethod::GEMV_BATCHED,
     "gemv_batched",
-    [](const GemmArgs &args) { return (args._Msize==1) && (args._nbatches>1); },
+    [](const GemmArgs &args) { return args._Msize==1 && args._nbatches>1 && !args._indirect_input; },
     nullptr,
     [](const GemmArgs &args) { return new GemvBatched<float, float>(args); }
 },
 #ifdef __aarch64__
+#ifdef __ARM_FEATURE_SVE
 {
-    GemmMethod::GEMV_PRETRANSPOSED,
-    "sgemv_pretransposed",
-    [](const GemmArgs &args) { return (args._Msize==1 && args._nbatches==1); },
+    GemmMethod::GEMM_HYBRID,
+    "sve_gemv_fp32_mla_8VL",
+    [](const GemmArgs &args) { return args._Msize==1 && args._nbatches==1 && !args._indirect_input; },
     nullptr,
-    [](const GemmArgs &args) { return new GemvPretransposed<sgemv_pretransposed, float, float>(args); }
+    [](const GemmArgs &args) { return new GemvPretransposed<cls_sve_gemv_fp32_mla_8VL, float, float>(args); }
 },
-#if defined(__ARM_FEATURE_SVE) && defined(MMLA_FP32)
+#endif
 {
     GemmMethod::GEMM_HYBRID,
-    "hybrid_fp32_mmla_4VLx4",
-    [](const GemmArgs &args) { return (args._Ksize >= 4); },
-    [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
-    [](const GemmArgs &args) { return new GemmHybrid<hybrid_fp32_mmla_4VLx4, float, float>(args); }
+    "a64_gemv_fp32_mla_32",
+    [](const GemmArgs &args) { return args._Msize==1 && args._nbatches==1 && !args._indirect_input; },
+    nullptr,
+    [](const GemmArgs &args) { return new GemvPretransposed<cls_a64_gemv_fp32_mla_32, float, float>(args); }
 },
+
+// MMLA next due to higher throughput (SVE only)
+#if defined(__ARM_FEATURE_SVE) && defined(MMLA_FP32)
 {
     GemmMethod::GEMM_INTERLEAVED,
-    "interleaved_fp32_mmla_3VLx8",
+    "sve_interleaved_fp32_mmla_8x3VL",
     [](const GemmArgs &args) { return (args._Ksize>4); },
     nullptr,
-    [](const GemmArgs &args) { return new GemmInterleaved<interleaved_fp32_mmla_3VLx8, float, float>(args); }
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_fp32_mmla_8x3VL, float, float>(args); }
 },
 #endif // __ARM_FEATURE_SVE && MMLA_FP32
 
 #ifdef __ARM_FEATURE_SVE
-// SVE smallk /  hybrid methods
+// SVE smallk / hybrid methods
 {
     GemmMethod::GEMM_HYBRID,
-    "smallK_hybrid_fp32_mla_1VLx8",
-    [](const GemmArgs &args) { return (args._Ksize <= 24); },
+    "sve_smallK_hybrid_fp32_mla_8x1VL",
+    [](const GemmArgs &args) { return args._Ksize <= 24 && !args._indirect_input; },
     nullptr,
-    [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_fp32_mla_1VLx8, float, float>(args); }
+    [](const GemmArgs &args) { return new GemmHybrid<cls_sve_smallK_hybrid_fp32_mla_8x1VL, float, float>(args); }
 },
 {
     GemmMethod::GEMM_HYBRID,
-    "hybrid_fp32_mla_4VLx4",
-    [](const GemmArgs &args) { return (args._Ksize >= 4); },
+    "sve_hybrid_fp32_mla_8x1VL",
+    nullptr,
+    [](const GemmArgs &args) { return (args._Nsize < 12); },
+    [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp32_mla_8x1VL, float, float>(args); }
+},
+{
+    GemmMethod::GEMM_HYBRID,
+    "sve_hybrid_fp32_mla_6x4VL",
+    nullptr,
     [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
-    [](const GemmArgs &args) { return new GemmHybrid<hybrid_fp32_mla_4VLx4, float, float>(args); }
+    [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp32_mla_6x4VL, float, float>(args); }
 },
 #endif // __ARM_FEATURE_SVE
 
 // NEON hybrid methods
 {
     GemmMethod::GEMM_HYBRID,
-    "smallK_hybrid_fp32_mla_4x8",
-    [](const GemmArgs &args) { return (args._Ksize <= 8) && (args._Nsize % 4)==0; },
+    "a64_smallK_hybrid_fp32_mla_8x4",
+    [](const GemmArgs &args) { return args._Ksize <= 8 && (args._Nsize % 4)==0 && !args._indirect_input; },
     nullptr,
-    [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_fp32_mla_4x8, float, float>(args); }
+    [](const GemmArgs &args) { return new GemmHybrid<cls_a64_smallK_hybrid_fp32_mla_8x4, float, float>(args); }
 },
 {
     GemmMethod::GEMM_HYBRID,
-    "smallK_hybrid_fp32_mla_4x6",
-    [](const GemmArgs &args) { return (args._Ksize > 8) && (args._Ksize <= 16) && (args._Nsize % 4)==0; },
+    "a64_smallK_hybrid_fp32_mla_6x4",
+    [](const GemmArgs &args) { return (args._Ksize > 8 && args._Ksize <= 16) && (args._Nsize % 4)==0 && !args._indirect_input; },
     nullptr,
-    [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_fp32_mla_4x6, float, float>(args); }
+    [](const GemmArgs &args) { return new GemmHybrid<cls_a64_smallK_hybrid_fp32_mla_6x4, float, float>(args); }
 },
 {
     GemmMethod::GEMM_HYBRID,
-    "hybrid_fp32_mla_4x8_normal",
-    [](const GemmArgs &args) { return (args._Ksize >= 4); },
+    "a64_hybrid_fp32_mla_8x4",
+    nullptr,
     [](const GemmArgs &args) { return (args._Nsize < 12); },
-    [](const GemmArgs &args) { return new GemmHybrid<hybrid_fp32_mla_4x8, float, float>(args); }
+    [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32_mla_8x4, float, float>(args); }
 },
 GemmImplementation<float, float>::with_estimate(
     GemmMethod::GEMM_HYBRID,
-    "hybrid_fp32_mla_16x4",
-    [](const GemmArgs &args) { return (args._Ksize >= 4); },
-    [](const GemmArgs &args) { return GemmHybrid<hybrid_fp32_mla_16x4, float, float>::estimate_cycles(args, hybrid_fp32_mla_16x4::get_performance_parameters(args._ci)); },
-    [](const GemmArgs &args) { return new GemmHybrid<hybrid_fp32_mla_16x4, float, float>(args); }
+    "a64_hybrid_fp32_mla_6x16",
+    nullptr,
+    [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp32_mla_6x16, float, float>::estimate_cycles(args, cls_a64_hybrid_fp32_mla_6x16::get_performance_parameters(args._ci)); },
+    [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32_mla_6x16, float, float>(args); }
 ),
-
 #ifdef __ARM_FEATURE_SVE
 {
     GemmMethod::GEMM_INTERLEAVED,
-    "interleaved_fp32_mla_3VLx8",
+    "sve_interleaved_fp32_mla_8x3VL",
     [](const GemmArgs &args) { return (args._Ksize>4); },
     nullptr,
-    [](const GemmArgs &args) { return new GemmInterleaved<interleaved_fp32_mla_3VLx8, float, float>(args); }
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_fp32_mla_8x3VL, float, float>(args); }
 },
 #endif // __ARM_FEATURE_SVE
-// Pretranposed, 2D split
-GemmImplementation<float, float>::with_estimate(
-    GemmMethod::GEMM_INTERLEAVED_2D,
-    "sgemm_12x8_2d",
-    nullptr,
-    [](const GemmArgs &args) { return GemmInterleavedPretransposed2d<sgemm_12x8, float, float>::estimate_cycles(args, sgemm_12x8::get_performance_parameters(args._ci)); },
-    [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<sgemm_12x8, float, float>(args); }
-),
-// 1D split (with pretransposed or not)
 GemmImplementation<float, float>::with_estimate(
     GemmMethod::GEMM_INTERLEAVED,
-    "sgemm_12x8_1d",
+    "a64_sgemm_8x12",
     nullptr,
-    [](const GemmArgs &args) { return GemmInterleaved<sgemm_12x8, float, float>::estimate_cycles(args, sgemm_12x8::get_performance_parameters(args._ci)); },
-    [](const GemmArgs &args) { return new GemmInterleaved<sgemm_12x8, float, float>(args); }
+    [](const GemmArgs &args) { return GemmInterleaved<cls_a64_sgemm_8x12, float, float>::estimate_cycles(args, cls_a64_sgemm_8x12::get_performance_parameters(args._ci)); },
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_sgemm_8x12, float, float>(args); }
 ),
 #endif // __aarch64__
 
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
index 7a983ed6ac..d702cffce1 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
@@ -77,51 +77,43 @@ class GemmHybrid : public GemmCommon<To, Tr> {
             return args._cfg->inner_block_size;
         }
 
-        const unsigned int L1_size = args._ci->get_L1_cache_size();
+        // Target block size (512 for FP32, scaling for other types).  Don't block until size reaches 1.5X this.
+        unsigned int target_block_size = 2048 / sizeof(To);
 
-        // k_block: Find out how much of the larger array can be loaded into half the cache.
-        // This should account for associative caches.
-        unsigned int k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
+        if (args._Ksize >= ((3 * target_block_size) / 2)) {
+            unsigned int target_blocks = iceildiv(args._Ksize, target_block_size);
 
-        // Needs to be (at least a single) multiple of the K unroll level.
-        k_block /= strategy::k_unroll();
-        k_block = std::max(k_block, 1U) * strategy::k_unroll();
+            unsigned int block_size = iceildiv(args._Ksize, target_blocks);
 
-        // Now tune to presented problem size; this is how many blocks we need.
-        unsigned int numk_blocks = iceildiv(args._Ksize, k_block);
+            block_size = roundup(block_size, strategy::k_unroll());
 
-        // So divide the space equally into that many blocks.
-        k_block = iceildiv(args._Ksize, numk_blocks);
-
-        // And round UP to the K unroll level required.
-        k_block = roundup(k_block, strategy::k_unroll());
+            return block_size;
+        }
 
-        return k_block;
+        return args._Ksize;
     }
 
+    // New N blocking strategy: if it's narrow, or much taller than it is wide, do the full width.  Otherwise use a
+    // block one kernel output width wide (or three widths when K and the thread count are small).
     static unsigned int compute_n_block(const GemmArgs &args) {
         if (args._cfg && args._cfg->outer_block_size) {
             return args._cfg->outer_block_size;
         }
 
-        const unsigned int k_block = compute_k_block(args);
-        const unsigned int L2_size = args._ci->get_L2_cache_size();
-
-        // n_block: Work out how many rows (of length k_block) will fit in the L2
-        // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
-        unsigned int n_block = (((L2_size * 9) / 10) - (k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
-                                 (sizeof(Toi) * k_block);
+        if (args._Nsize <= 64) {
+            return args._Nsize;
+        }
 
-        // Needs to be (at least a single) multiple of the kernel output width.
-        n_block /= strategy::out_width();
-        n_block = std::max(n_block, 1U) * strategy::out_width();
+        if ((args._Msize / args._Nsize) > 155) {
+            return args._Nsize;
+        }
 
-        // And tune to the presented problem size.
-        unsigned int numblocks = iceildiv(args._Nsize, n_block);
-        n_block = iceildiv(args._Nsize, numblocks);
-        n_block = roundup(n_block, strategy::out_width());
+        // Go slightly wider if thread count and depth are small.
+        if ((args._Ksize <= 128) && (args._maxthreads <= 16)) {
+            return strategy::out_width() * 3;
+        }
 
-        return n_block;
+        return strategy::out_width();
     }
 
 public:
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
new file mode 100644
index 0000000000..eede1a4f76
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
@@ -0,0 +1,621 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <alloca.h>
+
+#include <algorithm>
+#include <cassert>
+
+#include "arm_gemm.hpp"
+#include "bias_adder.hpp"
+#include "convolver.hpp"
+#include "ndrange.hpp"
+#include "performance_parameters.hpp"
+#include "transform.hpp"
+#include "utils.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+#ifndef UNUSED
+#define __I_DEFINED_UNUSED
+#define UNUSED(x)  ((void)(x))
+#endif
+
+namespace arm_gemm {
+
+namespace {
+
+// The kernel has to be invoked differently for quantizing and non-quantizing cases, so this shim class provides a
+// common run() entry point; the specializations that follow supply the implementations.
+
+template<typename OutputStage, bool SeparateQuantize = false>
+class run_hybrid_kernel {
+public:
+    template<typename strategy, typename To, typename Tr>
+    static void run (
+#ifdef CYCLE_PROFILING
+        profiler &prof,
+#endif
+        const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
+        unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
+        const OutputStage &os, const int32_t *col_bias, unsigned int n_0 );
+};
+
+template<>
+template<typename strategy, typename To, typename Tr>
+void run_hybrid_kernel<Nothing, false>::run(
+#ifdef CYCLE_PROFILING
+        profiler &prof,
+#endif
+        const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
+        unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
+        const Nothing &, const int32_t *, unsigned int) {
+#ifdef CYCLE_PROFILING
+    auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
+#endif
+    UNUSED(kern_k); // kern_k is only read by the profiling code above
+    // Non-quantizing case: bias, activation and accumulate flag go straight to the kernel; the output stage arguments are unused.
+    strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, output_arg, bias_ptr, act, accumulate);
+}
+
+template<>
+template<typename strategy, typename To, typename Tr>
+void run_hybrid_kernel<Requantize32, false>::run(
+#ifdef CYCLE_PROFILING
+        profiler &prof,
+#endif
+        const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
+        unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
+        const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) {
+#ifdef CYCLE_PROFILING
+    auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
+#endif
+    UNUSED(kern_k); // kern_k is only read by the profiling code above
+    // Quantizing case without a separate quantize step: the Requantize32 parameters and the column bias (offset by n_0) are passed to the kernel; bias, activation and accumulate are ignored.
+    strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, output_arg, &os, col_bias + n_0, n_0);
+}
+
+template<>
+template<typename strategy, typename To, typename Tr>
+void run_hybrid_kernel<Requantize32, true>::run(
+#ifdef CYCLE_PROFILING
+        profiler &prof,
+#endif
+        const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
+        unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
+        const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) {
+    UNUSED(kern_k);
+    // Separate-quantize route: at most one kernel height (out_height() rows) is processed per call; the driver loop is responsible for ensuring this.
+    assert(M <= strategy::out_height());
+    // We don't yet support indirect output (as the quantizer can't do it).
+    assert(output_arg.is_indirect == false);
+
+    // We need a row sum buffer (one entry per kernel row) and an intermediate output buffer (out_height() rows of N rounded up to the kernel width).
+    // These go on the stack as they are not too large, using an automatic array and alloca() respectively.
+    int32_t row_sums[strategy::out_height()];
+    typename strategy::result_type *result_buffer;
+
+    unsigned int output_width = roundup(N, strategy::out_width());
+
+    result_buffer = reinterpret_cast<typename strategy::result_type *>(alloca(output_width * strategy::out_height() * sizeof(typename strategy::result_type)));
+
+    {
+#ifdef CYCLE_PROFILING
+        auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
+#endif
+        // Perform the GEMM into the intermediate buffer (no bias, default activation, no accumulation).
+        strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, IndirectOutputArg<typename strategy::result_type>(result_buffer, output_width), nullptr, Activation(), false);
+    }
+
+    if (os.b_offset != 0) {
+#ifdef CYCLE_PROFILING
+        auto p = prof.ScopedProfiler(PROFILE_ROWSUMS, (unsigned long)M * kern_k);
+#endif
+        row_sums_indirect(num_strings, string_ptr, A_arg, M, row_sums, &os);
+    } else {
+        memset(row_sums, 0, sizeof(int32_t) * strategy::out_height()); // b_offset is zero: row sums are not computed, so zero them
+    }
+
+    {
+#ifdef CYCLE_PROFILING
+        auto p = prof.ScopedProfiler(PROFILE_QUANTIZE, (unsigned long)M * N);
+#endif
+        // Requantize the intermediate results into the (direct) output, passing the row sums and the column bias offset by n_0.
+        requantize_block_32(os, N, M, result_buffer, output_width, output_arg.direct.base, output_arg.direct.stride, row_sums, col_bias + n_0, n_0);
+    }
+}
+
+} // anonymous namespace
+
+// Implementation of the GemmCommon abstract class.
+template<typename strategy, typename To, typename Tr, typename OutputStage = Nothing, bool SeparateQuantize = false>
+class GemmHybridIndirect : public GemmCommon<To, Tr> {
+    typedef typename strategy::operand_type Toi;
+    typedef typename strategy::result_type Tri;
+
+    GemmArgs           _args;
+    OutputStage        _os = {};
+
+    /* Quantized support (in addition to 'output stage' above) */
+    int32_t *_col_bias = nullptr;
+
+    const unsigned int _Ktotal;
+    const unsigned int _rounded_Ksize;
+
+    /* Blocking info */
+    const unsigned int _k_block;
+    const unsigned int _n_block;
+    const unsigned int _Mround;
+
+    /* Pretransposed buffer. */
+    const Toi *_B_transposed=nullptr;
+
+    /* Indirect parameters.  _indirect_buf doubles as a flag to indicate that "indirect" transform should be used. */
+    const To * const * const * _indirect_buf = nullptr;
+
+    /* Convolver - only set up for convolution problems, so also doubles as a flag. */
+    std::unique_ptr<convolver<To>>  _convolver = nullptr;
+
+    // Array of pointers to output rows
+//    Tr * const *        _output_ptrs;
+
+    const NDRange<4> _window_range;
+
+    unsigned int get_col_sum_size() const {
+        if (std::is_same<OutputStage, Requantize32>::value) {
+            return _args._Nsize * _args._nmulti * sizeof(int32_t); // bytes: one int32_t per column, per multi
+        } else {
+            return 0; // any other output stage needs no column sum storage
+        }
+    }
+
+    static unsigned int get_ktotal(const GemmArgs &args) {
+        return args._Ksections * roundup(args._Ksize, strategy::k_unroll()); // each section's K is rounded up to the kernel's K unroll
+    }
+
+    static unsigned int compute_k_block(const GemmArgs &args) {
+        // Kernels without accumulate support, and the Requantize32 output stage, can't do K blocking at all - use the full K.
+        if (!strategy::supports_accumulate() || std::is_same<OutputStage, Requantize32>::value) {
+            return get_ktotal(args);
+        }
+
+        if (args._cfg && args._cfg->inner_block_size) { // an explicit, non-zero configured block size overrides the heuristic
+            return args._cfg->inner_block_size;
+        }
+
+        // Experimental data suggests an optimal block size of 512 for FP32 (scaling accordingly for other
+        // datatypes); but don't divide into blocks until we exceed 1.5X this size.
+        unsigned int target_block_size = 2048 / sizeof(To);
+        auto ktotal = get_ktotal(args);
+
+        if (ktotal > ((target_block_size*3)/2)) {
+            unsigned int target_blocks = iceildiv(ktotal, target_block_size); // number of blocks needed at the target size
+
+            unsigned int block_size = iceildiv(ktotal, target_blocks); // spread K evenly across that many blocks
+
+            block_size = roundup(block_size, strategy::k_unroll()); // round up to the kernel's K unroll
+
+            return block_size;
+        }
+
+        return ktotal; // at or below the threshold: one block covering all of K
+    }
+
+    // N blocking strategy: an explicit config wins; if it's narrow, or much taller than it is wide, do the full width.
+    // Otherwise (row-sum case aside) use a block one kernel output width wide, or three when K and threads are small.
+    static unsigned int compute_n_block(const GemmArgs &args, const OutputStage os = {}) {
+        if (args._cfg && args._cfg->outer_block_size) {
+            return args._cfg->outer_block_size;
+        }
+
+        if (args._Nsize <= 64) {
+            return args._Nsize;
+        }
+
+        if ((args._Msize / args._Nsize) > 155) {
+            return args._Nsize;
+        }
+
+        // "Asymmetric" quantizing GEMMs require a different approach - the tall skinny blocks we would otherwise
+        // use imply a great deal of repeated work performing the row sums.  If row sums are involved, work out how
+        // much "column" parallelism is going to be required and set the block size accordingly.
+        if (std::is_same<OutputStage, Requantize32>::value) {
+            const Requantize32 *qp = reinterpret_cast<const Requantize32 *>(&os);
+
+            // Row sums only needed if b_offset isn't 0
+            if (qp->b_offset != 0) {
+                // We can already parallelize across batches, multis and rows (in units of 'out_height')
+                int multi_row_parallelism = args._nmulti * args._nbatches * iceildiv(args._Msize, strategy::out_height());
+
+                // If this isn't enough, we will need to split up the columns too.
+                if (multi_row_parallelism < args._maxthreads) {
+                    unsigned int columns_needed = iceildiv(args._maxthreads, multi_row_parallelism);
+
+                    unsigned int n_block = iceildiv(args._Nsize, columns_needed);
+
+                    return roundup(n_block, strategy::out_width());
+                }
+
+                // Multi/Batch/Row parallelism is enough - don't split up the columns.
+                return args._Nsize;
+            }
+        }
+        // Go slightly wider if thread count and depth are small.
+        if (args._Ksize <= 128 && args._maxthreads <= 16) {
+            return strategy::out_width() * 3;
+        }
+
+        return strategy::out_width();
+    }
+
+public:
+    GemmHybridIndirect(GemmHybridIndirect &) = delete; // not copy-constructible
+    GemmHybridIndirect & operator= (GemmHybridIndirect &) = delete; // not copy-assignable
+
+    /* Constructor taking an explicit output stage, which is stored and also passed to the N block computation. */
+    GemmHybridIndirect(const GemmArgs &args, const OutputStage &os)
+              : _args(args), _os(os), _Ktotal(get_ktotal(args)),
+                _rounded_Ksize(roundup(args._Ksize, strategy::k_unroll())),
+                _k_block(compute_k_block(args)), _n_block(compute_n_block(args, os)),
+                _Mround(roundup(args._Msize, strategy::out_height())),
+                _window_range(iceildiv(args._Msize, strategy::out_height()), args._nbatches,
+                              iceildiv(args._Nsize, _n_block), args._nmulti)
+    {
+        // We keep a copy of the arguments, but there is no lifetime requirement on the GemmConfig.  The block sizes
+        // above were computed from 'args' (config included), so clear the copied pointer to avoid accidents.
+        _args._cfg = nullptr;
+    }
+
+    /* Constructor without OutputStage - _os keeps its default member initializer. */
+    GemmHybridIndirect(const GemmArgs &args)
+              : _args(args), _Ktotal(get_ktotal(args)),
+                _rounded_Ksize(roundup(args._Ksize, strategy::k_unroll())),
+                _k_block(compute_k_block(args)), _n_block(compute_n_block(args)),
+                _Mround(roundup(args._Msize, strategy::out_height())),
+                _window_range(iceildiv(args._Msize, strategy::out_height()), args._nbatches,
+                              iceildiv(args._Nsize, _n_block), args._nmulti)
+    {
+        // We keep a copy of the arguments, but there is no lifetime requirement on the GemmConfig.  The block sizes
+        // above were computed from 'args' (config included), so clear the copied pointer to avoid accidents.
+        _args._cfg = nullptr;
+    }
+
+    // Interface implementation - Compulsory functions
+    ndrange_t get_window_size() const override {
+        return { _window_range.total_size() }; // total size of the 4-D (M blocks, batches, N blocks, multis) window range
+    }
+
+    // This kernel can always be dynamically scheduled.
+    bool supports_dynamic_scheduling() const override {
+        return true;
+    }
+
+    // Execute
+    void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override {
+#ifdef CYCLE_PROFILING
+        profiler prof;
+#endif
+        strategy strat(_args._ci);
+
+        std::vector<const To *>         in_row_ptrs;
+        std::vector<const To * const *> in_row_strings;
+        std::vector<unsigned int>       string_lengths;
+
+        // In convolution mode, we need input pointers.
+        if (_convolver) {
+            in_row_ptrs = std::vector<const To *>(strategy::out_height() * _args._Ksections, nullptr);
+            in_row_strings = std::vector<const To * const *>(_args._Ksections, nullptr);
+
+            for (unsigned int i=0; i<_args._Ksections; i++) {
+                in_row_strings[i] = &(in_row_ptrs[i * strategy::out_height()]);
+            }
+        }
+
+        // In any indirect mode, we need the string lengths.
+        if (_args._indirect_input) {
+            string_lengths = std::vector<unsigned int>(_args._Ksections, 0);
+        }
+
+        /* Make sure we've been set up correctly. */
+        assert(_B_transposed);
+        static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same.");
+//        static_assert(std::is_same<Tr, Tri>::value, "gemm_native: Result types must be the same.");
+
+        /* For now, each work item implies all the K for a given output
+         * pixel (so we don't need to synchronize access to the output
+         * array).  So separate the loop over K blocks here.  */
+        for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
+            unsigned int kmax   = std::min(k0 + _k_block, _Ktotal);
+            unsigned int kern_k = roundup(kmax-k0, strategy::k_unroll());
+
+            const bool first_pass = (k0 == 0);
+            const bool last_pass = (kmax == _Ktotal);
+
+            unsigned int first_section = (k0 / _rounded_Ksize);
+            unsigned int first_offset  = (k0 % _rounded_Ksize);
+            unsigned int kleft = kern_k;
+            unsigned int sections=0;
+            unsigned int offset = first_offset;
+
+            if (_args._indirect_input) {
+                while (kleft) {
+                    // When chopping into sections: the amount that goes into 'string_lengths' is the amount to be
+                    // processed (excluding padding).  But the amount we subtract from 'kleft' takes account of any
+                    // padding applied.
+                    string_lengths[sections] = std::min(kleft, _args._Ksize - offset);
+                    kleft -= std::min(kleft, _rounded_Ksize - offset);
+                    sections++;
+                    offset=0;
+                }
+            }
+
+            auto p = _window_range.iterator(work_range.get_position(0), work_range.get_position_end(0));
+
+            if (p.done()) {
+                return;
+            }
+
+            // Process rows either 'out_height' rows at a time, or do all valid rows at once with a single kernel call.
+            // The separate quantizer path only handles one block of rows at a time (as it has to store sums and intermediate results).
+            // The convolution path only generates the pointers for one block of rows at a time.
+            const bool process_all_rows = (!SeparateQuantize && !_convolver);
+
+            do {
+                const unsigned int m_start = p.dim(0) * strategy::out_height();
+                const unsigned int m_end   = process_all_rows ? std::min(p.dim0_max() * strategy::out_height(), _args._Msize) : std::min(m_start + strategy::out_height(), _args._Msize);
+//                const unsigned int m_end   = std::min(m_start + strategy::out_height(), _args._Msize);
+                const unsigned int batch   = p.dim(1);
+                const unsigned int n0      = p.dim(2) * _n_block;
+                const unsigned int nmax    = std::min(n0 + _n_block, _args._Nsize);
+                const unsigned int multi   = p.dim(3);
+
+                const Toi *b_panel = _B_transposed +
+                                     (multi * roundup(_args._Nsize, strategy::out_width()) * _Ktotal) +
+                                     (k0 * roundup(_args._Nsize, strategy::out_width())) +
+                                     (n0 * kern_k);
+
+               IndirectOutputArg<Tr> out_arg(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc);
+
+#ifdef CYCLE_PROFILING
+                auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width()));
+#endif
+                if (_indirect_buf) {
+                    run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
+#ifdef CYCLE_PROFILING
+                                 prof,
+#endif
+                                 strat, sections, string_lengths.data(),
+                                 IndirectInputArg<To>(_indirect_buf + (multi * _args._nbatches * _args._Ksections) + (batch * _args._Ksections) + first_section, m_start, first_offset),
+                                 (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
+                                 (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
+                                 last_pass ? _args._act : Activation(),
+                                 !first_pass,
+                                 // Quantization parameters
+                                 _os, _col_bias+(multi * _args._Nsize), n0);
+                } else if (_convolver) {
+                    auto conv_cols = _convolver->process_columns(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride), this->_lda, k0, kmax, _rounded_Ksize);
+
+                    unsigned int pos=0;
+                    auto conv_rows = conv_cols.process_rows(m_start, m_end - m_start);
+
+                    while (!conv_rows.finished()) {
+                        unsigned int width, conv_offset;
+
+                        assert(pos < sections);
+
+                        std::tie(width, conv_offset) = conv_rows.next_block(&(in_row_ptrs[pos * strategy::out_height()]));
+
+                        if (pos==0) {
+                            assert(conv_offset == first_offset);
+                        }
+                        assert(width == string_lengths[pos]);
+                        pos++;
+                    }
+                    assert(pos == sections);
+
+                    run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
+#ifdef CYCLE_PROFILING
+                                 prof,
+#endif
+                                 strat, sections, string_lengths.data(),
+                                 IndirectInputArg<To>(in_row_strings.data(), 0, first_offset),
+                                 (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
+                                 (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
+                                 last_pass ? _args._act : Activation(),
+                                 !first_pass,
+                                 // Quantization parameters
+                                 _os, _col_bias+(multi * _args._Nsize), n0);
+                } else {
+                    // Length to process.  This needs to exclude padding, but 'kmax' potentially includes it.
+                    const unsigned int len = (std::min(_args._Ksize, kmax) - k0);
+
+                    run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
+#ifdef CYCLE_PROFILING
+                                 prof,
+#endif
+                                 strat, 1, &len,
+                                 IndirectInputArg<To>(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + m_start * this->_lda + k0, this->_lda),
+                                 (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
+                                 (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
+                                 last_pass ? _args._act : Activation(),
+                                 !first_pass,
+                                 // Quantization parameters
+                                 _os, _col_bias+(multi * _args._Nsize), n0);
+                }
+            } while (process_all_rows ? p.next_dim1() : p.next_dim0());
+        }
+    }
+
+    // Interface implementation - pretransposed
+    // Reports that this GEMM reads B from a pretransposed buffer (see _B_transposed).
+    bool B_is_pretransposed() const override { return true; }
+
+    // Pretransposition is still outstanding for as long as no transposed buffer has been attached.
+    bool B_pretranspose_required() const override {
+        return !_B_transposed;
+    }
+
+    // Returns the number of bytes needed for the buffer handed to pretranspose_B_array().
+    //
+    // The total is made up of:
+    //  - the transposed B data: N (rounded up to the kernel output width) * _Ktotal * nmulti elements of Toi;
+    //  - one result row pointer per output row, batch and multi;
+    //  - the column sums, but only when the output stage is Requantize32.
+    //
+    // Every product is widened to size_t before multiplying: the dimensions are 32-bit, and multiplying
+    // them in their native width could wrap around for large problems before 'sizeof' widens the result.
+    size_t get_B_pretransposed_array_size() const override {
+        // Start with actual pretransposed buffer...
+        size_t size = static_cast<size_t>(roundup(_args._Nsize, strategy::out_width())) * _Ktotal * _args._nmulti * sizeof(Toi);
+
+        // Space for result row pointers (not strictly needed any more but retained for indirect output testing)
+        size += static_cast<size_t>(_args._Msize) * _args._nbatches * _args._nmulti * sizeof(const Tr *);
+
+        if (std::is_same<OutputStage, Requantize32>::value) {
+            size += get_col_sum_size();
+        }
+
+        return size;
+    }
+
+    // Fill 'in_buffer' with everything derived from B that execute() needs.
+    //
+    // Layout written: column sums first (only computed for the Requantize32 output stage, in which case
+    // get_col_sum_size() bytes are reserved for them), followed directly by the transposed B panels.
+    //
+    //  in_buffer      - destination buffer.
+    //  B              - source matrix; 'multi' number i starts at B + i * B_multi_stride.
+    //  ldb            - leading dimension of B, forwarded to the column sum and transform routines.
+    //  B_multi_stride - element distance between consecutive multis in B.
+    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
+        if (std::is_same<OutputStage, Requantize32>::value) {
+            _col_bias = reinterpret_cast<int32_t *>(in_buffer);
+
+            Requantize32 *requant = reinterpret_cast<Requantize32 *>(&_os);
+
+            // The input is assumed not to have any padding between sections, so the total depth is simply
+            // Ksize * Ksections.
+            const auto unpadded_depth = _args._Ksize * _args._Ksections;
+
+            for (unsigned int m = 0; m < _args._nmulti; ++m) {
+                compute_col_sums(*requant, _args._Nsize, unpadded_depth, B + (m * B_multi_stride), ldb, _col_bias + (m * _args._Nsize), unpadded_depth, m, 0);
+            }
+        }
+
+        // The transposed data lives after the column sums - in non-transposing cases get_col_sum_size() == 0
+        const uintptr_t base_address = reinterpret_cast<uintptr_t>(in_buffer);
+        Toi *out_ptr = reinterpret_cast<Toi *>(base_address + get_col_sum_size());
+        _B_transposed = out_ptr;
+
+        strategy strat(_args._ci);
+
+        // Length of one K section once padded to the K unroll level.  Block coordinates below are expressed
+        // in terms of the padded _Ktotal, so this is what maps a padded position back to its section.
+        const unsigned int padded_section_len = roundup(_args._Ksize, strategy::k_unroll());
+
+        for (unsigned int multi = 0; multi < _args._nmulti; ++multi) {
+            const To *B_multi = B + (multi * B_multi_stride);
+
+            for (unsigned int k0 = 0; k0 < _Ktotal; k0 += _k_block) {
+                /* Size of this K block, in padded coordinates. */
+                const unsigned int block_len = std::min(k0 + _k_block, _Ktotal) - k0;
+
+                // The output format is a full <out_width> set of columns interleaved, then the next set, and
+                // so on.  As the data is also being split vertically, go one set of columns at a time.
+                for (unsigned int x0 = 0; x0 < _args._Nsize; x0 += strategy::out_width()) {
+                    const unsigned int xmax = std::min(x0 + strategy::out_width(), _args._Nsize);
+
+                    // Walk the block section by section: each section is read from the original, unpadded
+                    // input, and the transform pads it out as required.
+                    for (unsigned int kpos = k0, remaining = block_len; remaining != 0; ) {
+                        // Section containing 'kpos', and the position within that section.
+                        const unsigned int section   = kpos / padded_section_len;
+                        const unsigned int in_offset = kpos - (section * padded_section_len);
+
+                        // Copy either the rest of this section or whatever is left of the block.
+                        const unsigned int copy_len = std::min(_args._Ksize - in_offset, remaining);
+
+                        // Row to start reading from: based on the TRUE (unpadded) section length.
+                        const unsigned int src_k_start = (section * _args._Ksize) + in_offset;
+
+                        strat.transforms.PrepareB(out_ptr, B_multi, ldb,
+                                                  x0, xmax,
+                                                  src_k_start,
+                                                  src_k_start + copy_len);
+
+                        // Progress is tracked using the ROUNDED length of what was just written.
+                        const unsigned int advance = roundup(copy_len, strategy::k_unroll());
+
+                        out_ptr   += strategy::out_width() * advance;
+                        kpos      += advance;
+                        remaining -= advance;
+                    }
+                }
+            }
+        }
+    }
+
+    // Attach an already populated buffer.  The layout is taken to match what pretranspose_B_array()
+    // writes: column sums at the very start, transposed B data get_col_sum_size() bytes further on
+    // (get_col_sum_size() == 0 in non-transposing cases).
+    void set_pretransposed_B_data(void *in_buffer) override {
+        _col_bias = reinterpret_cast<int32_t *>(in_buffer);
+
+        const uintptr_t base_address = reinterpret_cast<uintptr_t>(in_buffer);
+        _B_transposed = reinterpret_cast<Toi *>(base_address + get_col_sum_size());
+    }
+
+    // Estimate cycles for given problem given provided parameters
+    // Cycle estimate for the problem described by 'args', using the throughput figures in 'params'.
+    //
+    // The estimate is the MAC count (N rounded up to the kernel width, K rounded up to the K unroll)
+    // divided by params.kernel_macs_cycle, with a 15% penalty for certain narrow widths (see below).
+    static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) {
+        // Note: Current hybrid kernels don't actually round up height (they
+        // have paths for each possible height).  Might need to make this
+        // configurable in future.
+        uint64_t total_macs = static_cast<uint64_t>(args._nbatches);
+        total_macs *= args._nmulti;
+        total_macs *= args._Msize;
+        total_macs *= roundup(args._Nsize, strategy::out_width());
+        total_macs *= roundup(args._Ksize, strategy::k_unroll());
+
+        float mac_cycles = static_cast<float>(total_macs) / params.kernel_macs_cycle;
+
+        // TODO: A bit of a kludge here: current hybrid kernels incur extra
+        // overhead where the width is not a multiple of kernel width.  It's
+        // most noticeable where the overall width is quite low, so add 15%
+        // penalty for such widths.
+        const bool below_one_width = (args._Nsize < strategy::out_width());
+        const bool between_one_and_two_widths = (args._Nsize > strategy::out_width() && args._Nsize < 2*strategy::out_width());
+
+        if (below_one_width || between_one_and_two_widths) {
+            mac_cycles *= 1.15f;
+        }
+
+        return static_cast<uint64_t>(mac_cycles);
+    }
+
+    // Record the bias pointer and its per-multi stride in the output stage.
+    // Only meaningful for the Requantize32 output stage; a no-op for any other.
+    void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride) override {
+        if (!std::is_same<OutputStage, Requantize32>::value) {
+            return;
+        }
+
+        Requantize32 *requant = reinterpret_cast<Requantize32 *>(&_os);
+
+        requant->bias              = bias;
+        requant->bias_multi_stride = bias_multi_stride;
+    }
+
+    // Select indirect input mode: execute() reads its A rows through the pointer table 'ptr'.
+    //
+    //  string_len - length of each indirected string; must equal the K size this object was created with.
+    //  ptr        - table of input row pointers, stored as-is (it is not copied).
+    void set_indirect_parameters(size_t string_len, const To * const * const *ptr) override {
+        assert(string_len == _args._Ksize);
+        // 'string_len' is only consumed by the assert above, which compiles away under NDEBUG.
+        (void)string_len;
+        _indirect_buf = ptr;
+    }
+
+    // Select convolution mode: build a convolver from 'parms', replacing any previously held one.
+    // The number of input channels must equal the K size this object was created with.
+    void set_convolution_parameters(ConvolutionParameters parms) override {
+        assert(parms.input_channels == _args._Ksize);
+        _convolver.reset(new convolver<To>(parms));
+    }
+};
+
+} // namespace arm_gemm
+
+#ifdef __I_DEFINED_UNUSED
+#undef UNUSED
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
index 915227fc29..7a5fa87ee6 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
@@ -118,18 +118,27 @@ class GemmHybridQuantized : public GemmCommon<To, Tr> {
 
         // n_block: Work out how many rows (of length k_block) will fit in the L2
         // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
-        unsigned int n_block = (((L2_size * 9) / 10) - (k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
-                                 (sizeof(Toi) * k_block);
+        const unsigned int scaled_l2_size = (L2_size * 9) / 10;
+        const unsigned int k_block_area = k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height());
+
+        // .. if the L1 contents is bigger than the L2, just return a minimal size block.
+        if (k_block_area > scaled_l2_size) {
+            return strategy::out_width();
+        }
+
+        unsigned int n_block = (scaled_l2_size - k_block_area) / (sizeof(Toi) * k_block);
 
         // Needs to be (at least a single) multiple of the kernel output width.
         n_block /= strategy::out_width();
-        n_block = std::max(n_block, 1U) * strategy::out_width();
+        n_block = std::max(n_block, 1u) * strategy::out_width();
 
         // And tune to the presented problem size.
         unsigned int numblocks = iceildiv(args._Nsize, n_block);
         n_block = iceildiv(args._Nsize, numblocks);
         n_block = roundup(n_block, strategy::out_width());
 
+        assert(n_block > 0);
+
         return n_block;
     }
 
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized_inline.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized_inline.hpp
new file mode 100644
index 0000000000..7376b5ffe3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized_inline.hpp
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2017-2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <assert.h>
+
+#include <algorithm>
+
+#include "arm_gemm.hpp"
+#include "ndrange.hpp"
+#include "utils.hpp"
+
+#include "mergeresults.hpp"
+#include "transform.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+namespace arm_gemm {
+
+// Implementation of the GemmCommon abstract class for hybrid kernels which are handed the Requantize32
+// parameters and the column sums directly on every kernel call.
+//
+// B is always consumed from a pretransposed buffer with the following layout:
+//   [ column sums: N * nmulti int32_t values ][ transposed B panels ]
+template<typename strategy, typename To, typename Tr>
+class GemmHybridQuantizedInline : public GemmCommon<To, Tr> {
+    typedef typename strategy::operand_type Toi;
+    typedef typename strategy::result_type Tri;
+
+    /* const properties set by constructor */
+    const CPUInfo * const _ci;
+
+    const unsigned int _Msize;
+    const unsigned int _Nsize;
+    const unsigned int _Ksize;
+
+    const unsigned int _nbatches;
+    const unsigned int _nmulti;
+
+    /* Blocking info */
+    const unsigned int _k_block;
+    const unsigned int _n_block;
+    const unsigned int _Mround;
+
+    /* Pretransposed buffer. */
+    const Toi *_B_transposed=nullptr;
+
+    const NDRange<4> _window_range;
+
+    Requantize32  _qp;
+    int32_t *col_bias = nullptr;
+
+    void *working_space = nullptr;
+
+    unsigned int _nthreads;
+
+    // Bytes occupied by the column sums at the start of the pretransposed buffer.
+    // Computed in size_t so that N * nmulti cannot wrap around in 32-bit arithmetic.
+    size_t get_col_sum_size() const {
+        return static_cast<size_t>(_Nsize) * _nmulti * sizeof(int32_t);
+    }
+
+    // K blocking is not supported, as results are only temporarily stored as 32 bit values: every work
+    // item therefore has to cover the whole of K in a single pass.
+    static unsigned int compute_k_block(const GemmArgs &args) {
+        return args._Ksize;
+    }
+
+    // Choose the N block size: either the configured outer block size, or a multiple of the kernel output
+    // width sized so that a block of B fits in the L2 cache, then balanced against the actual N.
+    static unsigned int compute_n_block(const GemmArgs &args) {
+        if (args._cfg && args._cfg->outer_block_size) {
+            return args._cfg->outer_block_size;
+        }
+
+        const unsigned int k_block = compute_k_block(args);
+        const unsigned int L2_size = args._ci->get_L2_cache_size();
+
+        // n_block: Work out how many rows (of length k_block) will fit in the L2
+        // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
+        const unsigned int scaled_l2_size = (L2_size * 9) / 10;
+        const size_t       k_block_area   = k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height());
+
+        // .. if the L1 contents is bigger than the L2, just return a minimal size block.  Without this
+        // check the unsigned subtraction below would wrap around and produce a nonsensical block size.
+        if (k_block_area > scaled_l2_size) {
+            return strategy::out_width();
+        }
+
+        unsigned int n_block = static_cast<unsigned int>((scaled_l2_size - k_block_area) / (sizeof(Toi) * k_block));
+
+        // Needs to be (at least a single) multiple of the kernel output width.
+        n_block /= strategy::out_width();
+        n_block = std::max(n_block, 1U) * strategy::out_width();
+
+        // And tune to the presented problem size.
+        unsigned int numblocks = iceildiv(args._Nsize, n_block);
+        n_block = iceildiv(args._Nsize, numblocks);
+        n_block = roundup(n_block, strategy::out_width());
+
+        return n_block;
+    }
+
+public:
+    GemmHybridQuantizedInline(GemmHybridQuantizedInline &) = delete;
+    GemmHybridQuantizedInline & operator= (GemmHybridQuantizedInline &) = delete;
+
+    /* Constructor */
+    GemmHybridQuantizedInline(const GemmArgs &args, const Requantize32 &qp)
+              : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
+                _nbatches(args._nbatches), _nmulti(args._nmulti),
+                _k_block(compute_k_block(args)), _n_block(compute_n_block(args)),
+                _Mround(roundup(args._Msize, strategy::out_height())),
+                _window_range(iceildiv(args._Msize, strategy::out_height()), _nbatches, iceildiv(_Nsize, _n_block), _nmulti),
+                _qp (qp), _nthreads(args._maxthreads) { }
+
+    // Interface implementation - Compulsory functions
+    // The window is one dimensional: the total number of (row block, batch, N block, multi) work items.
+    ndrange_t get_window_size() const override {
+        return { _window_range.total_size() };
+    }
+
+    // This kernel can always be dynamically scheduled.
+    bool supports_dynamic_scheduling() const override {
+        return true;
+    }
+
+    // Execute
+    // Runs the kernel over the work items of 'work_range' (dimension 0 only); the remaining two
+    // arguments are not used by this implementation.
+    void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override {
+#ifdef CYCLE_PROFILING
+        profiler prof;
+#endif
+        strategy strat(_ci);
+
+        /* Make sure we've been set up correctly. */
+        assert(_B_transposed);
+        static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same.");
+
+        /* For now, each work item implies all the K for a given output
+         * pixel (so we don't need to synchronize access to the output
+         * array).  So separate the loop over K blocks here.  */
+        for (unsigned int k0=0; k0<_Ksize; k0+=_k_block) {
+            unsigned int kmax   = std::min(k0 + _k_block, _Ksize);
+            unsigned int kern_k = roundup(kmax-k0, strategy::k_unroll());
+
+            auto p = _window_range.iterator(work_range.get_position(0), work_range.get_position_end(0));
+
+            if (p.done()) {
+                return;
+            }
+
+            do {
+                const unsigned int m_start = p.dim(0) * strategy::out_height();
+                const unsigned int m_end   = std::min(p.dim0_max() * strategy::out_height(), _Msize);
+                const unsigned int batch   = p.dim(1);
+                const unsigned int n0      = p.dim(2) * _n_block;
+                const unsigned int nmax    = std::min(n0 + _n_block, _Nsize);
+                const unsigned int multi   = p.dim(3);
+
+                // Offsets are computed in size_t so that large B arrays cannot wrap a 32-bit product.
+                const Toi *b_panel = _B_transposed +
+                                     (static_cast<size_t>(multi) * roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll())) +
+                                     (static_cast<size_t>(k0) * roundup(_Nsize, strategy::out_width())) +
+                                     (static_cast<size_t>(n0) * kern_k);
+
+                {
+#ifdef CYCLE_PROFILING
+                    auto p = prof.ScopedProfiler(PROFILE_KERNEL, static_cast<unsigned long>(m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width()));
+#endif
+                    strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda) + k0, this->_lda,
+                                 b_panel,
+                                 this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc,
+                                 (m_end - m_start), (nmax - n0), kmax - k0,
+                                 col_bias + (multi * _Nsize) + n0, _qp);
+                }
+            } while (p.next_dim1());
+        }
+    }
+
+    // Interface implementation - pretransposed
+    bool B_is_pretransposed() const override {
+        return true;
+    }
+
+    // Pretransposition is outstanding until a transposed buffer has been attached.
+    bool B_pretranspose_required() const override {
+        return (_B_transposed==nullptr);
+    }
+
+    // Bytes needed for the pretransposed buffer: column sums plus the padded, transposed B data.
+    // The element count is widened to size_t before multiplying to avoid 32-bit wrap-around.
+    size_t get_B_pretransposed_array_size() const override {
+        return get_col_sum_size() + (static_cast<size_t>(roundup(_Nsize, strategy::out_width())) * roundup(_Ksize, strategy::k_unroll()) * _nmulti * sizeof(Toi));
+    }
+
+    // Compute the column sums of B into the start of 'in_buffer', then write the transposed B panels
+    // (one per K block / N block pair, for each multi) directly after them.
+    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
+        col_bias = reinterpret_cast<int32_t *>(in_buffer);
+
+        for (unsigned int i=0; i<_nmulti; i++) {
+            compute_col_sums(_qp, _Nsize, _Ksize, B + (i * B_multi_stride), ldb, col_bias + (i * _Nsize),  _Ksize, i, 0);
+        }
+
+        uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
+        Toi *buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
+        _B_transposed = buffer;
+        strategy strat(_ci);
+
+        for (unsigned int multi=0; multi<_nmulti; multi++) {
+            for (unsigned int k0=0; k0<_Ksize; k0+=_k_block) {
+                const unsigned int kmax = std::min(k0 + _k_block, _Ksize);
+                const unsigned int k_size = roundup(kmax-k0, strategy::k_unroll());
+
+                for (unsigned int x0=0; x0<_Nsize; x0+=_n_block) {
+                    const unsigned int xmax = std::min(x0+_n_block, _Nsize);
+
+                    // Number of elements this panel occupies once padded in both dimensions.
+                    const unsigned int size = roundup(xmax-x0, strategy::out_width()) * k_size;
+
+                    strat.transforms.PrepareB( buffer, B + (multi * B_multi_stride), ldb,
+                                               x0, xmax, k0, kmax);
+
+                    buffer += size;
+                }
+            }
+        }
+    }
+
+    // Attach an already populated buffer, using the same layout as pretranspose_B_array() writes.
+    void set_pretransposed_B_data(void *in_buffer) override {
+        uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
+        _B_transposed = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
+        col_bias = reinterpret_cast<int32_t *>(in_buffer);
+    }
+
+    // Record the bias pointer and its per-multi stride in the requantization parameters.
+    void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride) override {
+        _qp.bias = bias;
+        _qp.bias_multi_stride = bias_multi_stride;
+    }
+};
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
index 261e7d2d9c..f6a0fc5d52 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
@@ -37,9 +37,9 @@ template<typename Top, typename Tret, class OutputStage = Nothing>
 struct GemmImplementation {
     const GemmMethod                                                               method;
     const char *                                                                   name;
-    std::function<bool(const GemmArgs &, const OutputStage &)>                     is_supported;
-    std::function<uint64_t(const GemmArgs &, const OutputStage &)>                 cycle_estimate;
-    std::function<GemmCommon<Top, Tret> *(const GemmArgs &, const OutputStage &)>  instantiate;
+    std::function<bool(const GemmArgs &, const OutputStage &)>                     is_supported = {};
+    std::function<uint64_t(const GemmArgs &, const OutputStage &)>                 cycle_estimate = {};
+    std::function<GemmCommon<Top, Tret> *(const GemmArgs &, const OutputStage &)>  instantiate = {};
 
     bool do_is_supported(const GemmArgs &args, const OutputStage &os) const {
         if (is_supported != nullptr) {
@@ -57,13 +57,13 @@ struct GemmImplementation {
         }
     }
 
-    GemmImplementation(const GemmImplementation &) = default;
-    GemmImplementation &operator= (const GemmImplementation &) = default;
-
     GemmCommon<Top, Tret> *do_instantiate(const GemmArgs &args, const OutputStage &os) const {
         return instantiate(args, os);
     }
 
+    GemmImplementation(const GemmImplementation &) = default;
+    GemmImplementation & operator= (const GemmImplementation &) = default;
+
     GemmImplementation(GemmMethod m, const char *n,
                        std::function<bool(const GemmArgs &, const OutputStage &)> is_supported, std::function<bool(const GemmArgs &, const OutputStage &)> is_recommended,
                        std::function<GemmCommon<Top, Tret> *(const GemmArgs &, const OutputStage &)> instantiate) :
@@ -79,9 +79,9 @@ template<typename Top, typename Tret>
 struct GemmImplementation<Top, Tret, Nothing> {
     const GemmMethod                                          method;
     const char *                                              name;
-    std::function<bool(const GemmArgs &)>                     is_supported;
-    std::function<uint64_t(const GemmArgs &)>                 cycle_estimate;
-    std::function<GemmCommon<Top, Tret> *(const GemmArgs &)>  instantiate;
+    std::function<bool(const GemmArgs &)>                     is_supported = {};
+    std::function<uint64_t(const GemmArgs &)>                 cycle_estimate = {};
+    std::function<GemmCommon<Top, Tret> *(const GemmArgs &)>  instantiate = {};
 
     bool do_is_supported(const GemmArgs &args, const Nothing &) const {
         if (is_supported != nullptr) {
@@ -103,7 +103,6 @@ struct GemmImplementation<Top, Tret, Nothing> {
         return instantiate(args);
     }
 
-
     static GemmImplementation with_estimate(GemmMethod m, const char *n,
                        std::function<bool(const GemmArgs &)> is_supported, std::function<uint64_t(const GemmArgs &)> cycle_estimate,
                        std::function<GemmCommon<Top, Tret> *(const GemmArgs &)> instantiate) {
@@ -116,7 +115,10 @@ struct GemmImplementation<Top, Tret, Nothing> {
         return impl;
     }
 
-    GemmImplementation(GemmMethod m, const char * n) : method(m), name(n), is_supported(nullptr), cycle_estimate(nullptr), instantiate(nullptr) {}
+    GemmImplementation(const GemmImplementation &) = default;
+    GemmImplementation & operator= (const GemmImplementation &) = default;
+
+    GemmImplementation(GemmMethod m, const char * n) : method(m), name(n) {}
 
     GemmImplementation(GemmMethod m, const char *n,
                        std::function<bool(const GemmArgs &)> is_supported, std::function<bool(const GemmArgs &)> is_recommended,
@@ -124,9 +126,6 @@ struct GemmImplementation<Top, Tret, Nothing> {
                        method(m), name(n), is_supported(is_supported),
                        cycle_estimate( [is_recommended](const GemmArgs &args) -> uint64_t { return (is_recommended == nullptr) ? 0 : (is_recommended(args) ? 0 : UINT64_MAX); } ),
                        instantiate(instantiate) {   }
-
-    GemmImplementation(const GemmImplementation &) = default;
-    GemmImplementation &operator=(const GemmImplementation &) = default;
 };
 
 /* "Master" function implemented for each valid combination of types.
@@ -211,6 +210,7 @@ std::vector<KernelDescription> get_compatible_kernels(const GemmArgs &args, cons
 
     for (const GemmImplementation<Top, Tret, OutputStage> *i = gemms; i->method != GemmMethod::DEFAULT; i++) {
         /* Check that this implementation supports the presented problem. */
+
         if (!i->do_is_supported(args, os)) {
             continue;
         }
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
index da682330a0..a3a61959c3 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
@@ -28,17 +28,17 @@
 #include "gemm_implementation.hpp"
 #include "gemm_interleaved.hpp"
 
-#include "kernels/a64_gemm_s16_12x8.hpp"
+#include "kernels/a64_gemm_s16_8x12.hpp"
 
 namespace arm_gemm {
 
 static const GemmImplementation<int16_t, int32_t> gemm_s16_methods[] = {
 {
     GemmMethod::GEMM_INTERLEAVED,
-    "gemm_s16_12x8",
+    "a64_gemm_s16_8x12",
     nullptr,
     nullptr,
-    [](const GemmArgs &args) { return new GemmInterleaved<gemm_s16_12x8, int16_t, int32_t>(args); }
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_s16_8x12, int16_t, int32_t>(args); }
 },
 {
     GemmMethod::DEFAULT,
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
index 147caeefbd..31f225002e 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
@@ -26,21 +26,22 @@
 #include "arm_gemm.hpp"
 #include "gemm_common.hpp"
 #include "gemm_hybrid.hpp"
+#include "gemm_hybrid_indirect.hpp"
 #include "gemm_implementation.hpp"
 #include "gemm_interleaved.hpp"
-#include "gemm_interleaved_pretransposed_2d.hpp"
 
-#include "kernels/a64_gemm_s16_12x8.hpp"
-#include "kernels/a64_gemm_s8_12x8.hpp"
+#include "kernels/a64_gemm_s16_8x12.hpp"
+#include "kernels/a64_gemm_s8_8x12.hpp"
 #include "kernels/a64_gemm_s8_4x4.hpp"
-#include "kernels/a64_hybrid_s8s32_dot_16x4.hpp"
-#include "kernels/a64_interleaved_s8s32_mmla_12x8.hpp"
-#include "kernels/a64_smallK_hybrid_s8s32_dot_4x6.hpp"
-#include "kernels/a64_smallK_hybrid_s8s32_dot_4x8.hpp"
-#include "kernels/sve_hybrid_s8s32_dot_4VLx4.hpp"
-#include "kernels/sve_interleaved_s8s32_dot_3VLx8.hpp"
-#include "kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp"
-#include "kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp"
+#include "kernels/a64_hybrid_s8s32_dot_6x16.hpp"
+#include "kernels/a64_interleaved_s8s32_mmla_8x12.hpp"
+#include "kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp"
+#include "kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp"
+
+#include "kernels/sve_hybrid_s8s32_dot_6x4VL.hpp"
+#include "kernels/sve_interleaved_s8s32_dot_8x3VL.hpp"
+#include "kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp"
+#include "kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp"
 
 namespace arm_gemm {
 
@@ -49,106 +50,84 @@ static const GemmImplementation<int8_t, int32_t> gemm_s8_methods[] = {
 #ifdef MMLA_INT8
 {
     GemmMethod::GEMM_INTERLEAVED,
-    "interleaved_s8s32_mmla_3VLx8",
+    "sve_interleaved_s8s32_mmla_8x3VL",
     [](const GemmArgs &args) { return (args._Ksize>8); },
     nullptr,
-    [](const GemmArgs &args) { return new GemmInterleaved<interleaved_s8s32_mmla_3VLx8, int8_t, int32_t>(args); }
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, int32_t>(args); }
 },
 #endif
 {
     GemmMethod::GEMM_HYBRID,
-    "smallK_hybrid_s8s32_dot_1VLx8",
-    [](const GemmArgs &args) { return args._Ksize<=64; },
+    "sve_smallK_hybrid_s8s32_dot_8x1VL",
+    [](const GemmArgs &args) { return args._Ksize<=64 && !args._indirect_input; },
     nullptr,
-    [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_s8s32_dot_1VLx8, int8_t, int32_t>(args); }
+    [](const GemmArgs &args) { return new GemmHybrid<cls_sve_smallK_hybrid_s8s32_dot_8x1VL, int8_t, int32_t>(args); }
 },
 {
     GemmMethod::GEMM_HYBRID,
-    "hybrid_s8s32_dot_4VLx4",
+    "sve_hybrid_s8s32_dot_6x4VL",
     [](const GemmArgs &args) { return args._Ksize>=16; },
     [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
-    [](const GemmArgs &args) { return new GemmHybrid<hybrid_s8s32_dot_4VLx4, int8_t, int32_t>(args); }
+    [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_s8s32_dot_6x4VL, int8_t, int32_t>(args); }
 },
 {
     GemmMethod::GEMM_INTERLEAVED,
-    "interleaved_s8s32_dot_3VLx8",
+    "sve_interleaved_s8s32_dot_8x3VL",
     [](const GemmArgs &args) { return (args._Ksize>4); },
     nullptr,
-    [](const GemmArgs &args) { return new GemmInterleaved<interleaved_s8s32_dot_3VLx8, int8_t, int32_t>(args); }
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, int32_t>(args); }
 },
-#endif
+#endif // SVE
 #ifdef MMLA_INT8
 {
     GemmMethod::GEMM_INTERLEAVED,
-    "interleaved_s8s32_mmla_12x8",
+    "a64_interleaved_s8s32_mmla_8x12",
     [](const GemmArgs &args) { return (args._Ksize>8); },
     nullptr,
-    [](const GemmArgs &args) { return new GemmInterleaved<interleaved_s8s32_mmla_12x8, int8_t, int32_t>(args); }
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_s8s32_mmla_8x12, int8_t, int32_t>(args); }
 },
 #endif
 {
     GemmMethod::GEMM_HYBRID,
-    "smallK_hybrid_s8s32_dot_4x8",
-    [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32); },
+    "a64_smallK_hybrid_s8s32_dot_8x4",
+    [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32) && !args._indirect_input; },
     nullptr,
-    [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_s8s32_dot_4x8, int8_t, int32_t>(args); }
+    [](const GemmArgs &args) { return new GemmHybrid<cls_a64_smallK_hybrid_s8s32_dot_8x4, int8_t, int32_t>(args); }
 },
 {
     GemmMethod::GEMM_HYBRID,
-    "smallK_hybrid_s8s32_dot_4x6",
-    [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64); },
+    "a64_smallK_hybrid_s8s32_dot_6x4",
+    [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64) && !args._indirect_input; },
     nullptr,
-    [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_s8s32_dot_4x6, int8_t, int32_t>(args); }
-},
-{
-    GemmMethod::GEMM_HYBRID,
-    "hybrid_s8s32_dot_16x4",
-    [](const GemmArgs &args) { return args._ci->has_dotprod() && args._Ksize>=16; },
-    [](const GemmArgs &args) { return args._Nsize<=256 && args._Ksize>128; },
-    [](const GemmArgs &args) { return new GemmHybrid<hybrid_s8s32_dot_16x4, int8_t, int32_t>(args); }
-},
-{
-    GemmMethod::GEMM_INTERLEAVED_2D,
-    "gemm_s8_12x8_2d",
-    [](const GemmArgs &args) { return args._ci->has_dotprod(); },
-    [](const GemmArgs &args) { return (args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8); },
-    [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<gemm_s8_12x8, int8_t, int32_t>(args); }
+    [](const GemmArgs &args) { return new GemmHybrid<cls_a64_smallK_hybrid_s8s32_dot_6x4, int8_t, int32_t>(args); }
 },
 {
     GemmMethod::GEMM_INTERLEAVED,
-    "gemm_s8_12x8_1d",
-    [](const GemmArgs &args) { return args._ci->has_dotprod(); },
+    "a64_gemm_s16_8x12",
     nullptr,
-    [](const GemmArgs &args) { return new GemmInterleaved<gemm_s8_12x8, int8_t, int32_t>(args); }
+    [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53 && args._Ksize>4; },
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_s16_8x12, int8_t, int32_t>(args); },
 },
 {
-    GemmMethod::GEMM_INTERLEAVED_2D,
-    "gemm_s16_12x8_2d",
-    nullptr,
-    [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53 && args._Msize > 4 && (args._Msize / args._maxthreads) < 8; },
-    [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<gemm_s16_12x8, int8_t, int32_t>(args); },
+    GemmMethod::GEMM_HYBRID,
+    "a64_hybrid_s8s32_dot_6x16",
+    [](const GemmArgs &args) { return args._ci->has_dotprod(); },
+    [](const GemmArgs &args) { return args._Nsize<=256 && args._Ksize>128; },
+    [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_s8s32_dot_6x16, int8_t, int32_t>(args); }
 },
 {
     GemmMethod::GEMM_INTERLEAVED,
-    "gemm_s16_12x8_1d",
-    nullptr,
-    [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53 && args._Msize > 4; },
-    [](const GemmArgs &args) { return new GemmInterleaved<gemm_s16_12x8, int8_t, int32_t>(args); },
-},
-{
-    GemmMethod::GEMM_INTERLEAVED_2D,
-    "gemm_s8_4x4_2d",
+    "a64_gemm_s8_8x12",
+    [](const GemmArgs &args) { return args._ci->has_dotprod(); },
     nullptr,
-    [](const GemmArgs &args) { return ((args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8)) ||
-                                       ((args._Msize / args._maxthreads) < 4); },
-    [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<gemm_s8_4x4, int8_t, int32_t>(args); }
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_s8_8x12, int8_t, int32_t>(args); }
 },
 {
     GemmMethod::GEMM_INTERLEAVED,
-    "gemm_s8_4x4_1d",
+    "a64_gemm_s8_4x4",
     nullptr,
     nullptr,
-    [](const GemmArgs &args) { return new GemmInterleaved<gemm_s8_4x4, int8_t, int32_t>(args); }
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_s8_4x4, int8_t, int32_t>(args); }
 },
 {
     GemmMethod::DEFAULT,
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index c4dceef922..92c1086a5f 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -27,11 +27,12 @@
 #include <cassert>
 
 #include "arm_gemm.hpp"
-#include "utils.hpp"
-
+#include "convolver.hpp"
 #include "mergeresults.hpp"
 #include "performance_parameters.hpp"
+#include "quantized.hpp"
 #include "transform.hpp"
+#include "utils.hpp"
 
 #ifdef CYCLE_PROFILING
 #include "profiler.hpp"
@@ -46,12 +47,212 @@
 //
 // This implementation interleaves the source matrices in blocks - good for
 // larger matrices.
+
 namespace arm_gemm {
 
-template<typename strategy, typename To, typename Tr>
+namespace {
+
+// Some kernels output to a linear buffer and require a separate merge step.
+// Others output directly to the matrix result.  This helper class calls the
+// appropriate functions, using templating to avoid calling non-existent
+// functions.  Only run() is declared here; its definitions are the explicit specializations on <MergeStep, OutputStage> that follow.
+template<bool MergeStep, typename OutputStage>
+class kernel_and_merge {
+public:
+    template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
+    static void run (
+#ifdef CYCLE_PROFILING
+        profiler &prof,
+#endif
+        strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel,
+        Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
+        unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr,
+        const Activation &act, bool accumulate, const OutputStage &os, const int32_t *col_bias,
+        Tab *acc_buff);
+};
+
+// Run a kernel and call the separate merge step.  The output stage, column bias and accumulation buffer arguments are unused.
+template<>
+template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
+void kernel_and_merge<true, Nothing>::run(
+#ifdef CYCLE_PROFILING
+        profiler &prof,
+#endif
+        strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel,
+        Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
+        unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr,
+        const Activation &act, bool accumulate, const Nothing &, const int32_t *, Tab *)
+{
+    const int bblocks = iceildiv(n_max - n_0, strategy::out_width()); // Number of out_width-wide column blocks needed to cover [n_0, n_max).
+
+    {
+#ifdef CYCLE_PROFILING
+        auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
+#endif
+
+        strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k); // NOTE(review): the literal 1 is presumably the number of A row blocks per call - confirm against the kernel signature.
+    }
+
+    {
+#ifdef CYCLE_PROFILING
+        auto p=prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr)));
+#endif
+        strat.transforms.Merge(c_ptr, c_panel, ldc, m_0, m_max, n_0, n_max, biasptr, act, accumulate);
+    }
+}
+
+// Run a kernel with integrated merge.  The intermediate C panel, output stage and column bias arguments are unused.
+template<>
+template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
+void kernel_and_merge<false, Nothing>::run(
+#ifdef CYCLE_PROFILING
+        profiler &prof,
+#endif
+        strategy &strat, const To *a_ptr, const To *b_panel, Tri *,
+        Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max,
+        unsigned int n_0, unsigned int n_max, const Tr *biasptr,
+        const Activation &act, bool accumulate, const Nothing &, const int32_t *,
+        Tab *acc_buff)
+{
+#ifdef CYCLE_PROFILING
+    auto p=prof.ScopedProfiler(PROFILE_KERNEL, (m_max - m_0) * (n_max - n_0) * kern_k);
+#endif
+
+    // We need to offset the C pointer, but as it might be NULL (requesting output to accumulation buffer) we need
+    // to be careful not to offset a null pointer.
+    Tri *offset_c_ptr; // NOTE(review): declared as Tri* but assigned from a Tr* below, so this presumably relies on Tr and Tri being the same type - confirm.
+
+    if (c_ptr == nullptr) {
+        offset_c_ptr = nullptr;
+    } else {
+        offset_c_ptr = c_ptr + m_0 * ldc + n_0;
+    }
+
+    strat.kernel(// A and B pointers are just the packed panels.
+                 a_ptr, b_panel,
+                 // Provide relevant part of output array and row stride.
+                 offset_c_ptr, ldc,
+                 // M, N, K sizes
+                 m_max-m_0, n_max - n_0, kern_k,
+                 // Bias (offset to column n_0 when a bias pointer is supplied, otherwise nullptr), activation, accumulation.
+                 biasptr ? biasptr + n_0 : nullptr, act, accumulate,
+                 // Accumulation buffer.
+                 acc_buff );
+}
+
+// Run a kernel with integrated merge and requantization (Requantize32 output stage).  The intermediate C panel, bias pointer and activation arguments are unused.
+template<>
+template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
+void kernel_and_merge<false, Requantize32>::run(
+#ifdef CYCLE_PROFILING
+        profiler &prof,
+#endif
+        strategy &strat, const To *a_ptr, const To *b_panel, Tri *,
+        Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max,
+        unsigned int n_0, unsigned int n_max, const Tr *,
+        const Activation &, bool accumulate, const Requantize32 &qp, const int32_t *col_bias,
+        Tab *acc_buff)
+{
+#ifdef CYCLE_PROFILING
+    auto p=prof.ScopedProfiler(PROFILE_KERNEL, (m_max - m_0) * (n_max - n_0) * kern_k);
+#endif
+
+    strat.kernel(// A and B pointers are just the packed panels.
+                 a_ptr, b_panel,
+                 // Provide relevant part of output array and row stride.
+                 c_ptr + m_0 * ldc + n_0, ldc, // NOTE(review): no null check on c_ptr here, unlike the <false, Nothing> variant - confirm callers never pass nullptr.
+                 // M, N, K sizes
+                 m_max-m_0, n_max - n_0, kern_k,
+                 // Column bias (offset to column n_0), quantization parameters, first column index, accumulate flag and accumulation buffer.
+                 col_bias + n_0, qp, n_0, accumulate, acc_buff);
+}
+
+// Run a kernel and call the separate quantize step.  The bias pointer, activation, accumulate flag and accumulation buffer arguments are unused.
+template<>
+template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
+void kernel_and_merge<true, Requantize32>::run(
+#ifdef CYCLE_PROFILING
+        profiler &prof,
+#endif
+        strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel,
+        Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
+        unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *,
+        const Activation &, bool, const Requantize32 &qp, const int32_t *col_bias,
+        Tab *)
+{
+    const int bblocks = iceildiv(n_max - n_0, strategy::out_width()); // Number of out_width-wide column blocks needed to cover [n_0, n_max).
+
+    {
+#ifdef CYCLE_PROFILING
+        auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
+#endif
+
+        strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
+    }
+
+    {
+#ifdef CYCLE_PROFILING
+        auto p=prof.ScopedProfiler(PROFILE_QUANTIZE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr)));
+#endif
+        // The interleaved kernel outputs in blocks - each block is a
+        // row-major matrix of size out_width * out_height.  The merge
+        // kernels are designed to deal with this but the requantizer is
+        // not, so we need to requantize one block at a time.
+        for (int i=0; i<bblocks; i++) {
+            unsigned int n_start = n_0 + (strategy::out_width() * i);
+            unsigned int n_end = std::min(n_start + strategy::out_width(), n_max); // Columns [n_start, n_end) of this block, clamped to n_max.
+
+            // The row bias is interleaved with the transposed A data, get a pointer to it here (the address does not depend on i).
+            const int32_t *row_bias = reinterpret_cast<const int32_t *>(a_ptr + strategy::out_height() * kern_k);
+
+            requantize_block_32(qp, (n_end - n_start), (m_max-m_0),
+                                c_panel + (i * strategy::out_width() * strategy::out_height()), strategy::out_width(),
+                                c_ptr + m_0 * ldc + n_start, ldc,
+                                row_bias, col_bias + n_start, n_start);
+        }
+    }
+}
+
+// Integer GEMMs can be used in two contexts - "normal" where the full 32-bit output is required, or in
+// "requantizing" context where the output will be requantized.
+//
+// These require different input transforms, as if we are requantizing we want to sum the rows of the A input, and
+// if we are not we don't.
+//
+// This helper class allows the appropriate transforms to be found, without requiring kernels that don't support
+// quantization to define useless "quantized" transforms.
+template<typename strategy, bool quantized>
+class transform_type {
+public:
+    typedef decltype(strategy::transforms) type; // Primary template: the type of the strategy's 'transforms' member.
+};
+
+template<typename strategy>
+class transform_type<strategy, true> {
+public:
+    typedef decltype(strategy::transforms_quantized) type; // quantized == true: the type of the strategy's 'transforms_quantized' member.
+};
+
+// We need a similar trick here to figure out what type the accumulator buffer should be.
+template<typename strategy, typename OutputStage>
+class accumulate_buffer_type {
+public:
+    typedef typename strategy::result_type type; // Primary template: accumulate in the strategy's result type.
+};
+
+template<typename strategy>
+class accumulate_buffer_type<strategy, Requantize32> {
+public:
+    typedef int32_t type; // Requantize32 output stage: accumulate in int32_t regardless of the strategy's result type.
+};
+
+} // anonymous namespace
+
+template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing, bool MergeStep=true, bool ForceThreadColumns=false>
 class GemmInterleaved : public GemmCommon<To, Tr> {
     typedef typename strategy::operand_type Toi;
     typedef typename strategy::result_type Tri;
+    typedef typename accumulate_buffer_type<strategy, OutputStage>::type Tab;
 
     /* const properties set by constructor */
     const CPUInfo * const _ci;
@@ -59,10 +260,15 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
     const unsigned int _Msize;
     const unsigned int _Nsize;
     const unsigned int _Ksize;
+    const unsigned int _Ksections;
+    const unsigned int _Ktotal;
+    const unsigned int _rounded_Ksize;
 
     const unsigned int _nbatches;
     const unsigned int _nmulti;
 
+    const bool _thread_columns;
+
     const Activation _act;
 
     const int _maxthreads;
@@ -77,30 +283,59 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
     const Toi *_B_transposed=nullptr;
     void *_working_space=nullptr;
 
+    Tab *_accumulation_buffer=nullptr;
+
+    /* Output stage */
+    OutputStage  _os;
+
+    /* Quantized support (in addition to 'output stage' above). */
+    int32_t *col_bias = nullptr;
+
+    /* Indirect parameters.  _indirect_buf doubles as a flag to indicate that "indirect" transform should be used. */
+    const To * const * const * _indirect_buf = nullptr;
+
+    /* Convolver - only set up for convolution problems, so also doubles as a flag. */
+    std::unique_ptr<convolver<To>>  _convolver = nullptr;
+
+    unsigned int get_col_sum_size() const { // Size in bytes of one int32_t per output column per multi; zero unless the output stage is Requantize32.
+        if (std::is_same<OutputStage, Requantize32>::value) {
+            return _Nsize * _nmulti * sizeof(int32_t);
+        } else {
+            return 0;
+        }
+    }
+
     /* We will need to walk through the blocks of B in a few contexts, so
      * factor that out.  */
     class blockwalker {
     private:
         /* Size loops, etc. based on our parent's configuration */
-        const GemmInterleaved<strategy, To, Tr> &_parent;
+        const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns> &_parent;
 
         /* K, X and multi parameters for current iteration. */
         unsigned int _k0=0, _x0=0, _multi=0;
 
+        /* Range of X to iterate over - used in "ForceThreadColumns" cases */
+        unsigned int _x_start=0;
+        unsigned int _x_end=_parent._Nsize;
+
         unsigned int _index=0;
         bool _done=false;
         bool _newkblock=true;
         bool _newmulti=true;
 
     public:
-        blockwalker(const GemmInterleaved<strategy, To, Tr> &parent) : _parent(parent) { }
+        blockwalker(const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns> &parent) : _parent(parent) { }
+
+        blockwalker(const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns> &parent, // Range constructor: walks only columns [x_start, x_end).
+                    unsigned int x_start, unsigned int x_end) : _parent(parent), _x0 (x_start), _x_start(x_start), _x_end(x_end) { } // _x0 is seeded from the x_start argument: the _x_start member is declared after _x0, so it is still uninitialized when _x0 is set.
 
         unsigned int xmax() {
-            return std::min(_x0 + _parent._x_block, _parent._Nsize);
+            return std::min(_x0 + _parent._x_block, _x_end);
         }
 
         unsigned int kmax() {
-            return std::min(_k0 + _parent._k_block, _parent._Ksize);
+            return std::min(_k0 + _parent._k_block, _parent._Ktotal);
         }
 
         /* Advance to the next block, return false at the end. */
@@ -111,10 +346,10 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
 
             _newkblock=false;
             _x0 += _parent._x_block;
-            if (_x0 >= _parent._Nsize) {
-                _x0=0;
+            if (_x0 >= _x_end) {
+                _x0=_x_start;
                 _k0 += _parent._k_block;
-                if (_k0 >= _parent._Ksize) {
+                if (_k0 >= _parent._Ktotal) {
                     _k0=0;
                     _multi++;
                     if (_multi >= _parent._nmulti) {
@@ -138,14 +373,125 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
         bool newkblock(void) { return _newkblock; }
     };
 
-    // A working size: One of these needed, regardless of thread count.  Divided according to window.
+    // "k block" has two distinct uses: figuring out which iterations of K
+    // to actually process, but also various size/pointer computations.  The
+    // latter needs to take account of the extra space needed for the row
+    // sums, if appropriate.  This returns the depth to use for those size/pointer computations.
+    unsigned int get_total_k_depth() const {
+        unsigned int k_depth = _k_block;
+
+        if (std::is_same<OutputStage, Requantize32>::value) {
+            k_depth += sizeof(int32_t) / sizeof(Toi); // Room for one int32_t row sum, counted in Toi-sized elements.
+        }
+
+        return k_depth;
+    }
+
+    // A working size.
     size_t get_a_working_size() const {
-        return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches);
+        if (_thread_columns) {
+            // For 2D threading: allocate a buffer of one block of rows per thread
+            return ROUND_UP(sizeof(Toi) * get_total_k_depth() * strategy::out_height() * _maxthreads);
+        } else {
+            // For 1D threaded: one of these needed, regardless of thread count.  Divided according to window.
+            return ROUND_UP(sizeof(Toi) * get_total_k_depth() * _Mround * _nbatches);
+        }
     }
 
-    // C working size: One needed per thread.
+    // C working size: One needed per thread.  Not needed if there is no merge step.
     size_t get_c_working_size() const {
-        return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
+        if (MergeStep) {
+            return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
+        } else {
+            return 0;
+        }
+    }
+
+    // Accumulation buffer size in bytes; zero when no accumulation buffer is needed.
+    size_t get_accumulation_buffer_size() const {
+        // We only support an accumulation buffer for non-merge cases.
+        if (MergeStep) {
+            return 0;
+        }
+
+        // Check if we are actually blocking: a single K block covering all of _Ktotal needs no buffer.
+        if (_k_block == _Ktotal) {
+            return 0;
+        }
+
+        // No merge step and K blocking is active: one out_height x out_width tile of Tab per output block, per batch and per multi.
+        size_t size_per_buffer = sizeof(Tab) * strategy::out_height() * strategy::out_width();
+        size_t num_buffers = iceildiv(_Msize, strategy::out_height()) * iceildiv(_Nsize, strategy::out_width()) * _nbatches * _nmulti;
+
+        return num_buffers * size_per_buffer;
+    }
+
+    // Get pointer into accumulation buffer for the block whose top-left element is (M, N) in the given batch and multi; nullptr if there is no buffer.
+    Tab *get_accumulation_buffer(unsigned int M, unsigned int N, unsigned int batch, unsigned int multi) const {
+        // Don't do anything if there's no buffer.
+        if (_accumulation_buffer == nullptr) {
+            return nullptr;
+        }
+
+        // Here we are indexing an appropriately sized pointer, so no sizeof() needed to convert to bytes.
+        size_t size_per_buffer = strategy::out_height() * strategy::out_width();
+
+        size_t buffer_rows = iceildiv(_Msize, strategy::out_height());
+        size_t buffer_cols = iceildiv(_Nsize, strategy::out_width());
+        size_t buffers_per_batch = (buffer_rows * buffer_cols);
+        size_t buffers_per_multi = buffers_per_batch * _nbatches;
+
+        // M/N must reference the top-left corner of a block.
+        size_t row = M / strategy::out_height();
+        assert(M % strategy::out_height() == 0);
+        size_t col = N / strategy::out_width();
+        assert(N % strategy::out_width() == 0);
+
+        size_t buffer_index = multi * buffers_per_multi + batch * buffers_per_batch + row * buffer_cols + col; // Ordering: multi, then batch, then block row, then block column.
+
+        return _accumulation_buffer + (buffer_index * size_per_buffer);
+    }
+
+    int32_t row_sum_multiplier() const { // Returns -b_offset of the Requantize32 output stage, or 0 for any other output stage.
+        if (std::is_same<OutputStage, Requantize32>::value) {
+            const Requantize32 *qp = reinterpret_cast<const Requantize32 *>(&_os); // The cast keeps this branch compilable for other OutputStage types; it only executes when _os really is a Requantize32.
+
+            return -qp->b_offset;
+        }
+
+        return 0;
+    }
+
+    // Heuristics to decide whether to use the 'thread columns' regime
+    static bool is_thread_columns(const GemmArgs &args) {
+        // For now, there is a template parameter to force it.
+        if (ForceThreadColumns) {
+            return true;
+        }
+
+        // Never do this for single threaded cases.
+        if (args._maxthreads == 1) {
+            return false;
+        }
+
+        // How many blocks of work are available for threading on M?
+        int m_blocks = iceildiv(args._Msize, strategy::out_height()) * args._nbatches;
+
+        // If we just can't share the work across threads with the row threading regime.
+        if (args._maxthreads > m_blocks) {
+            return true;
+        }
+
+        // If the row threading regime is too wasteful: m_blocks rounded up to a multiple of the thread count exceeds 120% of m_blocks (integer arithmetic).
+        if (((roundup(m_blocks, args._maxthreads) * 100) / m_blocks) > 120) {
+            return true;
+        }
+
+        return false;
+    }
+
+    static unsigned int get_ktotal(const GemmArgs &args) { // Total K depth: _Ksize rounded up to a multiple of k_unroll, multiplied by the number of K sections.
+        return args._Ksections * roundup(args._Ksize, strategy::k_unroll());
+    }
 
     static unsigned int get_k_block_size(const GemmArgs &args) {
@@ -153,6 +499,11 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
             return args._cfg->inner_block_size;
         }
 
+        // K blocking not supported if we are requantizing.
+        if (std::is_same<OutputStage, Requantize32>::value) {
+            return get_ktotal(args);
+        }
+
         const unsigned int L1_size = args._ci->get_L1_cache_size();
         unsigned int k_block;
 
@@ -165,58 +516,84 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
         k_block = std::max(k_block, 1U) * strategy::k_unroll();
 
         // Now tune to presented problem size; this is how many blocks we need.
-        unsigned int num_k_blocks = iceildiv(args._Ksize, k_block);
+        unsigned int num_k_blocks = iceildiv(get_ktotal(args), k_block);
 
         // So divide the space equally into that many blocks.
-        k_block = iceildiv(args._Ksize, num_k_blocks);
+        k_block = iceildiv(get_ktotal(args), num_k_blocks);
 
         // And round UP to the K unroll level required.
         k_block = roundup(k_block, strategy::k_unroll());
 
+        assert(k_block > 0);
+
         return k_block;
     }
 
-public:
-    GemmInterleaved(GemmInterleaved &) = delete;
-    GemmInterleaved & operator= (GemmInterleaved &) = delete;
-
-    /* Constructor */
-    GemmInterleaved(const GemmArgs &args)
-                    : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
-                      _nbatches(args._nbatches), _nmulti(args._nmulti),
-                      _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
-                      _k_block(get_k_block_size(args)) {
-        const unsigned int L2_size = _ci->get_L2_cache_size();
-
-        assert(_maxthreads > 0);
+    static unsigned int get_x_block_size(const GemmArgs &args) {
+        if (is_thread_columns(args)) {
+            // In 2D mode, override X block, because we will process width first.
+            return roundup(args._Nsize, strategy::out_width());
+        }
 
-        // Work out blocking parameters, or override from provided GemmConfig
-        // TODO: Move outer block into a static function too.
         if (args._cfg && args._cfg->outer_block_size) {
-            _x_block = args._cfg->outer_block_size;
-        } else {
-            // x_block: Work out how many rows (of length k_block) will fit in the L2
-            // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
-            _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
-                      (sizeof(Toi) * _k_block);
+            return roundup(args._cfg->outer_block_size, strategy::out_width());
+        }
 
-            // Needs to be (at least a single) multiple of the kernel output width.
-            _x_block /= strategy::out_width();
-            _x_block = std::max(_x_block, 1U) * strategy::out_width();
+        unsigned int x_block;
+        const unsigned int L2_size = args._ci->get_L2_cache_size();
+        const unsigned int k_block = get_k_block_size(args);
 
-            // And tune to the presented problem size.
-            unsigned int num_x_blocks = iceildiv(_Nsize, _x_block);
-            _x_block = iceildiv(_Nsize, num_x_blocks);
+        // x_block: Work out how many rows (of length k_block) will fit in the L2
+        // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
+        const unsigned int scaled_l2_size = (L2_size * 9) / 10;
+        const unsigned int k_block_area = k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height());
 
-            _x_block = iceildiv(_x_block, strategy::out_width());
-            _x_block *= strategy::out_width();
+        // .. if the L1 contents is bigger than the L2, just return a minimal size block.
+        if (k_block_area > scaled_l2_size) {
+            return strategy::out_width();
         }
 
-        // Work out the rounded size of M - needed for some buffers.
-        _Mround = iceildiv(_Msize, strategy::out_height());
-        _Mround *= strategy::out_height();
+        x_block = (scaled_l2_size - k_block_area) / (sizeof(Toi) * k_block);
+
+        // Needs to be (at least a single) multiple of the kernel output width.
+        x_block /= strategy::out_width();
+        x_block = std::max(x_block, 1u) * strategy::out_width();
+
+        // And tune to the presented problem size.
+        unsigned int num_x_blocks = iceildiv(args._Nsize, x_block);
+        x_block = iceildiv(args._Nsize, num_x_blocks);
+
+        x_block = roundup(x_block, strategy::out_width());
+
+        assert(x_block > 0);
+
+        return x_block;
     }
 
+public:
+    GemmInterleaved(GemmInterleaved &) = delete;
+    GemmInterleaved & operator= (GemmInterleaved &) = delete;
+
+    /* Constructor taking an explicit output stage, which is copied into _os.  Blocking sizes and the threading regime are computed from args by the static helpers. */
+    GemmInterleaved(const GemmArgs &args, const OutputStage &os)
+                    : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
+                      _Ksections(args._Ksections), _Ktotal(get_ktotal(args)),
+                      _rounded_Ksize(roundup(_Ksize, strategy::k_unroll())),
+                      _nbatches(args._nbatches), _nmulti(args._nmulti), _thread_columns(is_thread_columns(args)),
+                      _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
+                      _k_block(get_k_block_size(args)), _x_block(get_x_block_size(args)), _Mround(roundup(args._Msize, strategy::out_height())),
+                      _os(os) { }
+
+    /* Constructor without OutputStage: same member initialization from args, but _os is value-initialized. */
+    GemmInterleaved(const GemmArgs &args)
+                    : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
+                      _Ksections(args._Ksections), _Ktotal(get_ktotal(args)),
+                      _rounded_Ksize(roundup(_Ksize, strategy::k_unroll())),
+                      _nbatches(args._nbatches), _nmulti(args._nmulti), _thread_columns(is_thread_columns(args)),
+                      _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
+                      _k_block(get_k_block_size(args)), _x_block(get_x_block_size(args)), _Mround(roundup(args._Msize, strategy::out_height())),
+                      _os() { }
+
     // Interface implementation - Compulsory functions
 
     // Window size: Only the last thread should do a ragged block, so dole
@@ -224,8 +601,14 @@ public:
     // not multi for now (as this would cause problems with the buffer
     // manager).
     ndrange_t get_window_size() const override {
-        // _Mround is a multiple of out_height by definition.
-        return { (_Mround / strategy::out_height()) * _nbatches };
+        unsigned int row_blocks = (_Mround / strategy::out_height()) * _nbatches;
+
+        if (_thread_columns) {
+            return { row_blocks, iceildiv(_Nsize, strategy::out_width()) };
+        } else {
+            // _Mround is a multiple of out_height by definition.
+            return { row_blocks };
+        }
     }
 
     // set_nthreads: pass on to buffer manager to avoid it waiting for non-existant threads.
@@ -235,117 +618,262 @@ public:
 
     // Execute
     void execute(const ndcoord_t &work_range, const ndcoord_t &, int threadid) override {
-        const auto start = work_range.get_position(0);
-        const auto end   = work_range.get_position_end(0);
 #ifdef CYCLE_PROFILING
         profiler prof;
 #endif
+
+        /* Make sure we've been set up correctly. */
+        assert(_B_transposed);
+        assert(_working_space);
+        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);
+
+        /* Align if needed */
+        intptr_t working_space_v = reinterpret_cast<intptr_t>(_working_space);
+        if (working_space_v & 0x3f) {
+            intptr_t alignment_offset = 0x40 - (working_space_v & 0x3f);
+            working_space_bytes += alignment_offset;
+        }
+
         strategy strat(_ci);
 
-        blockwalker current(*this);
+        const auto start = work_range.get_position(0);
+        const auto end   = work_range.get_position_end(0);
 
         /* Translate 'start' and 'end' into a position within the batches and rows. */
         const unsigned int window_per_batch = _Mround / strategy::out_height();
         unsigned int batch_0   = start / window_per_batch;
         unsigned int batch_end = end   / window_per_batch;
 
-        /* Compute the M values to operate on */
-        unsigned int m_0   = (start - (batch_0 * window_per_batch)) * strategy::out_height();
-        unsigned int m_max = (end - (batch_end * window_per_batch)) * strategy::out_height();
+        // In ThreadColumns mode, process work one horizontal strip at a time.
+        // Transpose the block of needed rows at the start, then do all the work on that block.
+        if (_thread_columns) {
+            const auto start_x = work_range.get_position(1) * strategy::out_width();
+            const auto end_x = std::min(work_range.get_position_end(1) * strategy::out_width(), _Nsize);
 
-        /* Make sure we've been set up correctly. */
-        assert(_B_transposed);
-        assert(_working_space);
-        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);
+            Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));
+            Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()) +
+                                       (threadid * sizeof(Toi) * get_total_k_depth() * strategy::out_height()));
 
-        // Private buffers.  Treat working_space as an array of C buffers
-        // (one per thread) first, followed by the (window-divided) A
-        // buffer.
-        // Set a_panel to the base of the A buffers - compute offsets into it based on M/batches later.
-        Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()));
-        Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));
+            for (unsigned int multi=0; multi<_nmulti; multi++) {
+                for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
+                    unsigned int kmax=std::min(k0+_k_block, _Ktotal);
 
-        const Toi *b_panel;
-        b_panel = _B_transposed;
+                    unsigned int rounded_width = roundup(_Nsize, strategy::out_width());
 
-        //printf("Starting GEMM loop, x_block=%d, k_block=%d\n", _x_block, _k_block);
+                    const bool first_pass = (k0==0);
+                    const bool last_pass  = (kmax==_Ktotal);
 
-        // newkblock() is always true on the first iteration, so this will be set properly on the first loop.
-        int kern_k = 0;
+                    // Figure out how many "K" the kernel will actually process.
+                    unsigned int kern_k = roundup(kmax - k0, strategy::k_unroll());
 
-        for (;!current.done();current.advance()) {
-            if (current.newkblock()) {
-#ifdef CYCLE_PROFILING
-                auto p=prof.ScopedProfiler(PROFILE_PREPA, (end - start) * strategy::out_height() * (current.kmax()-current.k0()) * sizeof(Toi));
-#endif
-                for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
-                    unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
-                    unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;
+                    const Toi *b_ptr = _B_transposed + (rounded_width * _Ktotal * multi) + (k0 * rounded_width) + (start_x * kern_k);
 
-                    if (first_m >= last_m)
-                        continue;
+                    unsigned int batch     = batch_0;
+                    unsigned int start_row = (start - (batch_0 * window_per_batch)) * strategy::out_height();
 
-                    strat.transforms.PrepareA(a_panel + ((batch * _Mround + first_m) * _k_block),
-                                              this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
-                                              this->_lda, first_m, last_m, current.k0(), current.kmax());
-                }
+                    for (unsigned int p=start; p<end; p++) {
+                        unsigned int end_row = std::min(start_row + strategy::out_height(), _Msize);
 
-                // Figure out how many "K" the kernel will actually process.
-                kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll());
-                kern_k *= strat.k_unroll();
+                        // Set up transposed 'A' block
+                        {
+#ifdef CYCLE_PROFILING
+                            auto p=prof.ScopedProfiler(PROFILE_PREPA, strategy::out_height() * (kmax-k0) * sizeof(Toi));
+#endif
+                            // See comment above on transform_type<> class: this extracts either 'transforms' or
+                            // 'transforms_quantized' as appropriate.
+                            typename transform_type<strategy, MergeStep && std::is_same<OutputStage, Requantize32>::value>::type transforms;
+
+                            if (_indirect_buf != nullptr) {
+                                transforms.PrepareA_indirect(a_panel,
+                                                             _indirect_buf + (multi * _nbatches * _Ksections) + (batch * _Ksections), _Ksize,
+                                                             _rounded_Ksize, start_row, end_row, k0, kmax, row_sum_multiplier());
+                            } else if (_convolver) {
+                                transforms.PrepareA_convolution(a_panel,
+                                                                this->_Aptr + (batch * this->_A_batch_stride) + (multi * this->_A_multi_stride),
+                                                                this->_lda, *_convolver, _rounded_Ksize, start_row, end_row, k0, kmax, row_sum_multiplier());
+                            } else {
+                                transforms.PrepareA(a_panel,
+                                                    this->_Aptr + (batch * this->_A_batch_stride) + (multi * this->_A_multi_stride),
+                                                    this->_lda, start_row, end_row, k0, std::min(kmax, _Ksize), row_sum_multiplier());
+                            }
+                        }
+
+                        // Perform the kernel and merge step, either separately or together as required.
+                        kernel_and_merge<MergeStep, OutputStage>::run(
+                        #ifdef CYCLE_PROFILING
+                            prof,
+                        #endif
+                            // Strategy and panel pointers
+                            strat, a_panel, b_ptr, c_panel,
+                            // Result buffer pointers
+                            this->_Cptr + (batch * this->_C_batch_stride) + (multi * this->_C_multi_stride), this->_ldc,
+                            // K size, and M/N ranges
+                            kern_k, start_row, end_row, start_x, end_x,
+                            // Only do bias on the first pass
+                            ((first_pass && this->_bias) ? this->_bias + (multi * this->_bias_multi_stride) : nullptr),
+                            // Only do activation on the last pass, and accumulation on any non-first pass.
+                            (last_pass ? _act : Activation()), !first_pass,
+                            // Pass in quantization parameters for requantizing kernels (others will ignore)
+                            _os, col_bias + (multi * _Nsize),
+                            // Accumulation buffer (not yet implemented on this path)
+                            static_cast<Tab *>(nullptr));
+
+                        /* Increment to the next block */
+                        start_row += strategy::out_height();
+                        if (start_row >= _Msize) {
+                            start_row = 0;
+                            batch++;
+                        }
+                    }
+                }
             }
+        } else {
+            blockwalker current(*this);
+
+            /* Compute the M values to operate on */
+            unsigned int m_0   = (start - (batch_0 * window_per_batch)) * strategy::out_height();
+            unsigned int m_max = (end - (batch_end * window_per_batch)) * strategy::out_height();
+
+            // Private buffers.  Treat working_space as an array of C buffers
+            // (one per thread) first, followed by the (window-divided) A
+            // buffer.
+            // Set a_panel to the base of the A buffers - compute offsets into it based on M/batches later.
+            Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()));
+            Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));
 
-            int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width());
+            const Toi *b_panel;
+            b_panel = _B_transposed;
 
-            /* Do the actual work. */
-            for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
-                unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
-                unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;
+            // newkblock() is always true on the first iteration, so these will be set properly on the first loop.
 
-                const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * _k_block;
+            // kern_k tracks the accumulation depth for the CURRENT K block; a_panel_stride similarly tracks the total
+            // stride of the A panel (i.e. with 4 bytes added for cases with embedded row sums).
 
-                if (first_m >= last_m)
-                    continue;
+            // These are distinct from k_block and get_total_k_depth() which are based on the target K block size, and
+            // used for addressing inside a_panel.
 
-                for (unsigned int y=first_m; y<last_m; y+=strategy::out_height()) {
-                    unsigned int ymax = std::min(_Msize, y + strategy::out_height());
+            // In cases where K blocking is in use and the blocks are not all the same size, the (smaller) final block
+            // won't use all the memory allocated.
+            unsigned int kern_k = 0;
+            unsigned int a_panel_stride = 0;
 
-                    {
+            for (;!current.done();current.advance()) {
+                if (current.newkblock()) {
 #ifdef CYCLE_PROFILING
-                        auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
+                    auto p=prof.ScopedProfiler(PROFILE_PREPA, (end - start) * strategy::out_height() * (current.kmax()-current.k0()) * sizeof(Toi));
 #endif
+                    // See comment above on transform_type<> class: this extracts either 'transforms' or
+                    // 'transforms_quantized' as appropriate.
+                    typename transform_type<strategy, MergeStep && std::is_same<OutputStage, Requantize32>::value>::type transforms;
+
+                    for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
+                        unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
+                        unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;
+
+                        if (first_m >= last_m)
+                            continue;
+
+                        if (_indirect_buf != nullptr) {
+                            transforms.PrepareA_indirect(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()),
+                                                      _indirect_buf + (current.multi() * _nbatches * _Ksections) + (batch * _Ksections), _Ksize,
+                                                      _rounded_Ksize, first_m, last_m, current.k0(), current.kmax(), row_sum_multiplier());
+                        } else if (_convolver) {
+                            transforms.PrepareA_convolution(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()),
+                                                      this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
+                                                      this->_lda, *_convolver, _rounded_Ksize, first_m, last_m, current.k0(), current.kmax(), row_sum_multiplier());
+                        } else {
+                            transforms.PrepareA(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()),
+                                                      this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
+                                                      this->_lda, first_m, last_m, current.k0(), std::min(_Ksize, current.kmax()), row_sum_multiplier());
+                        }
+                    }
+
+                    // Figure out how many "K" the kernel will actually process.
+                    kern_k = roundup(current.kmax() - current.k0(), strategy::k_unroll());
 
-                        strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
+                    // Requantizing GEMMs have the row sums built in to the
+                    // transposed data, so the stride between rows is 4 bytes
+                    // larger than the (rounded) K value.
 
-                        a_ptr += (strategy::out_height() * kern_k);
+                    if(std::is_same<OutputStage, Requantize32>::value) {
+                        a_panel_stride = kern_k + (sizeof(int32_t) / sizeof(Toi));
+                    } else {
+                        a_panel_stride = kern_k;
                     }
+                }
 
-                    {
-#ifdef CYCLE_PROFILING
-                        auto p=prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr)));
-#endif
-                        /* Only activate on last pass, only add bias on first pass, ask for accumulation on any non-first pass */
-                        const bool first_pass = current.k0()==0;
-                        const bool last_pass  = current.kmax()==_Ksize;
-
-                        strat.transforms.Merge(this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride),
-                                               c_panel, this->_ldc, y, ymax, current.x0(), current.xmax(),
-                                               ((first_pass && this->_bias) ? this->_bias + (current.multi() * this->_bias_multi_stride) : nullptr),
-                                               (last_pass ? _act : Activation()), !first_pass);
+                /* Do the actual work. */
+                for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
+                    unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
+                    unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;
+
+                    const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * get_total_k_depth();
+
+                    if (first_m >= last_m)
+                        continue;
+
+                    // For the merge case we need to do this out_height() rows
+                    // at a time, as that is the size of our intermediate
+                    // buffer.  If we are not doing that, we can do all the
+                    // relevant rows in one go.
+                    unsigned int m_step = MergeStep ? strategy::out_height() : (last_m - first_m);
+
+                    // But in the case where we have an accumulation buffer, we can't do that after all, unless
+                    // there is no N blocking.
+                    if (_accumulation_buffer && ((current.x0() != 0) || (current.xmax() < _Nsize))) {
+                        m_step = strategy::out_height();
+                    }
+
+                    for (unsigned int y=first_m; y<last_m; y+=m_step) {
+                        unsigned int ymax = std::min(_Msize, y + m_step);
+
+                        const bool first_pass = (current.k0() == 0);
+                        const bool last_pass  = (current.kmax() == _Ktotal);
+
+                        // Pointer to appropriate part of result array.
+                        Tr *result_ptr = this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride);
+
+                        // If we are using an accumulation buffer, pass a null result pointer so that the kernel
+                        // writes into the accumulation buffer instead - except on the last pass.
+                        if (_accumulation_buffer && !last_pass) {
+                            result_ptr = nullptr;
+                        }
+
+                        // Perform the kernel and merge step, either separately or together as required.
+                        kernel_and_merge<MergeStep, OutputStage>::run(
+                        #ifdef CYCLE_PROFILING
+                            prof,
+                        #endif
+                            // Strategy and panel pointers
+                            strat, a_ptr, b_panel, c_panel,
+                            // Result buffer pointers
+                            result_ptr, this->_ldc,
+                            // K size, and M/N ranges
+                            kern_k, y, ymax, current.x0(), current.xmax(),
+                            // Only do bias on the first pass
+                            ((first_pass && this->_bias) ? this->_bias + (current.multi() * this->_bias_multi_stride) : nullptr),
+                            // Only do activation on the last pass, and accumulation on any non-first pass.
+                            (last_pass ? _act : Activation()), !first_pass,
+                            // Pass in quantization parameters for requantizing kernels (others will ignore)
+                            _os, col_bias + (current.multi() * _Nsize),
+                            // Accumulation buffer
+                            get_accumulation_buffer(y, current.x0(), batch, current.multi()) );
+
+                        a_ptr += (strategy::out_height() * a_panel_stride);
                     }
                 }
-            }
 
-            b_panel += (bblocks * strat.out_width() * kern_k);
+                b_panel += (roundup(current.xmax() - current.x0(), strategy::out_width()) * kern_k);
+            }
         }
     }
 
     // Interface implementation - working space
     size_t get_working_size() const override {
-        // In all cases, we need one A buffer plus a C buffer per thread.
-        size_t size = get_a_working_size() + (get_c_working_size() * _maxthreads);
+        // In all cases, we need one A buffer plus a C buffer per thread, plus an accumulation buffer.
+        size_t size = get_a_working_size() + (get_c_working_size() * _maxthreads) + get_accumulation_buffer_size();
 
-        size += 64; // Add on a cache line extra for alignment.
+        size += 128; // Add on two cache lines extra for alignment.
 
         return size;
     }
@@ -362,9 +890,22 @@ public:
         }
 
         working_space_bytes += diff;
+        working_space_int += diff;
 
         // Pretransposed case: just set internal pointer to parameter value.
         _working_space = reinterpret_cast<void *>(working_space_bytes);
+
+        // Set up accumulation buffer
+        if (get_accumulation_buffer_size() > 0) {
+            intptr_t acc_buff_int = working_space_int + get_a_working_size() + (get_c_working_size() * _maxthreads);
+            // Make sure the accumulation buffer is aligned (needed if the other blocks are not a multiple of cache line length)
+            if (acc_buff_int & 0x3F) {
+                acc_buff_int += (0x40 - (acc_buff_int & 0x3F));
+            }
+            _accumulation_buffer = reinterpret_cast<Tab *>(acc_buff_int);
+        } else {
+            _accumulation_buffer = nullptr;
+        }
     }
 
     // Interface implementation - pretransposed
@@ -376,56 +917,105 @@ public:
         return (_B_transposed==nullptr);
     }
 
-    // TODO: this could almost certainly be considerably simpler.
     size_t get_B_pretransposed_array_size() const override {
-        size_t total=0;
-        blockwalker current(*this);
+        unsigned int x_size = roundup(_Nsize, strategy::out_width());
 
-        do {
-            /* Figure out the size of each block. */
-            unsigned int x_size = (current.xmax() - current.x0());
-            unsigned int k_size = (current.kmax() - current.k0());
+        return (x_size * _Ktotal * _nmulti * sizeof(Toi)) + get_col_sum_size();
+    }
 
-            /* Round sizes up as needed. */
-            x_size = iceildiv(x_size, strategy::out_width());
-            x_size *= strategy::out_width();
+    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
+        if (std::is_same<OutputStage, Requantize32>::value) {
+            col_bias = reinterpret_cast<int32_t *>(in_buffer);
 
-            k_size = iceildiv(k_size, strategy::k_unroll());
-            k_size *= strategy::k_unroll();
+            Requantize32 *qp_ptr = reinterpret_cast<Requantize32 *>(&_os);
 
-            total += x_size * k_size * sizeof(Toi);
-        } while (current.advance());
+            for (unsigned int i=0; i<_nmulti; i++) {
+                // The input is assumed not to have any padding between sections, so a straightforward Ksize * Ksections computation gives the total size.
+                compute_col_sums(*qp_ptr, _Nsize, _Ksize * _Ksections, B + (i * B_multi_stride), ldb, col_bias + (i * _Nsize), _Ksize * _Ksections, i, 0);
+            }
+        }
 
-        return total;
-    }
+        // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0
+        uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
+        Toi *buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
+        _B_transposed = buffer;
 
-    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
         blockwalker current(*this);
-        Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
-        _B_transposed = buffer;
         strategy strat(_ci);
 
         do {
             /* Figure out the size of each block. */
-            unsigned int x_size = (current.xmax() - current.x0());
             unsigned int k_size = (current.kmax() - current.k0());
 
-            /* Round sizes up as needed. */
-            x_size = iceildiv(x_size, strategy::out_width());
-            x_size *= strategy::out_width();
+            // We need to insert padding at the end of each K section.
+            // The computation needed is a little delicate - the coordinates from the block walker are expressed in
+            // terms of the full, padded, _Ktotal.
+            // But we need to transform each section with reference to the original, unpadded, input, letting the
+            // transform pad each section as needed.
+
+            // This is needed for computations below.
+            const unsigned int rounded_section_size = roundup(_Ksize, strategy::k_unroll());
+
+            // The expected output format is one entire set of <out_width> columns interleaved, then the next set
+            // of columns, and so on.  This means, as we are breaking it up vertically, we have to do it one set of
+            // <out_width> columns at a time.
+            for (unsigned int x0=current.x0(); x0 < current.xmax(); x0 += strategy::out_width() ){
+                unsigned int xmax = std::min(x0 + strategy::out_width(), current.xmax());
+
+                // Track where we are and how much work is left.
+                unsigned int kpos  = current.k0();
+                unsigned int kleft = k_size;
+
+                while (kleft) {
+                    // Which section are we in?  Based on the rounded-up section size.
+                    unsigned int k_section_base = kpos / rounded_section_size;
+                    // How far into the section are we?
+                    unsigned int k_offset = kpos - (k_section_base * rounded_section_size);
+
+                    // We will either copy the rest of this section, or to the end of the requested length.
+                    unsigned int k_length = std::min(_Ksize - k_offset, kleft);
+
+                    strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
+                                              x0, xmax,
+                                              (k_section_base * _Ksize) + k_offset,               // K starting point - compute row to read based on our section and the true section length.
+                                              (k_section_base * _Ksize) + k_offset + k_length);   // K end point - starting point plus length computed above.
 
-            k_size = iceildiv(k_size, strategy::k_unroll());
-            k_size *= strategy::k_unroll();
+                    // We need to modify our position based on the ROUNDED version of what we just did.
+                    unsigned int padded_length = roundup(k_length, strategy::k_unroll());
 
-            strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
-                                      current.x0(), current.xmax(), current.k0(), current.kmax());
+                    buffer += strategy::out_width() * padded_length;
 
-            buffer += (x_size * k_size);
+                    kpos  += padded_length;
+                    kleft -= padded_length;
+                }
+            }
         } while (current.advance());
     }
 
     void set_pretransposed_B_data(void *in_buffer) override {
-        _B_transposed = reinterpret_cast<Toi *>(in_buffer);
+        // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0
+        uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
+        _B_transposed = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
+        col_bias = reinterpret_cast<int32_t *>(in_buffer);
+    }
+
+    void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride) override {
+        if (std::is_same<OutputStage, Requantize32>::value) {
+            Requantize32 *qp = reinterpret_cast<Requantize32 *>(&_os);
+
+            qp->bias = bias;
+            qp->bias_multi_stride = bias_multi_stride;
+        }
+    }
+
+    void set_indirect_parameters(size_t string_len, const To * const * const *ptr) override {
+        assert(string_len == _Ksize);
+        _indirect_buf = ptr;
+    }
+
+    void set_convolution_parameters(ConvolutionParameters parms) override {
+        assert(parms.input_channels == _Ksize);
+        _convolver = std::unique_ptr<convolver<To>>(new convolver<To>(parms));
     }
 
     // Estimate cycles for given problem given provided parameters
@@ -454,4 +1044,14 @@ public:
     }
 };
 
+// Aliases for the variations
+template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing>
+using GemmInterleavedNoMerge = GemmInterleaved<strategy, To, Tr, OutputStage, false>;
+
+template<typename strategy, typename To, typename Tr>
+using GemmInterleavedPretransposedNoMergeQuantizedInline = GemmInterleaved<strategy, To, Tr, Requantize32, false>;
+
+template<typename strategy, typename To, typename Tr>
+using GemmInterleavedQuantized = GemmInterleaved<strategy, To, Tr, Requantize32>;
+
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
index bdccd05326..b71f390ab9 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
@@ -250,7 +250,8 @@ class GemmInterleavedPretransposed2d : public GemmCommon<To, Tr> {
                         first_m,
                         last_m,
                         current.k0(),
-                        current.kmax());
+                        current.kmax(),
+                        0);
                 }
             }
 
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp
index 04cac6095c..05c5116bf3 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp
@@ -25,68 +25,151 @@
 
 #include "arm_gemm.hpp"
 
-#include "kernels/a64_hybrid_s8s32_dot_16x4.hpp"
-#include "kernels/a64_smallK_hybrid_s8s32_dot_4x6.hpp"
-#include "kernels/a64_smallK_hybrid_s8s32_dot_4x8.hpp"
-#include "kernels/sve_hybrid_s8s32_dot_4VLx4.hpp"
-#include "kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp"
+#include "kernels/a64_gemm_s16_8x12.hpp"
+#include "kernels/a64_gemm_s8_4x4.hpp"
+#include "kernels/a64_gemm_s8_8x12.hpp"
+#include "kernels/a64_hybrid_s8qa_dot_4x16.hpp"
+#include "kernels/a64_hybrid_s8qs_dot_6x16.hpp"
+#include "kernels/a64_hybrid_s8s32_dot_6x16.hpp"
+#include "kernels/a64_interleaved_s8s32_mmla_8x12.hpp"
+#include "kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp"
+#include "kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp"
 
+#include "kernels/sve_hybrid_s8s32_dot_6x4VL.hpp"
+#include "kernels/sve_hybrid_s8qa_dot_4x4VL.hpp"
+#include "kernels/sve_hybrid_s8qs_dot_6x4VL.hpp"
+#include "kernels/sve_interleaved_s8s32_dot_8x3VL.hpp"
+#include "kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp"
+#include "kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp"
+
+#include "gemm_hybrid_indirect.hpp"
 #include "gemm_hybrid_quantized.hpp"
+#include "gemm_hybrid_quantized_inline.hpp"
+#include "gemm_interleaved.hpp"
 #include "quantize_wrapper.hpp"
+#include "utils.hpp"
 
 namespace arm_gemm {
 
 static const GemmImplementation<int8_t, int8_t, Requantize32> gemm_qint8_methods[] =
 {
 #ifdef __ARM_FEATURE_SVE
+#ifdef MMLA_INT8
 {
-    GemmMethod::GEMM_HYBRID_QUANTIZED,
-    "smallK_hybrid_s8s32_dot_1VLx8",
-    [](const GemmArgs &args, const Requantize32 &) { return args._Ksize<=64; },
+    GemmMethod::GEMM_INTERLEAVED,
+    "sve_interleaved_s8s32_mmla_8x3VL",
+    [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>8); },
     nullptr,
-    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<smallK_hybrid_s8s32_dot_1VLx8, int8_t, int8_t>(args, qp); }
+    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, int8_t>(args, qp); }
 },
+#endif
 {
     GemmMethod::GEMM_HYBRID_QUANTIZED,
-    "hybrid_s8s32_dot_4VLx4",
-    [](const GemmArgs &args, const Requantize32 &) { return args._Ksize>=16; },
-    [](const GemmArgs &args, const Requantize32 &) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
-    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<hybrid_s8s32_dot_4VLx4, int8_t, int8_t>(args, qp); }
+    "sve_smallK_hybrid_s8s32_dot_8x1VL",
+    [](const GemmArgs &args, const Requantize32 &) { return args._Ksize<=64 && !args._indirect_input; },
+    nullptr,
+    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<cls_sve_smallK_hybrid_s8s32_dot_8x1VL, int8_t, int8_t>(args, qp); }
+},
+#ifdef SVE2
+{
+    GemmMethod::GEMM_HYBRID,
+    "sve_hybrid_s8qs_dot_6x4VL",
+    [](const GemmArgs &args, const Requantize32 &qp) { return quant_hybrid_symmetric(qp); },
+    nullptr,
+    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8qs_dot_6x4VL, int8_t, int8_t, Requantize32>(args, qp); }
+},
+{
+    GemmMethod::GEMM_HYBRID,
+    "sve_hybrid_s8qa_dot_4x4VL",
+    [](const GemmArgs &args, const Requantize32 &qp) { return quant_hybrid_asymmetric(qp); },
+    nullptr,
+    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8qa_dot_4x4VL, int8_t, int8_t, Requantize32>(args, qp); }
 },
 #endif
 {
-    GemmMethod::GEMM_HYBRID_QUANTIZED,
-    "smallK_hybrid_s8s32_dot_4x8",
-    [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32); },
+    GemmMethod::GEMM_HYBRID,
+    "sve_hybrid_s8s32_dot_6x4VL",
+    nullptr,
+    nullptr,
+    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8s32_dot_6x4VL, int8_t, int8_t, Requantize32, true>(args, qp); }
+},
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "sve_interleaved_s8s32_dot_8x3VL",
+    [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>4); },
     nullptr,
-    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<smallK_hybrid_s8s32_dot_4x8, int8_t, int8_t>(args, qp); }
+    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, int8_t>(args, qp); }
 },
+#endif // __ARM_FEATURE_SVE
+#ifdef MMLA_INT8
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "a64_interleaved_s8s32_mmla_8x12",
+    [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>8); },
+    nullptr,
+    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_interleaved_s8s32_mmla_8x12, int8_t, int8_t>(args, qp); }
+},
+#endif
 {
     GemmMethod::GEMM_HYBRID_QUANTIZED,
-    "smallK_hybrid_s8s32_dot_4x6",
-    [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64); },
+    "a64_smallK_hybrid_s8s32_dot_8x4",
+    [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32) && !args._indirect_input; },
     nullptr,
-    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<smallK_hybrid_s8s32_dot_4x6, int8_t, int8_t>(args, qp); }
+    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<cls_a64_smallK_hybrid_s8s32_dot_8x4, int8_t, int8_t>(args, qp); }
 },
 {
     GemmMethod::GEMM_HYBRID_QUANTIZED,
-    "hybrid_s8s32_dot_16x4",
-    [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && args._Ksize>=16; },
-    [](const GemmArgs &args, const Requantize32 &) { return args._Nsize<=256 && args._Ksize>128; },
-    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<hybrid_s8s32_dot_16x4, int8_t, int8_t>(args, qp); }
+    "a64_smallK_hybrid_s8s32_dot_6x4",
+    [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64) && !args._indirect_input; },
+    nullptr,
+    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<cls_a64_smallK_hybrid_s8s32_dot_6x4, int8_t, int8_t>(args, qp); }
 },
-/** QUANTIZE_WRAPPER_2D enables 2D parallelisation hint for IScheduler in NEGEMMAssemblyDispatch */
 {
-    GemmMethod::QUANTIZE_WRAPPER_2D,
-    "quantized_wrapper_2d",
+    GemmMethod::GEMM_INTERLEAVED,
+    "a64_gemm_s16_8x12",
     nullptr,
-    [](const GemmArgs &args, const Requantize32 &) { return (args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8);},
-    [](const GemmArgs &args, const Requantize32 &qp) { return new QuantizeWrapper<int8_t, int8_t, int32_t>(args, qp); }
+    [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() == CPUModel::A53; },
+    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_s16_8x12, int8_t, int8_t>(args, qp); }
+},
+{
+    GemmMethod::GEMM_HYBRID,
+    "a64_hybrid_s8qs_dot_6x16",
+    [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_dotprod() && quant_hybrid_symmetric(qp); },
+    nullptr,
+    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8qs_dot_6x16, int8_t, int8_t, Requantize32>(args, qp); }
+},
+{
+    GemmMethod::GEMM_HYBRID,
+    "a64_hybrid_s8qa_dot_4x16",
+    [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_dotprod() && quant_hybrid_asymmetric(qp); },
+    nullptr,
+    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8qa_dot_4x16, int8_t, int8_t, Requantize32>(args, qp); }
+},
+{
+    GemmMethod::GEMM_HYBRID,
+    "a64_hybrid_s8s32_dot_6x16",
+    [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); },
+    nullptr,
+    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8s32_dot_6x16, int8_t, int8_t, Requantize32, true>(args, qp); }
+},
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "a64_gemm_s8_8x12",
+    [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); },
+    nullptr,
+    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_s8_8x12, int8_t, int8_t>(args, qp); }
+},
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "a64_gemm_s8_4x4",
+    nullptr,
+    nullptr,
+    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_s8_4x4, int8_t, int8_t>(args, qp); }
 },
 {
     GemmMethod::QUANTIZE_WRAPPER,
     "quantized_wrapper",
-    nullptr,
+    [](const GemmArgs &args, const Requantize32 &) { return !args._indirect_input; },
     nullptr,
     [](const GemmArgs &args, const Requantize32 &qp) { return new QuantizeWrapper<int8_t, int8_t, int32_t>(args, qp); }
 },
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp
index 0125f9c5db..7342fda5d1 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp
@@ -25,13 +25,25 @@
 
 #include "arm_gemm.hpp"
 
-#include "kernels/a64_hybrid_u8u32_dot_16x4.hpp"
-#include "kernels/a64_smallK_hybrid_u8u32_dot_4x6.hpp"
-#include "kernels/a64_smallK_hybrid_u8u32_dot_4x8.hpp"
-#include "kernels/sve_hybrid_u8u32_dot_4VLx4.hpp"
-#include "kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp"
+#include "kernels/a64_gemm_u16_8x12.hpp"
+#include "kernels/a64_gemm_u8_4x4.hpp"
+#include "kernels/a64_gemm_u8_8x12.hpp"
+#include "kernels/a64_hybrid_u8qa_dot_4x16.hpp"
+#include "kernels/a64_hybrid_u8u32_dot_6x16.hpp"
+#include "kernels/a64_interleaved_u8u32_mmla_8x12.hpp"
+#include "kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp"
+#include "kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp"
 
+#include "kernels/sve_hybrid_u8u32_dot_6x4VL.hpp"
+#include "kernels/sve_hybrid_u8qa_dot_4x4VL.hpp"
+#include "kernels/sve_interleaved_u8u32_dot_8x3VL.hpp"
+#include "kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp"
+#include "kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp"
+
+#include "gemm_hybrid_indirect.hpp"
 #include "gemm_hybrid_quantized.hpp"
+#include "gemm_hybrid_quantized_inline.hpp"
+#include "gemm_interleaved.hpp"
 #include "quantize_wrapper.hpp"
 
 namespace arm_gemm {
@@ -39,54 +51,108 @@ namespace arm_gemm {
 static const GemmImplementation<uint8_t, uint8_t, Requantize32> gemm_quint8_methods[] =
 {
 #ifdef __ARM_FEATURE_SVE
+#ifdef MMLA_INT8
 {
-    GemmMethod::GEMM_HYBRID_QUANTIZED,
-    "smallK_hybrid_u8u32_dot_1VLx8",
-    [](const GemmArgs &args, const Requantize32 &) { return args._Ksize<=64; },
+    GemmMethod::GEMM_INTERLEAVED,
+    "sve_interleaved_u8u32_mmla_8x3VL",
+    [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>8); },
     nullptr,
-    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<smallK_hybrid_u8u32_dot_1VLx8, uint8_t, uint8_t>(args, qp); }
+    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_u8u32_mmla_8x3VL, uint8_t, uint8_t>(args, qp); }
 },
+#endif
 {
     GemmMethod::GEMM_HYBRID_QUANTIZED,
-    "hybrid_u8u32_dot_4VLx4",
-    [](const GemmArgs &args, const Requantize32 &) { return args._Ksize>=16; },
-    [](const GemmArgs &args, const Requantize32 &) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
-    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<hybrid_u8u32_dot_4VLx4, uint8_t, uint8_t>(args, qp); }
+    "sve_smallK_hybrid_u8u32_dot_8x1VL",
+    [](const GemmArgs &args, const Requantize32 &) { return args._Ksize<=64 && !args._indirect_input; },
+    nullptr,
+    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<cls_sve_smallK_hybrid_u8u32_dot_8x1VL, uint8_t, uint8_t>(args, qp); }
+},
+#ifdef SVE2 // Requantizing kernels include some SVE2-only instructions (SQRDMULH, SRSHL)
+{
+    GemmMethod::GEMM_HYBRID, 
+    "sve_hybrid_u8qa_dot_4x4VL",
+    [](const GemmArgs &args, const Requantize32 &qp) { return quant_hybrid_asymmetric(qp); },
+    nullptr,
+    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_u8qa_dot_4x4VL, uint8_t, uint8_t, Requantize32>(args, qp); }
 },
 #endif
 {
-    GemmMethod::GEMM_HYBRID_QUANTIZED,
-    "smallK_hybrid_u8u32_dot_4x8",
-    [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32); },
+    GemmMethod::GEMM_HYBRID, 
+    "sve_hybrid_u8u32_dot_6x4VL",
     nullptr,
-    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<smallK_hybrid_u8u32_dot_4x8, uint8_t, uint8_t>(args, qp); }
+    nullptr,
+    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_u8u32_dot_6x4VL, uint8_t, uint8_t, Requantize32, true>(args, qp); }
+},
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "sve_interleaved_u8u32_dot_8x3VL",
+    [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>4); },
+    nullptr,
+    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_u8u32_dot_8x3VL, uint8_t, uint8_t>(args, qp); }
 },
+#endif
+#ifdef MMLA_INT8
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "a64_interleaved_u8u32_mmla_8x12",
+    [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>8); },
+    nullptr,
+    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_interleaved_u8u32_mmla_8x12, uint8_t, uint8_t>(args, qp); }
+},
+#endif
 {
     GemmMethod::GEMM_HYBRID_QUANTIZED,
-    "smallK_hybrid_u8u32_dot_4x6",
-    [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64); },
+    "a64_smallK_hybrid_u8u32_dot_8x4",
+    [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32) && !args._indirect_input; },
     nullptr,
-    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<smallK_hybrid_u8u32_dot_4x6, uint8_t, uint8_t>(args, qp); }
+    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<cls_a64_smallK_hybrid_u8u32_dot_8x4, uint8_t, uint8_t>(args, qp); }
 },
 {
     GemmMethod::GEMM_HYBRID_QUANTIZED,
-    "hybrid_u8u32_dot_16x4",
-    [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && args._Ksize>=16; },
-    [](const GemmArgs &args, const Requantize32 &) { return ((args._Nsize<=256) && (args._Ksize>128)) || (args._maxthreads >= 8); },
-    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<hybrid_u8u32_dot_16x4, uint8_t, uint8_t>(args, qp); }
+    "a64_smallK_hybrid_u8u32_dot_6x4",
+    [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64) && !args._indirect_input; },
+    nullptr,
+    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<cls_a64_smallK_hybrid_u8u32_dot_6x4, uint8_t, uint8_t>(args, qp); }
 },
-/** QUANTIZE_WRAPPER_2D enables 2D parallelisation hint for IScheduler in NEGEMMAssemblyDispatch */
 {
-    GemmMethod::QUANTIZE_WRAPPER_2D,
-    "quantized_wrapper_2d",
+    GemmMethod::GEMM_INTERLEAVED,
+    "a64_gemm_u16_8x12",
     nullptr,
-    [](const GemmArgs &args, const Requantize32 &) { return (args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8);},
-    [](const GemmArgs &args, const Requantize32 &qp) { return new QuantizeWrapper<uint8_t, uint8_t, uint32_t>(args, qp); }
+    [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() == CPUModel::A53; },
+    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_u16_8x12, uint8_t, uint8_t>(args, qp); },
+},
+{
+    GemmMethod::GEMM_HYBRID,
+    "a64_hybrid_u8qa_dot_4x16",
+    [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_dotprod() && quant_hybrid_asymmetric(qp); },
+    [](const GemmArgs &args, const Requantize32 &) { return args._Nsize<=256 && args._Ksize>128; },
+    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_u8qa_dot_4x16, uint8_t, uint8_t, Requantize32>(args, qp); }
+},
+{
+    GemmMethod::GEMM_HYBRID,
+    "a64_hybrid_u8u32_dot_6x16",
+    [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); },
+    [](const GemmArgs &args, const Requantize32 &) { return args._Nsize<=256 && args._Ksize>128; },
+    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_u8u32_dot_6x16, uint8_t, uint8_t, Requantize32, true>(args, qp); }
+},
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "a64_gemm_u8_8x12",
+    [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); },
+    nullptr,
+    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_u8_8x12, uint8_t, uint8_t>(args, qp); }
+},
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "a64_gemm_u8_4x4",
+    nullptr,
+    nullptr,
+    [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_u8_4x4, uint8_t, uint8_t>(args, qp); }
 },
 {
     GemmMethod::QUANTIZE_WRAPPER,
     "quantized_wrapper",
-    nullptr,
+    [](const GemmArgs &args, const Requantize32 &) { return !args._indirect_input; },
     nullptr,
     [](const GemmArgs &args, const Requantize32 &qp) { return new QuantizeWrapper<uint8_t, uint8_t, uint32_t>(args, qp); }
 },
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
index 5e06443e19..10a35e7a11 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
@@ -28,17 +28,17 @@
 #include "gemm_implementation.hpp"
 #include "gemm_interleaved.hpp"
 
-#include "kernels/a64_gemm_u16_12x8.hpp"
+#include "kernels/a64_gemm_u16_8x12.hpp"
 
 namespace arm_gemm {
 
 static const GemmImplementation<uint16_t, uint32_t> gemm_u16_methods[] = {
 {
     GemmMethod::GEMM_INTERLEAVED,
-    "gemm_u16_12x8",
+    "a64_gemm_u16_8x12",
     nullptr,
     nullptr,
-    [](const GemmArgs &args) { return new GemmInterleaved<gemm_u16_12x8, uint16_t, uint32_t>(args); }
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_u16_8x12, uint16_t, uint32_t>(args); }
 },
 {
     GemmMethod::DEFAULT,
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
index 06e68cbc43..c300b8cdf9 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
@@ -29,18 +29,20 @@
 #include "gemm_interleaved.hpp"
 #include "gemm_interleaved_pretransposed_2d.hpp"
 #include "gemm_hybrid.hpp"
+#include "gemm_hybrid_indirect.hpp"
 
-#include "kernels/a64_gemm_u16_12x8.hpp"
-#include "kernels/a64_gemm_u8_12x8.hpp"
+#include "kernels/a64_gemm_u16_8x12.hpp"
 #include "kernels/a64_gemm_u8_4x4.hpp"
-#include "kernels/a64_hybrid_u8u32_dot_16x4.hpp"
-#include "kernels/a64_interleaved_u8u32_mmla_12x8.hpp"
-#include "kernels/a64_smallK_hybrid_u8u32_dot_4x6.hpp"
-#include "kernels/a64_smallK_hybrid_u8u32_dot_4x8.hpp"
-#include "kernels/sve_hybrid_u8u32_dot_4VLx4.hpp"
-#include "kernels/sve_interleaved_u8u32_dot_3VLx8.hpp"
-#include "kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp"
-#include "kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp"
+#include "kernels/a64_gemm_u8_8x12.hpp"
+#include "kernels/a64_hybrid_u8u32_dot_6x16.hpp"
+#include "kernels/a64_interleaved_u8u32_mmla_8x12.hpp"
+#include "kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp"
+#include "kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp"
+
+#include "kernels/sve_hybrid_u8u32_dot_6x4VL.hpp"
+#include "kernels/sve_interleaved_u8u32_dot_8x3VL.hpp"
+#include "kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp"
+#include "kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp"
 
 namespace arm_gemm {
 
@@ -49,106 +51,84 @@ static const GemmImplementation<uint8_t, uint32_t> gemm_u8_methods[] = {
 #ifdef MMLA_INT8
 {
     GemmMethod::GEMM_INTERLEAVED,
-    "interleaved_u8u32_mmla_3VLx8",
+    "sve_interleaved_u8u32_mmla_8x3VL",
     [](const GemmArgs &args) { return (args._Ksize>8); },
     nullptr,
-    [](const GemmArgs &args) { return new GemmInterleaved<interleaved_u8u32_mmla_3VLx8, uint8_t, uint32_t>(args); }
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_u8u32_mmla_8x3VL, uint8_t, uint32_t>(args); }
 },
 #endif
 {
     GemmMethod::GEMM_HYBRID,
-    "smallK_hybrid_u8u32_dot_1VLx8",
-    [](const GemmArgs &args) { return args._Ksize<=64; },
+    "smallK_hybrid_u8u32_dot_8x1VL",
+    [](const GemmArgs &args) { return args._Ksize<=64 && !args._indirect_input; },
     nullptr,
-    [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_u8u32_dot_1VLx8, uint8_t, uint32_t>(args); }
+    [](const GemmArgs &args) { return new GemmHybrid<cls_sve_smallK_hybrid_u8u32_dot_8x1VL, uint8_t, uint32_t>(args); }
 },
 {
     GemmMethod::GEMM_HYBRID,
-    "hybrid_u8u32_dot_4VLx4",
-    [](const GemmArgs &args) { return args._Ksize>=16; },
+    "sve_hybrid_u8u32_dot_6x4VL",
+    nullptr,
     [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
-    [](const GemmArgs &args) { return new GemmHybrid<hybrid_u8u32_dot_4VLx4, uint8_t, uint32_t>(args); }
+    [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_u8u32_dot_6x4VL, uint8_t, uint32_t>(args); }
 },
 {
     GemmMethod::GEMM_INTERLEAVED,
-    "interleaved_u8u32_dot_3VLx8",
+    "sve_interleaved_u8u32_dot_8x3VL",
     [](const GemmArgs &args) { return (args._Ksize>4); },
     nullptr,
-    [](const GemmArgs &args) { return new GemmInterleaved<interleaved_u8u32_dot_3VLx8, uint8_t, uint32_t>(args); }
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_u8u32_dot_8x3VL, uint8_t, uint32_t>(args); }
 },
 #endif
 #ifdef MMLA_INT8
 {
     GemmMethod::GEMM_INTERLEAVED,
-    "interleaved_u8u32_mmla_12x8",
+    "a64_interleaved_u8u32_mmla_8x12",
     [](const GemmArgs &args) { return (args._Ksize>8); },
     nullptr,
-    [](const GemmArgs &args) { return new GemmInterleaved<interleaved_u8u32_mmla_12x8, uint8_t, uint32_t>(args); }
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_u8u32_mmla_8x12, uint8_t, uint32_t>(args); }
 },
 #endif
 {
     GemmMethod::GEMM_HYBRID,
-    "smallK_hybrid_u8u32_dot_4x8",
-    [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32); },
+    "a64_smallK_hybrid_u8u32_dot_8x4",
+    [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32) && !args._indirect_input; },
     nullptr,
-    [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_u8u32_dot_4x8, uint8_t, uint32_t>(args); }
+    [](const GemmArgs &args) { return new GemmHybrid<cls_a64_smallK_hybrid_u8u32_dot_8x4, uint8_t, uint32_t>(args); }
 },
 {
     GemmMethod::GEMM_HYBRID,
-    "smallK_hybrid_u8u32_dot_4x6",
-    [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64); },
+    "a64_smallK_hybrid_u8u32_dot_6x4",
+    [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64) && !args._indirect_input; },
     nullptr,
-    [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_u8u32_dot_4x6, uint8_t, uint32_t>(args); }
-},
-{
-    GemmMethod::GEMM_HYBRID,
-    "hybrid_u8u32_dot_16x4",
-    [](const GemmArgs &args) { return args._ci->has_dotprod() && args._Ksize>=16; },
-    [](const GemmArgs &args) { return args._Nsize<=256 && args._Ksize>128; },
-    [](const GemmArgs &args) { return new GemmHybrid<hybrid_u8u32_dot_16x4, uint8_t, uint32_t>(args); }
-},
-{
-    GemmMethod::GEMM_INTERLEAVED_2D,
-    "gemm_u8_12x8_2d",
-    [](const GemmArgs &args) { return args._ci->has_dotprod(); },
-    [](const GemmArgs &args) { return (args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8) ; },
-    [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<gemm_u8_12x8, uint8_t, uint32_t>(args); }
+    [](const GemmArgs &args) { return new GemmHybrid<cls_a64_smallK_hybrid_u8u32_dot_6x4, uint8_t, uint32_t>(args); }
 },
 {
     GemmMethod::GEMM_INTERLEAVED,
-    "gemm_u8_12x8_1d",
-    [](const GemmArgs &args) { return args._ci->has_dotprod(); },
+    "a64_gemm_u16_8x12",
     nullptr,
-    [](const GemmArgs &args) { return new GemmInterleaved<gemm_u8_12x8, uint8_t, uint32_t>(args); }
+    [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53; },
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_u16_8x12, uint8_t, uint32_t>(args); },
 },
 {
-    GemmMethod::GEMM_INTERLEAVED_2D,
-    "gemm_u16_12x8_2d",
-    nullptr,
-    [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53 && args._Msize > 4 && (args._Msize / args._maxthreads) < 8; },
-    [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<gemm_u16_12x8, uint8_t, uint32_t>(args); },
+    GemmMethod::GEMM_HYBRID,
+    "a64_hybrid_u8u32_dot_6x16",
+    [](const GemmArgs &args) { return args._ci->has_dotprod(); },
+    [](const GemmArgs &args) { return args._Nsize<=256 && args._Ksize>128; },
+    [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_u8u32_dot_6x16, uint8_t, uint32_t>(args); }
 },
 {
     GemmMethod::GEMM_INTERLEAVED,
-    "gemm_u16_12x8_1d",
-    nullptr,
-    [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53 && args._Msize > 4; },
-    [](const GemmArgs &args) { return new GemmInterleaved<gemm_u16_12x8, uint8_t, uint32_t>(args); },
-},
-{
-    GemmMethod::GEMM_INTERLEAVED_2D,
-    "gemm_u8_4x4_2d",
+    "a64_gemm_u8_8x12",
+    [](const GemmArgs &args) { return args._ci->has_dotprod(); },
     nullptr,
-    [](const GemmArgs &args) { return ((args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8)) ||
-                                       ((args._Msize / args._maxthreads) < 4); },
-    [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<gemm_u8_4x4, uint8_t, uint32_t>(args); }
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_u8_8x12, uint8_t, uint32_t>(args); }
 },
 {
     GemmMethod::GEMM_INTERLEAVED,
-    "gemm_u8_4x4_1d",
+    "a64_gemm_u8_4x4",
     nullptr,
     nullptr,
-    [](const GemmArgs &args) { return new GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t>(args); }
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_u8_4x4, uint8_t, uint32_t>(args); }
 },
 {
     GemmMethod::DEFAULT,
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
index 47909cdaeb..9de44fcb73 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
@@ -46,46 +46,39 @@ class GemvPretransposed : public GemmCommon<To, Tr> {
     typedef typename strategy::operand_type Toi;
     typedef typename strategy::result_type Tri;
 
-    const unsigned int _Nsize;
-    const unsigned int _Ksize;
-
-    const unsigned int _nmultis;
-
-    const Activation _act;
-
-    const CPUInfo * const _ci;
+    const GemmArgs     _args;
 
     const unsigned int _buffer_per_multi;
 
-    unsigned int m_block=0;
+    unsigned int k_block=0;
     unsigned int n_block=0;
 
-    const Toi *_A_pretransposed = nullptr;
+    const Toi *_B_pretransposed = nullptr;
 
 public:
     GemvPretransposed(GemvPretransposed &) = delete;
     GemvPretransposed & operator= (GemvPretransposed &) = delete;
 
     GemvPretransposed(const GemmArgs &args)
-                      : _Nsize(args._Nsize), _Ksize(args._Ksize), _nmultis(args._nmulti), _act(args._act), _ci(args._ci),
-                        _buffer_per_multi(_Ksize * iceildiv(_Nsize, strategy::A_interleave()) * strategy::A_interleave()) {
+                      : _args(args),
+                        _buffer_per_multi(args._Ksize * roundup(args._Nsize, strategy::out_width())) {
         /* For now don't do any blocking. TODO: figure out if we should. */
-        if (args._cfg && args._cfg->inner_block_size) {
-            m_block = args._cfg->inner_block_size;
+        if (strategy::supports_accumulate() && args._cfg && args._cfg->inner_block_size) {
+            k_block = args._cfg->inner_block_size;
         } else {
-            m_block = _Ksize;
+            k_block = args._Ksize;
         }
 
         if (args._cfg && args._cfg->outer_block_size) {
             n_block = args._cfg->outer_block_size;
         } else {
-            n_block = _Nsize;
+            n_block = args._Nsize;
         }
     }
 
     // Window is number of out_width blocks, times number of multis.
     ndrange_t get_window_size() const override {
-        return { iceildiv(_Nsize, strategy::out_width()) * _nmultis };
+        return { iceildiv(_args._Nsize, strategy::out_width()) * _args._nmulti };
     }
 
     // Actually execute the GEMV.
@@ -93,13 +86,13 @@ public:
 #ifdef CYCLE_PROFILING
         profiler prof;
 #endif
-        strategy strat(_ci);
+        strategy strat(_args._ci);
 
         const auto start = work_range.get_position(0);
         const auto end   = work_range.get_position_end(0);
 
         /* Break the window values down into multis of interest... */
-        const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width());
+        const unsigned int window_per_multi = iceildiv(_args._Nsize, strategy::out_width());
         const unsigned int multi_0    = start / window_per_multi;
         const unsigned int multi_end  = end   / window_per_multi;
 
@@ -111,36 +104,25 @@ public:
 
         for (unsigned int multi=multi_0; multi<=multi_end; multi++) {
             const unsigned int n_start = (multi==multi_0) ? n_0 : 0;
-            const unsigned int n_end = (multi==multi_end) ? n_max : _Nsize;
+            const unsigned int n_end = (multi==multi_end) ? n_max : _args._Nsize;
 
             if (n_end <= n_start)
                 continue;
 
-            for (unsigned int m0=0; m0<_Ksize; m0+=m_block) {
-                unsigned int mmax = std::min(m0 + m_block, _Ksize);
+            for (unsigned int k0=0; k0<_args._Ksize; k0+=k_block) {
+                unsigned int kmax = std::min(k0 + k_block, _args._Ksize);
 
                 for (unsigned int n=n_start; n<n_end; n+=n_block) {
                     unsigned int nmax = std::min(n + n_block, n_end);
 #ifdef CYCLE_PROFILING
-                    auto p = prof.ScopedProfiler(PROFILE_KERNEL, (mmax-m0) * (nmax-n));
+                    auto p = prof.ScopedProfiler(PROFILE_KERNEL, (kmax-k0) * (nmax-n));
 #endif
-                    /* This assumes that the underlying call was a GEMM with M=1; for the N=1 case we would have to pick up this->_Bptr below instead */
-                    strat.kernel(_A_pretransposed + (multi * _buffer_per_multi) + (n * _Ksize) + (m0 * strategy::A_interleave()),
-                                 (_Ksize * strategy::A_interleave()),
-                                 this->_Aptr + (multi * this->_A_multi_stride) + m0,
+                    strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + k0,
+                                 _B_pretransposed + (multi * _buffer_per_multi) + (n * roundup(_args._Ksize, strategy::k_unroll())) + (k0 * strategy::out_width()),
                                  this->_Cptr + (multi * this->_C_multi_stride) + n,
-                                 static_cast<Tr>(0), (mmax-m0), (nmax-n));
-
-                    // Handle activation separately for now
-                    if (this->_bias) {
-                        activator<true>(this->_Cptr + (multi * this->_C_multi_stride) + n, 0,
-                                        this->_bias + (multi * this->_bias_multi_stride) + n,
-                                        _act, 1, (nmax-n));
-                    } else {
-                        activator<false>(this->_Cptr + (multi * this->_C_multi_stride) + n, 0,
-                                         static_cast<const Tr *>(nullptr),
-                                         _act, 1, (nmax-n));
-                    }
+                                 (nmax - n), (kmax-k0),
+                                 this->_bias ? this->_bias + (multi * this->_bias_multi_stride) + n : nullptr,
+                                 _args._act, (k0 != 0));
                 }
             }
         }
@@ -152,33 +134,27 @@ public:
     }
 
     bool B_pretranspose_required() const override {
-        /* Transpose is required if _A_pretransposed is still nullptr */
-        return (_A_pretransposed == nullptr);
+        /* Pretransposition is still required while no buffer has been installed, i.e. while _B_pretransposed is nullptr */
+        return (_B_pretransposed == nullptr);
     }
 
     size_t get_B_pretransposed_array_size() const override {
-        return _buffer_per_multi * _nmultis * sizeof(To);
+        return _buffer_per_multi * _args._nmulti * sizeof(To); /* one _buffer_per_multi region per multi; NOTE(review): pretranspose_B_array indexes the buffer as Toi - confirm sizeof(To) == sizeof(Toi) */
     }
 
     void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override {
-        Toi *A_buffer = reinterpret_cast<Toi *>(buffer);
-
-        for (unsigned int multi=0; multi<_nmultis; multi++) {
-            /* Reverse sense here as we are dealing with B rather than A.  So if
-             * strategy::A_transpose is false and _trB is false, we still
-             * transpose.  */
-            if (strategy::A_transpose()) {
-                Transform<strategy::A_interleave(), strategy::A_block(), false>(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize);
-            } else {
-                Transform<strategy::A_interleave(), strategy::A_block(), true>(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize);
-            }
+        Toi *B_buffer = reinterpret_cast<Toi *>(buffer); /* caller-provided destination, addressed in units of Toi */
+        strategy strat(_args._ci); /* the strategy object supplies the PrepareB transform used below */
+
+        for (unsigned int multi=0; multi<_args._nmulti; multi++) { /* each multi gets its own _buffer_per_multi-sized slice of the destination */
+            strat.transforms.PrepareB(B_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _args._Nsize, 0, _args._Ksize); /* presumably covers the whole 0.._Nsize by 0.._Ksize extent - confirm against PrepareB */
         }
 
-        _A_pretransposed = A_buffer;
+        _B_pretransposed = B_buffer; /* remember the buffer; B_pretranspose_required() tests this pointer */
     }
 
     void set_pretransposed_B_data(void *buffer) override {
-        _A_pretransposed = reinterpret_cast<Toi *>(buffer);
+        _B_pretransposed = reinterpret_cast<Toi *>(buffer); /* adopts the buffer as-is; no transform is run here */
     }
 };
 
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a32_interleave6_block1_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a32_interleave6_block1_fp32_fp32.hpp
new file mode 100644
index 0000000000..807511f0d2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a32_interleave6_block1_fp32_fp32.hpp
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2017-2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __arm__
+
+#include <arm_neon.h>
+
+#include "../asmlib.hpp"
+// Interleave up to 6 rows of floats: for every column the 6 row values are written consecutively to outptr, which is left pointing just past the output.
+template<>
+void interleave_block<6, 1, VLType::None, false>(
+    float * &outptr, const float * const * in, size_t width, size_t height,
+    size_t row_offset, bool
+)
+{
+    const float *inptr0 = in[0] + row_offset;  // all six entries of in[] are read here, even when height < 6
+    const float *inptr1 = in[1] + row_offset;
+    const float *inptr2 = in[2] + row_offset;
+    const float *inptr3 = in[3] + row_offset;
+    const float *inptr4 = in[4] + row_offset;
+    const float *inptr5 = in[5] + row_offset;
+
+    // Cope with ragged cases by aliasing the first row (which is always valid).
+    // The nonsense output produced will be suppressed later anyway.
+    switch (height) {
+        case 1:
+            inptr1 = inptr0;
+            // fall through
+        case 2:
+            inptr2 = inptr0;
+            // fall through
+        case 3:
+            inptr3 = inptr0;
+            // fall through
+        case 4:
+            inptr4 = inptr0;
+            // fall through
+        case 5:
+            inptr5 = inptr0;
+            // fall through
+        default:
+        case 6:
+            break;
+    }
+
+    //prefetch_2x(inptr0);
+    //prefetch_2x(inptr1);
+    //prefetch_2x(inptr2);
+    //prefetch_2x(inptr3);
+    //prefetch_2x(inptr4);
+    //prefetch_2x(inptr5);
+
+    for (;width>7;width-=8) {  // vector path: 8 columns per iteration
+        __asm __volatile (
+            // Load up 8 elements (2 vectors) from each of 6 sources.
+            "VLD1.32	{d0-d3}, [%[inptr0]]!\n"   // q0=A0A1A2A3, q1=A4A5A6A7
+            "VLD1.32	{d4-d7}, [%[inptr1]]!\n"   // q2=B0B1B2B3, q3=B4B5B6B7
+            "VLD1.32	{d8-d11}, [%[inptr2]]!\n"  // q4=C0C1C2C3, q5=C4C5C6C7
+            "VZIP.32	q0, q4\n"     // q0=A0C0A1C1, q4 = A2C2A3C3
+            "VLD1.32	{d12-d15}, [%[inptr3]]!\n" // q6=D0D1D2D3, q7=D4D5D6D7
+            "VZIP.32	q2, q6\n"     // q2=B0D0B1D1, q6 = B2D2B3D3
+            "VLD1.32	{d16-d19}, [%[inptr4]]!\n" // q8=E0E1E2E3, q9=E4E5E6E7
+            "VLD1.32	{d20-d23}, [%[inptr5]]!\n" // q10=F0F1F2F3, q11=F4F5F6F7
+            "VZIP.32	q8, q10\n"    // q8=E0F0E1F1, q10 = E2F2E3F3
+            ASM_PREFETCH("[%[inptr0], #128]")
+            "VZIP.32	q0, q2\n"    // q0 = A0B0C0D0, q2 = A1B1C1D1
+
+            // Store first elements
+            "VST1.32	{d0-d1}, [%[outptr]]!\n"  // A0B0C0D0
+            "VST1.32	{d16}, [%[outptr]]!\n"    // E0F0
+
+            "VZIP.32	q4, q6\n"    // q4 = A2B2C2D2, q6 = A3B3C3D3
+
+            // Store second elements
+            "VST1.32	{d4-d5}, [%[outptr]]!\n"
+            "VZIP.32	q1, q5\n"    // q1 = A4C4A5C5, q5 = A6C6A7C7
+            ASM_PREFETCH("[%[inptr1], #128]")
+            "VST1.32	{d17}, [%[outptr]]!\n"
+            "VZIP.32	q3, q7\n"    // q3 = B4D4B5D5, q7 = B6D6B7D7
+
+            // Store third elements
+            "VZIP.32	q9, q11\n"   // q9 = E4F4E5F5, q11 = E6F6E7F7
+            "VST1.32	{d8-d9}, [%[outptr]]!\n"
+            "VZIP.32	q1, q3\n"    // q1 = A4B4C4D4, q3 = A5B5C5D5
+            ASM_PREFETCH("[%[inptr2], #128]")
+            "VST1.32	{d20}, [%[outptr]]!\n"
+
+            // Store fourth elements
+            "VZIP.32	q5, q7\n"    // q5 = A6B6C6D6, q7 = A7B7C7D7
+            "VST1.32	{d12-d13}, [%[outptr]]!\n"
+            ASM_PREFETCH("[%[inptr3], #128]")
+            "VST1.32	{d21}, [%[outptr]]!\n"
+
+            // Fifth
+            "VST1.32	{d2-d3}, [%[outptr]]!\n"
+            ASM_PREFETCH("[%[inptr4], #128]")
+            "VST1.32	{d18}, [%[outptr]]!\n"
+
+            // Sixth
+            "VST1.32	{d6-d7}, [%[outptr]]!\n"
+            ASM_PREFETCH("[%[inptr5], #128]")
+            "VST1.32	{d19}, [%[outptr]]!\n"
+
+            // Seventh
+            "VST1.32	{d10-d11}, [%[outptr]]!\n"
+            "VST1.32	{d22}, [%[outptr]]!\n"
+
+            // Eighth
+            "VST1.32	{d14-d15}, [%[outptr]]!\n"
+            "VST1.32	{d23}, [%[outptr]]!\n"
+
+            : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
+              [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [outptr] "+r" (outptr)
+            :
+            : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "memory"  // NOTE(review): q12 is listed but not referenced by the asm above
+        );
+    }
+
+    for (;width>0;width--) {  // scalar tail: remaining columns, one per iteration
+        *outptr++ = *inptr0++;
+        *outptr++ = *inptr1++;
+        *outptr++ = *inptr2++;
+        *outptr++ = *inptr3++;
+        *outptr++ = *inptr4++;
+        *outptr++ = *inptr5++;
+    }
+}
+
+#endif  // __arm__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp
new file mode 100644
index 0000000000..8054c2b96b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+// Interleave up to 4 rows of int8 data in 16-byte blocks: each 64-byte output block holds 16 bytes of row 0, then of rows 1, 2 and 3; out_ptr is left pointing just past the output.
+template<>
+void interleave_block<4, 16, VLType::None, false>(
+  int8_t * &out_ptr, const int8_t * const * in, size_t width, size_t height,
+  size_t row_offset, bool
+)
+{
+  __asm__ __volatile__(
+      "ldr x22, [%x[in], #0x0]\n"  // x22 = in[0]
+      "cmp %x[height], #0x4\n"  // do we have all 4 rows?
+      "ldr x21, [%x[in], #0x8]\n"  // x21 = in[1]
+      "add x22, x22, %x[row_offset]\n"  // row_offset is applied to every row pointer
+      "ldr x20, [%x[in], #0x10]\n"  // x20 = in[2]
+      "ldr x19, [%x[in], #0x18]\n"  // x19 = in[3]
+      "add x21, x21, %x[row_offset]\n"
+      "add x20, x20, %x[row_offset]\n"
+      "add x19, x19, %x[row_offset]\n"
+      "beq 1f\n"  // height == 4: keep all row pointers
+      "mov x19, x22\n"  // otherwise row 3 aliases row 0
+      "cmp %x[height], #0x2\n"
+      "csel x21, x21, x22, GE\n"  // row 1 is kept only if height >= 2, else aliases row 0
+      "csel x20, x20, x22, GT\n"  // row 2 is kept only if height > 2, else aliases row 0
+      "1:"  // no_pointer_adj
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "cmp %x[width], #0x10\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x19, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "prfm pldl1keep, [x19, #0x40]\n"
+      "blt 3f\n"  // fewer than 16 columns: go straight to the tail
+      "2:"  // Main loop head
+      "ldr q19, [x22], #0x10\n"  // 16 bytes from each row (q19..q16 = rows 0..3)
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "ldr q18, [x21], #0x10\n"
+      "ldr q17, [x20], #0x10\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "ldr q16, [x19], #0x10\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "str q19, [%x[out_ptr], #0x0]\n"  // rows are stored back to back, 64 bytes per block
+      "str q18, [%x[out_ptr], #0x10]\n"
+      "prfm pldl1keep, [x19, #0x70]\n"
+      "str q17, [%x[out_ptr], #0x20]\n"
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "subs %x[width], %x[width], #0x10\n"
+      "cmp %x[width], #0x10\n"
+      "add %x[out_ptr], %x[out_ptr], #0x40\n"
+      "bge 2b\n"  // loop while at least 16 columns remain
+      "3:"  // Main loop skip
+      "cbz %x[width], 12f\n"  // nothing left over
+      "tbz %x[width], #3, 7f\n"  // tail: gather the last 1-15 bytes per row, driven by bits 3..0 of width
+      "ldr d19, [x22], #0x8\n"
+      "ldr d18, [x21], #0x8\n"
+      "ldr d17, [x20], #0x8\n"
+      "ldr d16, [x19], #0x8\n"
+      "tbz %x[width], #2, 5f\n"
+      "ld1 { v19.s }[2], [x22], #0x4\n"
+      "ld1 { v18.s }[2], [x21], #0x4\n"
+      "ld1 { v17.s }[2], [x20], #0x4\n"
+      "ld1 { v16.s }[2], [x19], #0x4\n"
+      "tbz %x[width], #1, 4f\n"
+      "ld1 { v19.h }[6], [x22], #0x2\n"
+      "ld1 { v18.h }[6], [x21], #0x2\n"
+      "ld1 { v17.h }[6], [x20], #0x2\n"
+      "ld1 { v16.h }[6], [x19], #0x2\n"
+      "tbz %x[width], #0, 11f\n"
+      "ld1 { v19.b }[14], [x22]\n"
+      "ld1 { v18.b }[14], [x21]\n"
+      "ld1 { v17.b }[14], [x20]\n"
+      "ld1 { v16.b }[14], [x19]\n"
+      "b 11f\n"
+      "4:"  // odd_loads_1_12
+      "tbz %x[width], #0, 11f\n"
+      "ld1 { v19.b }[12], [x22]\n"
+      "ld1 { v18.b }[12], [x21]\n"
+      "ld1 { v17.b }[12], [x20]\n"
+      "ld1 { v16.b }[12], [x19]\n"
+      "b 11f\n"
+      "5:"  // odd_loads_2_8
+      "tbz %x[width], #1, 6f\n"
+      "ld1 { v19.h }[4], [x22], #0x2\n"
+      "ld1 { v18.h }[4], [x21], #0x2\n"
+      "ld1 { v17.h }[4], [x20], #0x2\n"
+      "ld1 { v16.h }[4], [x19], #0x2\n"
+      "tbz %x[width], #0, 11f\n"
+      "ld1 { v19.b }[10], [x22]\n"
+      "ld1 { v18.b }[10], [x21]\n"
+      "ld1 { v17.b }[10], [x20]\n"
+      "ld1 { v16.b }[10], [x19]\n"
+      "b 11f\n"
+      "6:"  // odd_loads_1_8
+      "tbz %x[width], #0, 11f\n"
+      "ld1 { v19.b }[8], [x22]\n"
+      "ld1 { v18.b }[8], [x21]\n"
+      "ld1 { v17.b }[8], [x20]\n"
+      "ld1 { v16.b }[8], [x19]\n"
+      "b 11f\n"
+      "7:"  // odd_loads_4_0
+      "tbz %x[width], #2, 9f\n"
+      "ldr s19, [x22], #0x4\n"
+      "ldr s18, [x21], #0x4\n"
+      "ldr s17, [x20], #0x4\n"
+      "ldr s16, [x19], #0x4\n"
+      "tbz %x[width], #1, 8f\n"
+      "ld1 { v19.h }[2], [x22], #0x2\n"
+      "ld1 { v18.h }[2], [x21], #0x2\n"
+      "ld1 { v17.h }[2], [x20], #0x2\n"
+      "ld1 { v16.h }[2], [x19], #0x2\n"
+      "tbz %x[width], #0, 11f\n"
+      "ld1 { v19.b }[6], [x22]\n"
+      "ld1 { v18.b }[6], [x21]\n"
+      "ld1 { v17.b }[6], [x20]\n"
+      "ld1 { v16.b }[6], [x19]\n"
+      "b 11f\n"
+      "8:"  // odd_loads_1_4
+      "tbz %x[width], #0, 11f\n"
+      "ld1 { v19.b }[4], [x22]\n"
+      "ld1 { v18.b }[4], [x21]\n"
+      "ld1 { v17.b }[4], [x20]\n"
+      "ld1 { v16.b }[4], [x19]\n"
+      "b 11f\n"
+      "9:"  // odd_loads_2_0
+      "tbz %x[width], #1, 10f\n"
+      "ldr h19, [x22], #0x2\n"
+      "ldr h18, [x21], #0x2\n"
+      "ldr h17, [x20], #0x2\n"
+      "ldr h16, [x19], #0x2\n"
+      "tbz %x[width], #0, 11f\n"
+      "ld1 { v19.b }[2], [x22]\n"
+      "ld1 { v18.b }[2], [x21]\n"
+      "ld1 { v17.b }[2], [x20]\n"
+      "ld1 { v16.b }[2], [x19]\n"
+      "b 11f\n"
+      "10:"  // odd_loads_1_0
+      "ldr b19, [x22, #0x0]\n"
+      "ldr b18, [x21, #0x0]\n"
+      "ldr b17, [x20, #0x0]\n"
+      "ldr b16, [x19, #0x0]\n"
+      "11:"  // Odd load end
+      "str q19, [%x[out_ptr], #0x0]\n"  // the tail block is stored at full size; bytes not loaded above were zeroed by the initial scalar ldr
+      "str q18, [%x[out_ptr], #0x10]\n"
+      "str q17, [%x[out_ptr], #0x20]\n"
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x40\n"
+      "12:"  // Odds skip
+
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+      : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v16", "v17", "v18", "v19", "x19", "x20", "x21", "x22"
+    );
+}
+
+// uint8_t overload: the non-summing interleave only moves bytes, so it forwards to the int8_t specialization.
+template<>
+void interleave_block<4, 16, VLType::None, false>(
+  uint8_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height,
+  size_t row_offset, bool
+)
+{
+  int8_t *out_cast = reinterpret_cast<int8_t *>(out_ptr);  // convert by value: binding an int8_t*& to a uint8_t* object would violate strict aliasing
+  const int8_t * const * in_cast = reinterpret_cast<const int8_t * const *>(in);
+  interleave_block<4, 16, VLType::None, false>(out_cast, in_cast, width, height, row_offset, false);
+  out_ptr = reinterpret_cast<uint8_t *>(out_cast);  // hand the advanced output position back to the caller
+}
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp
new file mode 100644
index 0000000000..1650916f9f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+// Interleave up to 4 rows of int8 data in 16-byte blocks and append one int32 sum per row after the data; when first is false, the 4 sums stored just before out_ptr are folded in and overwritten.
+template<>
+void interleave_block<4, 16, VLType::None, true>(
+  int8_t * &out_ptr, const int8_t * const * in, size_t width, size_t height,
+  size_t row_offset, bool first
+)
+{
+  __asm__ __volatile__(
+      "movi v28.8h, #0x0\n"  // v28..v25: 16-bit pairwise-sum accumulators for rows 0..3
+      "ldr x23, [%x[in], #0x0]\n"  // x23 = in[0]
+      "mov x22, #0x0\n"  // x22 counts main-loop blocks since the 16-bit accumulators were last flushed
+      "movi v27.8h, #0x0\n"
+      "ldr x21, [%x[in], #0x8]\n"  // x21 = in[1]
+      "cmp %x[height], #0x4\n"  // do we have all 4 rows?
+      "movi v26.8h, #0x0\n"
+      "ldr x20, [%x[in], #0x10]\n"  // x20 = in[2]
+      "add x23, x23, %x[row_offset]\n"  // row_offset is applied to every row pointer
+      "movi v25.8h, #0x0\n"
+      "ldr x19, [%x[in], #0x18]\n"  // x19 = in[3]
+      "movi v24.4s, #0x0\n"  // v24..v21: 32-bit sum accumulators for rows 0..3
+      "add x21, x21, %x[row_offset]\n"
+      "movi v23.4s, #0x0\n"
+      "add x20, x20, %x[row_offset]\n"
+      "movi v22.4s, #0x0\n"
+      "add x19, x19, %x[row_offset]\n"
+      "movi v21.4s, #0x0\n"
+      "beq 1f\n"  // height == 4: keep all row pointers
+      "mov x19, x23\n"  // otherwise row 3 aliases row 0
+      "cmp %x[height], #0x2\n"
+      "csel x21, x21, x23, GE\n"  // row 1 is kept only if height >= 2, else aliases row 0
+      "csel x20, x20, x23, GT\n"  // row 2 is kept only if height > 2, else aliases row 0
+      "1:"  // no_pointer_adj
+      "movi v20.4s, #0x0\n"  // v20 = sums carried in from earlier data (stays zero on the first pass)
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x19, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "prfm pldl1keep, [x19, #0x40]\n"
+      "cbnz %w[first], 2f\n"  // first pass: nothing to carry over
+      "sub %x[out_ptr], %x[out_ptr], #0x10\n"  // step back over the 4 x int32 sums that precede out_ptr (presumably left by the preceding call)
+      "ld1 { v20.4s }, [%x[out_ptr]]\n"  // keep them; new data is written over this slot
+      "2:"  // first_pass
+      "cmp %x[width], #0x10\n"
+      "blt 5f\n"  // fewer than 16 columns: go straight to the tail
+      "3:"  // Main loop head
+      "cmp x22, #0x7e\n"
+      "ble 4f\n"  // flush only once 127 blocks have accumulated (presumably to keep the 16-bit lanes from overflowing)
+      "sadalp v24.4s, v28.8h\n"  // widen the 16-bit partial sums into the 32-bit accumulators and restart them
+      "movi v28.8h, #0x0\n"
+      "sadalp v23.4s, v27.8h\n"
+      "movi v27.8h, #0x0\n"
+      "sadalp v22.4s, v26.8h\n"
+      "movi v26.8h, #0x0\n"
+      "sadalp v21.4s, v25.8h\n"
+      "movi v25.8h, #0x0\n"
+      "mov x22, #0x0\n"
+      "4:"  // no_accumulate_16
+      "ldr q19, [x23], #0x10\n"  // 16 bytes from each row (q19..q16 = rows 0..3)
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "ldr q18, [x21], #0x10\n"
+      "ldr q17, [x20], #0x10\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "ldr q16, [x19], #0x10\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "str q19, [%x[out_ptr], #0x0]\n"  // rows are stored back to back, 64 bytes per block
+      "sadalp v28.8h, v19.16b\n"  // add adjacent signed byte pairs of the row into its 16-bit accumulator
+      "prfm pldl1keep, [x19, #0x70]\n"
+      "str q18, [%x[out_ptr], #0x10]\n"
+      "sadalp v27.8h, v18.16b\n"
+      "str q17, [%x[out_ptr], #0x20]\n"
+      "sadalp v26.8h, v17.16b\n"
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "sadalp v25.8h, v16.16b\n"
+      "add x22, x22, #0x1\n"
+      "subs %x[width], %x[width], #0x10\n"
+      "cmp %x[width], #0x10\n"
+      "add %x[out_ptr], %x[out_ptr], #0x40\n"
+      "bge 3b\n"  // loop while at least 16 columns remain
+      "5:"  // Main loop skip
+      "cbz %x[width], 14f\n"  // nothing left over: only the sums remain to be written
+      "tbz %x[width], #3, 9f\n"  // tail: gather the last 1-15 bytes per row, driven by bits 3..0 of width
+      "ldr d19, [x23], #0x8\n"
+      "ldr d18, [x21], #0x8\n"
+      "ldr d17, [x20], #0x8\n"
+      "ldr d16, [x19], #0x8\n"
+      "tbz %x[width], #2, 7f\n"
+      "ld1 { v19.s }[2], [x23], #0x4\n"
+      "ld1 { v18.s }[2], [x21], #0x4\n"
+      "ld1 { v17.s }[2], [x20], #0x4\n"
+      "ld1 { v16.s }[2], [x19], #0x4\n"
+      "tbz %x[width], #1, 6f\n"
+      "ld1 { v19.h }[6], [x23], #0x2\n"
+      "ld1 { v18.h }[6], [x21], #0x2\n"
+      "ld1 { v17.h }[6], [x20], #0x2\n"
+      "ld1 { v16.h }[6], [x19], #0x2\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v19.b }[14], [x23]\n"
+      "ld1 { v18.b }[14], [x21]\n"
+      "ld1 { v17.b }[14], [x20]\n"
+      "ld1 { v16.b }[14], [x19]\n"
+      "b 13f\n"
+      "6:"  // odd_loads_1_12
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v19.b }[12], [x23]\n"
+      "ld1 { v18.b }[12], [x21]\n"
+      "ld1 { v17.b }[12], [x20]\n"
+      "ld1 { v16.b }[12], [x19]\n"
+      "b 13f\n"
+      "7:"  // odd_loads_2_8
+      "tbz %x[width], #1, 8f\n"
+      "ld1 { v19.h }[4], [x23], #0x2\n"
+      "ld1 { v18.h }[4], [x21], #0x2\n"
+      "ld1 { v17.h }[4], [x20], #0x2\n"
+      "ld1 { v16.h }[4], [x19], #0x2\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v19.b }[10], [x23]\n"
+      "ld1 { v18.b }[10], [x21]\n"
+      "ld1 { v17.b }[10], [x20]\n"
+      "ld1 { v16.b }[10], [x19]\n"
+      "b 13f\n"
+      "8:"  // odd_loads_1_8
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v19.b }[8], [x23]\n"
+      "ld1 { v18.b }[8], [x21]\n"
+      "ld1 { v17.b }[8], [x20]\n"
+      "ld1 { v16.b }[8], [x19]\n"
+      "b 13f\n"
+      "9:"  // odd_loads_4_0
+      "tbz %x[width], #2, 11f\n"
+      "ldr s19, [x23], #0x4\n"
+      "ldr s18, [x21], #0x4\n"
+      "ldr s17, [x20], #0x4\n"
+      "ldr s16, [x19], #0x4\n"
+      "tbz %x[width], #1, 10f\n"
+      "ld1 { v19.h }[2], [x23], #0x2\n"
+      "ld1 { v18.h }[2], [x21], #0x2\n"
+      "ld1 { v17.h }[2], [x20], #0x2\n"
+      "ld1 { v16.h }[2], [x19], #0x2\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v19.b }[6], [x23]\n"
+      "ld1 { v18.b }[6], [x21]\n"
+      "ld1 { v17.b }[6], [x20]\n"
+      "ld1 { v16.b }[6], [x19]\n"
+      "b 13f\n"
+      "10:"  // odd_loads_1_4
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v19.b }[4], [x23]\n"
+      "ld1 { v18.b }[4], [x21]\n"
+      "ld1 { v17.b }[4], [x20]\n"
+      "ld1 { v16.b }[4], [x19]\n"
+      "b 13f\n"
+      "11:"  // odd_loads_2_0
+      "tbz %x[width], #1, 12f\n"
+      "ldr h19, [x23], #0x2\n"
+      "ldr h18, [x21], #0x2\n"
+      "ldr h17, [x20], #0x2\n"
+      "ldr h16, [x19], #0x2\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v19.b }[2], [x23]\n"
+      "ld1 { v18.b }[2], [x21]\n"
+      "ld1 { v17.b }[2], [x20]\n"
+      "ld1 { v16.b }[2], [x19]\n"
+      "b 13f\n"
+      "12:"  // odd_loads_1_0
+      "ldr b19, [x23, #0x0]\n"
+      "ldr b18, [x21, #0x0]\n"
+      "ldr b17, [x20, #0x0]\n"
+      "ldr b16, [x19, #0x0]\n"
+      "13:"  // Odd load end
+      "str q19, [%x[out_ptr], #0x0]\n"  // the tail block is stored at full size; bytes not loaded above were zeroed by the initial scalar ldr
+      "sadalp v28.8h, v19.16b\n"  // the zeroed padding bytes add nothing to the sums
+      "str q18, [%x[out_ptr], #0x10]\n"
+      "sadalp v27.8h, v18.16b\n"
+      "str q17, [%x[out_ptr], #0x20]\n"
+      "sadalp v26.8h, v17.16b\n"
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "sadalp v25.8h, v16.16b\n"
+      "add %x[out_ptr], %x[out_ptr], #0x40\n"
+      "14:"  // Odds skip
+      "sadalp v24.4s, v28.8h\n"  // final flush of the 16-bit accumulators
+      "sadalp v23.4s, v27.8h\n"
+      "addp v24.4s, v24.4s, v23.4s\n"  // the pairwise adds below reduce each row to a single lane
+      "sadalp v22.4s, v26.8h\n"
+      "sadalp v21.4s, v25.8h\n"
+      "addp v23.4s, v22.4s, v21.4s\n"
+      "addp v24.4s, v24.4s, v23.4s\n"  // v24 = { sum(row 0), sum(row 1), sum(row 2), sum(row 3) }
+      "add v24.4s, v24.4s, v20.4s\n"  // add the carried-in sums
+      "str q24, [%x[out_ptr], #0x0]\n"  // append the 4 x int32 row sums after the interleaved data
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+      : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23"
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp
new file mode 100644
index 0000000000..af3efb25b2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+// Interleave up to 4 rows of uint8 data in 16-byte blocks and append one 32-bit sum per row after the data; when first is false, the 4 sums stored just before out_ptr are folded in and overwritten.
+template<>
+void interleave_block<4, 16, VLType::None, true>(
+  uint8_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height,
+  size_t row_offset, bool first
+)
+{
+  __asm__ __volatile__(
+      "movi v28.8h, #0x0\n"  // v28..v25: 16-bit pairwise-sum accumulators for rows 0..3
+      "ldr x23, [%x[in], #0x0]\n"  // x23 = in[0]
+      "mov x22, #0x0\n"  // x22 counts main-loop blocks since the 16-bit accumulators were last flushed
+      "movi v27.8h, #0x0\n"
+      "ldr x21, [%x[in], #0x8]\n"  // x21 = in[1]
+      "cmp %x[height], #0x4\n"  // do we have all 4 rows?
+      "movi v26.8h, #0x0\n"
+      "ldr x20, [%x[in], #0x10]\n"  // x20 = in[2]
+      "add x23, x23, %x[row_offset]\n"  // row_offset is applied to every row pointer
+      "movi v25.8h, #0x0\n"
+      "ldr x19, [%x[in], #0x18]\n"  // x19 = in[3]
+      "movi v24.4s, #0x0\n"  // v24..v21: 32-bit sum accumulators for rows 0..3
+      "add x21, x21, %x[row_offset]\n"
+      "movi v23.4s, #0x0\n"
+      "add x20, x20, %x[row_offset]\n"
+      "movi v22.4s, #0x0\n"
+      "add x19, x19, %x[row_offset]\n"
+      "movi v21.4s, #0x0\n"
+      "beq 1f\n"  // height == 4: keep all row pointers
+      "mov x19, x23\n"  // otherwise row 3 aliases row 0
+      "cmp %x[height], #0x2\n"
+      "csel x21, x21, x23, GE\n"  // row 1 is kept only if height >= 2, else aliases row 0
+      "csel x20, x20, x23, GT\n"  // row 2 is kept only if height > 2, else aliases row 0
+      "1:"  // no_pointer_adj
+      "movi v20.4s, #0x0\n"  // v20 = sums carried in from earlier data (stays zero on the first pass)
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x19, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "prfm pldl1keep, [x19, #0x40]\n"
+      "cbnz %w[first], 2f\n"  // first pass: nothing to carry over
+      "sub %x[out_ptr], %x[out_ptr], #0x10\n"  // step back over the 4 x 32-bit sums that precede out_ptr (presumably left by the preceding call)
+      "ld1 { v20.4s }, [%x[out_ptr]]\n"  // keep them; new data is written over this slot
+      "2:"  // first_pass
+      "cmp %x[width], #0x10\n"
+      "blt 5f\n"  // fewer than 16 columns: go straight to the tail
+      "3:"  // Main loop head
+      "cmp x22, #0x7e\n"
+      "ble 4f\n"  // flush only once 127 blocks have accumulated (presumably to keep the 16-bit lanes from overflowing)
+      "uadalp v24.4s, v28.8h\n"  // widen the 16-bit partial sums into the 32-bit accumulators and restart them
+      "movi v28.8h, #0x0\n"
+      "uadalp v23.4s, v27.8h\n"
+      "movi v27.8h, #0x0\n"
+      "uadalp v22.4s, v26.8h\n"
+      "movi v26.8h, #0x0\n"
+      "uadalp v21.4s, v25.8h\n"
+      "movi v25.8h, #0x0\n"
+      "mov x22, #0x0\n"
+      "4:"  // no_accumulate_16
+      "ldr q19, [x23], #0x10\n"  // 16 bytes from each row (q19..q16 = rows 0..3)
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "ldr q18, [x21], #0x10\n"
+      "ldr q17, [x20], #0x10\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "ldr q16, [x19], #0x10\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "str q19, [%x[out_ptr], #0x0]\n"  // rows are stored back to back, 64 bytes per block
+      "uadalp v28.8h, v19.16b\n"  // add adjacent unsigned byte pairs of the row into its 16-bit accumulator
+      "prfm pldl1keep, [x19, #0x70]\n"
+      "str q18, [%x[out_ptr], #0x10]\n"
+      "uadalp v27.8h, v18.16b\n"
+      "str q17, [%x[out_ptr], #0x20]\n"
+      "uadalp v26.8h, v17.16b\n"
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "uadalp v25.8h, v16.16b\n"
+      "add x22, x22, #0x1\n"
+      "subs %x[width], %x[width], #0x10\n"
+      "cmp %x[width], #0x10\n"
+      "add %x[out_ptr], %x[out_ptr], #0x40\n"
+      "bge 3b\n"  // loop while at least 16 columns remain
+      "5:"  // Main loop skip
+      "cbz %x[width], 14f\n"  // nothing left over: only the sums remain to be written
+      "tbz %x[width], #3, 9f\n"  // tail: gather the last 1-15 bytes per row, driven by bits 3..0 of width
+      "ldr d19, [x23], #0x8\n"
+      "ldr d18, [x21], #0x8\n"
+      "ldr d17, [x20], #0x8\n"
+      "ldr d16, [x19], #0x8\n"
+      "tbz %x[width], #2, 7f\n"
+      "ld1 { v19.s }[2], [x23], #0x4\n"
+      "ld1 { v18.s }[2], [x21], #0x4\n"
+      "ld1 { v17.s }[2], [x20], #0x4\n"
+      "ld1 { v16.s }[2], [x19], #0x4\n"
+      "tbz %x[width], #1, 6f\n"
+      "ld1 { v19.h }[6], [x23], #0x2\n"
+      "ld1 { v18.h }[6], [x21], #0x2\n"
+      "ld1 { v17.h }[6], [x20], #0x2\n"
+      "ld1 { v16.h }[6], [x19], #0x2\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v19.b }[14], [x23]\n"
+      "ld1 { v18.b }[14], [x21]\n"
+      "ld1 { v17.b }[14], [x20]\n"
+      "ld1 { v16.b }[14], [x19]\n"
+      "b 13f\n"
+      "6:"  // odd_loads_1_12
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v19.b }[12], [x23]\n"
+      "ld1 { v18.b }[12], [x21]\n"
+      "ld1 { v17.b }[12], [x20]\n"
+      "ld1 { v16.b }[12], [x19]\n"
+      "b 13f\n"
+      "7:"  // odd_loads_2_8
+      "tbz %x[width], #1, 8f\n"
+      "ld1 { v19.h }[4], [x23], #0x2\n"
+      "ld1 { v18.h }[4], [x21], #0x2\n"
+      "ld1 { v17.h }[4], [x20], #0x2\n"
+      "ld1 { v16.h }[4], [x19], #0x2\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v19.b }[10], [x23]\n"
+      "ld1 { v18.b }[10], [x21]\n"
+      "ld1 { v17.b }[10], [x20]\n"
+      "ld1 { v16.b }[10], [x19]\n"
+      "b 13f\n"
+      "8:"  // odd_loads_1_8
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v19.b }[8], [x23]\n"
+      "ld1 { v18.b }[8], [x21]\n"
+      "ld1 { v17.b }[8], [x20]\n"
+      "ld1 { v16.b }[8], [x19]\n"
+      "b 13f\n"
+      "9:"  // odd_loads_4_0
+      "tbz %x[width], #2, 11f\n"
+      "ldr s19, [x23], #0x4\n"
+      "ldr s18, [x21], #0x4\n"
+      "ldr s17, [x20], #0x4\n"
+      "ldr s16, [x19], #0x4\n"
+      "tbz %x[width], #1, 10f\n"
+      "ld1 { v19.h }[2], [x23], #0x2\n"
+      "ld1 { v18.h }[2], [x21], #0x2\n"
+      "ld1 { v17.h }[2], [x20], #0x2\n"
+      "ld1 { v16.h }[2], [x19], #0x2\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v19.b }[6], [x23]\n"
+      "ld1 { v18.b }[6], [x21]\n"
+      "ld1 { v17.b }[6], [x20]\n"
+      "ld1 { v16.b }[6], [x19]\n"
+      "b 13f\n"
+      "10:"  // odd_loads_1_4
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v19.b }[4], [x23]\n"
+      "ld1 { v18.b }[4], [x21]\n"
+      "ld1 { v17.b }[4], [x20]\n"
+      "ld1 { v16.b }[4], [x19]\n"
+      "b 13f\n"
+      "11:"  // odd_loads_2_0
+      "tbz %x[width], #1, 12f\n"
+      "ldr h19, [x23], #0x2\n"
+      "ldr h18, [x21], #0x2\n"
+      "ldr h17, [x20], #0x2\n"
+      "ldr h16, [x19], #0x2\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v19.b }[2], [x23]\n"
+      "ld1 { v18.b }[2], [x21]\n"
+      "ld1 { v17.b }[2], [x20]\n"
+      "ld1 { v16.b }[2], [x19]\n"
+      "b 13f\n"
+      "12:"  // odd_loads_1_0
+      "ldr b19, [x23, #0x0]\n"
+      "ldr b18, [x21, #0x0]\n"
+      "ldr b17, [x20, #0x0]\n"
+      "ldr b16, [x19, #0x0]\n"
+      "13:"  // Odd load end
+      "str q19, [%x[out_ptr], #0x0]\n"  // the tail block is stored at full size; bytes not loaded above were zeroed by the initial scalar ldr
+      "uadalp v28.8h, v19.16b\n"  // the zeroed padding bytes add nothing to the sums
+      "str q18, [%x[out_ptr], #0x10]\n"
+      "uadalp v27.8h, v18.16b\n"
+      "str q17, [%x[out_ptr], #0x20]\n"
+      "uadalp v26.8h, v17.16b\n"
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "uadalp v25.8h, v16.16b\n"
+      "add %x[out_ptr], %x[out_ptr], #0x40\n"
+      "14:"  // Odds skip
+      "uadalp v24.4s, v28.8h\n"  // final flush of the 16-bit accumulators
+      "uadalp v23.4s, v27.8h\n"
+      "addp v24.4s, v24.4s, v23.4s\n"  // the pairwise adds below reduce each row to a single lane
+      "uadalp v22.4s, v26.8h\n"
+      "uadalp v21.4s, v25.8h\n"
+      "addp v23.4s, v22.4s, v21.4s\n"
+      "addp v24.4s, v24.4s, v23.4s\n"  // v24 = { sum(row 0), sum(row 1), sum(row 2), sum(row 3) }
+      "add v24.4s, v24.4s, v20.4s\n"  // add the carried-in sums
+      "str q24, [%x[out_ptr], #0x0]\n"  // append the 4 x 32-bit row sums after the interleaved data
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+      : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23"
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp
new file mode 100644
index 0000000000..34d25f27b8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<>  // bfloat16 -> fp32 variant: gathers 8 row pointers and writes the data column by column (8 rows per column) as fp32.
+void interleave_block<8, 1, VLType::None, false>(
+  float * &out_ptr, const bfloat16 * const * in, size_t width, size_t height,  // out_ptr is a reference and is advanced by the asm ("+r"); in[0..7] are all read, whatever height is
+  size_t row_offset, bool  // row_offset is counted in 2-byte elements (LSL #1 below); the unnamed bool is not referenced
+)
+{
+  __asm__ __volatile__(
+      "movi v29.8h, #0x0\n"  // v29 = zeros; zipped in as the low 16 bits of each fp32 lane when widening bf16
+      "ldr x27, [%x[in], #0x0]\n"  // x27..x20 <- in[0]..in[7] (row 0 .. row 7 pointers)
+      "cmp %x[height], #0x8\n"  // flags are consumed by "beq 1f" below; the ldr/add in between do not set flags
+      "ldr x26, [%x[in], #0x8]\n"
+      "add x27, x27, %x[row_offset], LSL #1\n"  // every row pointer is advanced by row_offset * 2 bytes
+      "ldr x25, [%x[in], #0x10]\n"
+      "ldr x24, [%x[in], #0x18]\n"
+      "add x26, x26, %x[row_offset], LSL #1\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x25, x25, %x[row_offset], LSL #1\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset], LSL #1\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset], LSL #1\n"
+      "add x22, x22, %x[row_offset], LSL #1\n"
+      "add x21, x21, %x[row_offset], LSL #1\n"
+      "add x20, x20, %x[row_offset], LSL #1\n"
+      "beq 1f\n"  // height == 8: use all eight pointers as loaded
+      "mov x20, x27\n"  // height != 8: row 7 always aliases row 0
+      "cmp %x[height], #0x2\n"
+      "csel x26, x26, x27, GE\n"  // row 1 kept only if height >= 2, else aliases row 0
+      "csel x25, x25, x27, GT\n"  // row 2 kept only if height > 2
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n"  // row 3 kept only if height >= 4
+      "csel x23, x23, x27, GT\n"  // row 4 kept only if height > 4
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n"  // row 5 kept only if height >= 6
+      "csel x21, x21, x27, GT\n"  // row 6 kept only if height > 6
+      "1:"  // no_pointer_adj
+      "prfm pldl1keep, [x27, #0x0]\n"
+      "cmp %x[width], #0x4\n"  // flags are consumed by "blt 3f" after the prefetches (prfm does not set flags)
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "blt 3f\n"  // fewer than 4 columns: skip the main loop
+      "2:"  // Main loop head
+      "ldr d28, [x27], #0x8\n"  // 4 bf16 values from row 0, pointer post-incremented by 8 bytes
+      "zip1 v28.8h, v29.8h, v28.8h\n"  // {0, bf16} halfword pairs: each bf16 lands in the top half of a 32-bit lane
+      "prfm pldl1keep, [x27, #0x70]\n"
+      "ldr d27, [x26], #0x8\n"  // row 1
+      "zip1 v27.8h, v29.8h, v27.8h\n"
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr d26, [x25], #0x8\n"  // row 2
+      "zip1 v26.8h, v29.8h, v26.8h\n"
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr d25, [x24], #0x8\n"  // row 3
+      "zip1 v20.4s, v28.4s, v26.4s\n"  // 4x8 transpose starts: rows 0/2, columns 0-1
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "zip1 v25.8h, v29.8h, v25.8h\n"
+      "ldr d24, [x23], #0x8\n"  // row 4
+      "zip1 v19.4s, v27.4s, v25.4s\n"  // rows 1/3, columns 0-1
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "zip1 v24.8h, v29.8h, v24.8h\n"
+      "ldr d23, [x22], #0x8\n"  // row 5
+      "zip1 v16.4s, v20.4s, v19.4s\n"
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "zip1 v23.8h, v29.8h, v23.8h\n"
+      "ldr d22, [x21], #0x8\n"  // row 6
+      "zip2 v19.4s, v20.4s, v19.4s\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "zip1 v22.8h, v29.8h, v22.8h\n"
+      "ldr d21, [x20], #0x8\n"  // row 7
+      "zip1 v18.4s, v24.4s, v22.4s\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "zip1 v21.8h, v29.8h, v21.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // column 0, rows 0-3
+      "zip1 v17.4s, v23.4s, v21.4s\n"
+      "subs %x[width], %x[width], #0x4\n"  // 4 columns consumed
+      "zip2 v20.4s, v28.4s, v26.4s\n"
+      "cmp %x[width], #0x4\n"  // flags are consumed by "bge 2b" at the end of the loop body
+      "zip1 v16.4s, v18.4s, v17.4s\n"
+      "str q16, [%x[out_ptr], #0x10]\n"  // column 0, rows 4-7
+      "zip2 v16.4s, v18.4s, v17.4s\n"
+      "str q19, [%x[out_ptr], #0x20]\n"  // column 1, rows 0-3
+      "zip2 v19.4s, v27.4s, v25.4s\n"
+      "str q16, [%x[out_ptr], #0x30]\n"  // column 1, rows 4-7
+      "zip1 v16.4s, v20.4s, v19.4s\n"
+      "str q16, [%x[out_ptr], #0x40]\n"  // column 2, rows 0-3
+      "zip2 v18.4s, v24.4s, v22.4s\n"
+      "zip2 v17.4s, v23.4s, v21.4s\n"
+      "zip1 v16.4s, v18.4s, v17.4s\n"
+      "str q16, [%x[out_ptr], #0x50]\n"  // column 2, rows 4-7
+      "zip2 v16.4s, v20.4s, v19.4s\n"
+      "str q16, [%x[out_ptr], #0x60]\n"  // column 3, rows 0-3
+      "zip2 v16.4s, v18.4s, v17.4s\n"
+      "str q16, [%x[out_ptr], #0x70]\n"  // column 3, rows 4-7
+      "add %x[out_ptr], %x[out_ptr], #0x80\n"  // 4 columns x 8 rows x 4 bytes written
+      "bge 2b\n"  // repeat while at least 4 columns remain
+      "3:"  // Main loop skip
+      "cbz %x[width], 6f\n"  // no leftover columns
+      "tbz %x[width], #1, 4f\n"  // bit 1 clear: a single leftover column
+      "ldr s28, [x27], #0x4\n"  // two bf16 values per row; ldr zeroes the rest of the register
+      "ldr s27, [x26], #0x4\n"
+      "ldr s26, [x25], #0x4\n"
+      "ldr s25, [x24], #0x4\n"
+      "ldr s24, [x23], #0x4\n"
+      "ldr s23, [x22], #0x4\n"
+      "ldr s22, [x21], #0x4\n"
+      "ldr s21, [x20], #0x4\n"
+      "mov x19, #0x2\n"  // x19 = number of leftover columns to emit
+      "tbz %x[width], #0, 5f\n"
+      "ld1 { v28.h }[2], [x27]\n"  // third bf16 value into halfword lane 2
+      "ld1 { v27.h }[2], [x26]\n"
+      "ld1 { v26.h }[2], [x25]\n"
+      "ld1 { v25.h }[2], [x24]\n"
+      "ld1 { v24.h }[2], [x23]\n"
+      "ld1 { v23.h }[2], [x22]\n"
+      "ld1 { v22.h }[2], [x21]\n"
+      "ld1 { v21.h }[2], [x20]\n"
+      "mov x19, #0x3\n"
+      "b 5f\n"
+      "4:"  // odd_loads_1_0
+      "ldr h28, [x27, #0x0]\n"  // one bf16 value per row
+      "ldr h27, [x26, #0x0]\n"
+      "ldr h26, [x25, #0x0]\n"
+      "ldr h25, [x24, #0x0]\n"
+      "ldr h24, [x23, #0x0]\n"
+      "ldr h23, [x22, #0x0]\n"
+      "ldr h22, [x21, #0x0]\n"
+      "ldr h21, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "5:"  // Odd load end
+      "zip1 v28.8h, v29.8h, v28.8h\n"  // same bf16 -> fp32 widening as in the main loop
+      "subs x19, x19, #0x1\n"  // flags are consumed by the first "beq 6f"; the zip/str/add in between do not set flags
+      "zip1 v27.8h, v29.8h, v27.8h\n"
+      "zip1 v26.8h, v29.8h, v26.8h\n"
+      "zip1 v25.8h, v29.8h, v25.8h\n"
+      "zip1 v24.8h, v29.8h, v24.8h\n"
+      "zip1 v23.8h, v29.8h, v23.8h\n"
+      "zip1 v22.8h, v29.8h, v22.8h\n"
+      "zip1 v21.8h, v29.8h, v21.8h\n"
+      "zip1 v20.4s, v28.4s, v26.4s\n"
+      "zip1 v19.4s, v27.4s, v25.4s\n"
+      "zip1 v16.4s, v20.4s, v19.4s\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // first leftover column, rows 0-3
+      "zip1 v18.4s, v24.4s, v22.4s\n"
+      "zip1 v17.4s, v23.4s, v21.4s\n"
+      "zip1 v16.4s, v18.4s, v17.4s\n"
+      "str q16, [%x[out_ptr], #0x10]\n"  // first leftover column, rows 4-7
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"  // one column = 8 rows x 4 bytes
+      "beq 6f\n"  // only one leftover column
+      "zip2 v19.4s, v20.4s, v19.4s\n"
+      "zip2 v16.4s, v18.4s, v17.4s\n"
+      "str q19, [%x[out_ptr], #0x0]\n"  // second leftover column, rows 0-3
+      "str q16, [%x[out_ptr], #0x10]\n"  // second leftover column, rows 4-7
+      "subs x19, x19, #0x1\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "beq 6f\n"  // only two leftover columns
+      "zip2 v20.4s, v28.4s, v26.4s\n"
+      "zip2 v19.4s, v27.4s, v25.4s\n"
+      "zip1 v16.4s, v20.4s, v19.4s\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // third leftover column, rows 0-3
+      "zip2 v18.4s, v24.4s, v22.4s\n"
+      "zip2 v17.4s, v23.4s, v21.4s\n"
+      "zip1 v16.4s, v18.4s, v17.4s\n"
+      "str q16, [%x[out_ptr], #0x10]\n"  // third leftover column, rows 4-7
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "6:"  // Odds skip
+
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width)  // both are read and modified by the asm
+      : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp
new file mode 100644
index 0000000000..d547957129
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<>  // __fp16 -> __fp16 variant: gathers 8 row pointers and writes the data column by column (8 rows per column), no conversion.
+void interleave_block<8, 1, VLType::None, false>(
+  __fp16 * &out_ptr, const __fp16 * const * in, size_t width, size_t height,  // out_ptr is a reference and is advanced by the asm ("+r"); in[0..7] are all read, whatever height is
+  size_t row_offset, bool  // row_offset is counted in 2-byte elements (LSL #1 below); the unnamed bool is not referenced
+)
+{
+  __asm__ __volatile__(
+      "ldr x27, [%x[in], #0x0]\n"  // x27..x20 <- in[0]..in[7] (row 0 .. row 7 pointers)
+      "cmp %x[height], #0x8\n"  // flags are consumed by "beq 1f" below; the ldr/add in between do not set flags
+      "ldr x26, [%x[in], #0x8]\n"
+      "add x27, x27, %x[row_offset], LSL #1\n"  // every row pointer is advanced by row_offset * 2 bytes
+      "ldr x25, [%x[in], #0x10]\n"
+      "ldr x24, [%x[in], #0x18]\n"
+      "add x26, x26, %x[row_offset], LSL #1\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x25, x25, %x[row_offset], LSL #1\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset], LSL #1\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset], LSL #1\n"
+      "add x22, x22, %x[row_offset], LSL #1\n"
+      "add x21, x21, %x[row_offset], LSL #1\n"
+      "add x20, x20, %x[row_offset], LSL #1\n"
+      "beq 1f\n"  // height == 8: use all eight pointers as loaded
+      "mov x20, x27\n"  // height != 8: row 7 always aliases row 0
+      "cmp %x[height], #0x2\n"
+      "csel x26, x26, x27, GE\n"  // row 1 kept only if height >= 2, else aliases row 0
+      "csel x25, x25, x27, GT\n"  // row 2 kept only if height > 2
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n"  // row 3 kept only if height >= 4
+      "csel x23, x23, x27, GT\n"  // row 4 kept only if height > 4
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n"  // row 5 kept only if height >= 6
+      "csel x21, x21, x27, GT\n"  // row 6 kept only if height > 6
+      "1:"  // no_pointer_adj
+      "prfm pldl1keep, [x27, #0x0]\n"
+      "cmp %x[width], #0x8\n"  // flags are consumed by "blt 3f" after the prefetches (prfm does not set flags)
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "blt 3f\n"  // fewer than 8 columns: skip the main loop
+      "2:"  // Main loop head
+      "ldr q30, [x27], #0x10\n"  // 8 halfwords from row 0, pointer post-incremented by 16 bytes
+      "prfm pldl1keep, [x27, #0x70]\n"
+      "ldr q29, [x26], #0x10\n"  // row 1
+      "ldr q28, [x25], #0x10\n"  // row 2
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr q27, [x24], #0x10\n"  // row 3
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr q24, [x23], #0x10\n"  // row 4 (note: row 4 lives in v24, row 5 in v25)
+      "zip1 v26.8h, v30.8h, v24.8h\n"  // 8x8 halfword transpose starts: rows 0/4, columns 0-3
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr q25, [x22], #0x10\n"  // row 5
+      "zip2 v24.8h, v30.8h, v24.8h\n"  // rows 0/4, columns 4-7
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "ldr q23, [x21], #0x10\n"  // row 6
+      "zip1 v21.8h, v29.8h, v25.8h\n"  // rows 1/5, columns 0-3
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "ldr q22, [x20], #0x10\n"  // row 7
+      "zip1 v18.8h, v28.8h, v23.8h\n"  // rows 2/6, columns 0-3
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "subs %x[width], %x[width], #0x8\n"  // 8 columns consumed
+      "zip1 v20.8h, v26.8h, v18.8h\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "zip1 v19.8h, v27.8h, v22.8h\n"  // rows 3/7, columns 0-3
+      "cmp %x[width], #0x8\n"  // flags are consumed by "bge 2b" at the end of the loop body
+      "zip1 v17.8h, v21.8h, v19.8h\n"
+      "zip2 v18.8h, v26.8h, v18.8h\n"
+      "zip1 v16.8h, v20.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // column 0, rows 0-7
+      "zip2 v16.8h, v20.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x10]\n"  // column 1
+      "zip2 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x20]\n"  // column 2
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x30]\n"  // column 3
+      "zip2 v21.8h, v28.8h, v23.8h\n"  // rows 2/6, columns 4-7
+      "zip1 v18.8h, v24.8h, v21.8h\n"
+      "zip2 v20.8h, v29.8h, v25.8h\n"  // rows 1/5, columns 4-7
+      "zip2 v19.8h, v27.8h, v22.8h\n"  // rows 3/7, columns 4-7
+      "zip1 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x40]\n"  // column 4
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x50]\n"  // column 5
+      "zip2 v18.8h, v24.8h, v21.8h\n"
+      "zip2 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x60]\n"  // column 6
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x70]\n"  // column 7
+      "add %x[out_ptr], %x[out_ptr], #0x80\n"  // 8 columns x 8 rows x 2 bytes written
+      "bge 2b\n"  // repeat while at least 8 columns remain
+      "3:"  // Main loop skip
+      "cbz %x[width], 8f\n"  // no leftover columns
+      "tbz %x[width], #2, 5f\n"  // bit 2 clear: fewer than 4 leftover columns
+      "ldr d30, [x27], #0x8\n"  // four halfwords per row; ldr zeroes the rest of the register
+      "ldr d29, [x26], #0x8\n"
+      "ldr d28, [x25], #0x8\n"
+      "ldr d27, [x24], #0x8\n"
+      "ldr d24, [x23], #0x8\n"
+      "ldr d25, [x22], #0x8\n"
+      "ldr d23, [x21], #0x8\n"
+      "ldr d22, [x20], #0x8\n"
+      "tbz %x[width], #1, 4f\n"
+      "ld1 { v30.s }[2], [x27], #0x4\n"  // two more halfwords (one 32-bit lane) per row
+      "ld1 { v29.s }[2], [x26], #0x4\n"
+      "ld1 { v28.s }[2], [x25], #0x4\n"
+      "ld1 { v27.s }[2], [x24], #0x4\n"
+      "ld1 { v24.s }[2], [x23], #0x4\n"
+      "ld1 { v25.s }[2], [x22], #0x4\n"
+      "ld1 { v23.s }[2], [x21], #0x4\n"
+      "ld1 { v22.s }[2], [x20], #0x4\n"
+      "mov x19, #0x6\n"  // x19 = number of leftover columns to emit
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v30.h }[6], [x27]\n"  // seventh halfword into lane 6
+      "ld1 { v29.h }[6], [x26]\n"
+      "ld1 { v28.h }[6], [x25]\n"
+      "ld1 { v27.h }[6], [x24]\n"
+      "ld1 { v24.h }[6], [x23]\n"
+      "ld1 { v25.h }[6], [x22]\n"
+      "ld1 { v23.h }[6], [x21]\n"
+      "ld1 { v22.h }[6], [x20]\n"
+      "mov x19, #0x7\n"
+      "b 7f\n"
+      "4:"  // odd_loads_1_4
+      "mov x19, #0x4\n"
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v30.h }[4], [x27]\n"  // fifth halfword into lane 4
+      "ld1 { v29.h }[4], [x26]\n"
+      "ld1 { v28.h }[4], [x25]\n"
+      "ld1 { v27.h }[4], [x24]\n"
+      "ld1 { v24.h }[4], [x23]\n"
+      "ld1 { v25.h }[4], [x22]\n"
+      "ld1 { v23.h }[4], [x21]\n"
+      "ld1 { v22.h }[4], [x20]\n"
+      "mov x19, #0x5\n"
+      "b 7f\n"
+      "5:"  // odd_loads_2_0
+      "tbz %x[width], #1, 6f\n"  // bit 1 clear as well: a single leftover column
+      "ldr s30, [x27], #0x4\n"  // two halfwords per row
+      "ldr s29, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
+      "ldr s27, [x24], #0x4\n"
+      "ldr s24, [x23], #0x4\n"
+      "ldr s25, [x22], #0x4\n"
+      "ldr s23, [x21], #0x4\n"
+      "ldr s22, [x20], #0x4\n"
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v30.h }[2], [x27]\n"  // third halfword into lane 2
+      "ld1 { v29.h }[2], [x26]\n"
+      "ld1 { v28.h }[2], [x25]\n"
+      "ld1 { v27.h }[2], [x24]\n"
+      "ld1 { v24.h }[2], [x23]\n"
+      "ld1 { v25.h }[2], [x22]\n"
+      "ld1 { v23.h }[2], [x21]\n"
+      "ld1 { v22.h }[2], [x20]\n"
+      "mov x19, #0x3\n"
+      "b 7f\n"
+      "6:"  // odd_loads_1_0
+      "ldr h30, [x27, #0x0]\n"  // one halfword per row
+      "ldr h29, [x26, #0x0]\n"
+      "ldr h28, [x25, #0x0]\n"
+      "ldr h27, [x24, #0x0]\n"
+      "ldr h24, [x23, #0x0]\n"
+      "ldr h25, [x22, #0x0]\n"
+      "ldr h23, [x21, #0x0]\n"
+      "ldr h22, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "7:"  // Odd load end
+      "zip1 v26.8h, v30.8h, v24.8h\n"  // same transpose as the main loop, emitted one column at a time
+      "subs x19, x19, #0x1\n"  // each subs feeds the next "beq 8f"; the zip/str/add in between do not set flags
+      "zip1 v18.8h, v28.8h, v23.8h\n"
+      "zip1 v20.8h, v26.8h, v18.8h\n"
+      "zip1 v21.8h, v29.8h, v25.8h\n"
+      "zip1 v19.8h, v27.8h, v22.8h\n"
+      "zip1 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v20.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // leftover column 1
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"  // one column = 8 rows x 2 bytes
+      "beq 8f\n"
+      "zip2 v16.8h, v20.8h, v17.8h\n"
+      "subs x19, x19, #0x1\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // leftover column 2
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v18.8h, v26.8h, v18.8h\n"
+      "zip2 v17.8h, v21.8h, v19.8h\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // leftover column 3
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x19, x19, #0x1\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // leftover column 4
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v24.8h, v30.8h, v24.8h\n"
+      "zip2 v21.8h, v28.8h, v23.8h\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v18.8h, v24.8h, v21.8h\n"
+      "zip2 v20.8h, v29.8h, v25.8h\n"
+      "zip2 v19.8h, v27.8h, v22.8h\n"
+      "zip1 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // leftover column 5
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x19, x19, #0x1\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // leftover column 6
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v18.8h, v24.8h, v21.8h\n"
+      "zip2 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // leftover column 7
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "8:"  // Odds skip
+
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width)  // both are read and modified by the asm
+      : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp
new file mode 100644
index 0000000000..b45e622a47
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<>  // __fp16 -> fp32 variant: gathers 8 row pointers, converts with fcvtl and writes the data column by column (8 rows per column).
+void interleave_block<8, 1, VLType::None, false>(
+  float * &out_ptr, const __fp16 * const * in, size_t width, size_t height,  // out_ptr is a reference and is advanced by the asm ("+r"); in[0..7] are all read, whatever height is
+  size_t row_offset, bool  // row_offset is counted in 2-byte elements (LSL #1 below); the unnamed bool is not referenced
+)
+{
+  __asm__ __volatile__(
+      "ldr x27, [%x[in], #0x0]\n"  // x27..x20 <- in[0]..in[7] (row 0 .. row 7 pointers)
+      "cmp %x[height], #0x8\n"  // flags are consumed by "beq 1f" below; the ldr/add in between do not set flags
+      "ldr x26, [%x[in], #0x8]\n"
+      "add x27, x27, %x[row_offset], LSL #1\n"  // every row pointer is advanced by row_offset * 2 bytes
+      "ldr x25, [%x[in], #0x10]\n"
+      "ldr x24, [%x[in], #0x18]\n"
+      "add x26, x26, %x[row_offset], LSL #1\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x25, x25, %x[row_offset], LSL #1\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset], LSL #1\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset], LSL #1\n"
+      "add x22, x22, %x[row_offset], LSL #1\n"
+      "add x21, x21, %x[row_offset], LSL #1\n"
+      "add x20, x20, %x[row_offset], LSL #1\n"
+      "beq 1f\n"  // height == 8: use all eight pointers as loaded
+      "mov x20, x27\n"  // height != 8: row 7 always aliases row 0
+      "cmp %x[height], #0x2\n"
+      "csel x26, x26, x27, GE\n"  // row 1 kept only if height >= 2, else aliases row 0
+      "csel x25, x25, x27, GT\n"  // row 2 kept only if height > 2
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n"  // row 3 kept only if height >= 4
+      "csel x23, x23, x27, GT\n"  // row 4 kept only if height > 4
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n"  // row 5 kept only if height >= 6
+      "csel x21, x21, x27, GT\n"  // row 6 kept only if height > 6
+      "1:"  // no_pointer_adj
+      "prfm pldl1keep, [x27, #0x0]\n"
+      "cmp %x[width], #0x4\n"  // flags are consumed by "blt 3f" after the prefetches (prfm does not set flags)
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "blt 3f\n"  // fewer than 4 columns: skip the main loop
+      "2:"  // Main loop head
+      "ldr d29, [x27], #0x8\n"  // 4 fp16 values from row 0, pointer post-incremented by 8 bytes
+      "prfm pldl1keep, [x27, #0x70]\n"
+      "ldr d28, [x26], #0x8\n"  // row 1
+      "ldr d27, [x25], #0x8\n"  // row 2
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr d26, [x24], #0x8\n"  // row 3
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr d25, [x23], #0x8\n"  // row 4
+      "ldr d24, [x22], #0x8\n"  // row 5
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr d23, [x21], #0x8\n"  // row 6
+      "ldr d22, [x20], #0x8\n"  // row 7
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "fcvtl v29.4s, v29.4h\n"  // widen the four fp16 lanes to fp32 in place
+      "fcvtl v28.4s, v28.4h\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "fcvtl v27.4s, v27.4h\n"
+      "zip1 v20.4s, v29.4s, v27.4s\n"  // 4x8 transpose starts: rows 0/2, columns 0-1
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "fcvtl v26.4s, v26.4h\n"
+      "zip2 v18.4s, v29.4s, v27.4s\n"  // rows 0/2, columns 2-3
+      "fcvtl v25.4s, v25.4h\n"
+      "fcvtl v24.4s, v24.4h\n"
+      "zip1 v19.4s, v28.4s, v26.4s\n"  // rows 1/3, columns 0-1
+      "fcvtl v23.4s, v23.4h\n"
+      "zip2 v17.4s, v28.4s, v26.4s\n"  // rows 1/3, columns 2-3
+      "fcvtl v22.4s, v22.4h\n"
+      "zip1 v16.4s, v20.4s, v19.4s\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // column 0, rows 0-3
+      "zip2 v21.4s, v20.4s, v19.4s\n"
+      "subs %x[width], %x[width], #0x4\n"  // 4 columns consumed
+      "zip1 v20.4s, v18.4s, v17.4s\n"
+      "cmp %x[width], #0x4\n"  // flags are consumed by "bge 2b" at the end of the loop body
+      "zip2 v19.4s, v18.4s, v17.4s\n"
+      "zip1 v18.4s, v25.4s, v23.4s\n"
+      "zip1 v17.4s, v24.4s, v22.4s\n"
+      "zip1 v16.4s, v18.4s, v17.4s\n"
+      "str q16, [%x[out_ptr], #0x10]\n"  // column 0, rows 4-7
+      "zip2 v16.4s, v18.4s, v17.4s\n"
+      "str q21, [%x[out_ptr], #0x20]\n"  // column 1, rows 0-3
+      "zip2 v18.4s, v25.4s, v23.4s\n"
+      "str q16, [%x[out_ptr], #0x30]\n"  // column 1, rows 4-7
+      "zip2 v17.4s, v24.4s, v22.4s\n"
+      "str q20, [%x[out_ptr], #0x40]\n"  // column 2, rows 0-3
+      "zip1 v16.4s, v18.4s, v17.4s\n"
+      "str q16, [%x[out_ptr], #0x50]\n"  // column 2, rows 4-7
+      "zip2 v16.4s, v18.4s, v17.4s\n"
+      "str q19, [%x[out_ptr], #0x60]\n"  // column 3, rows 0-3
+      "str q16, [%x[out_ptr], #0x70]\n"  // column 3, rows 4-7
+      "add %x[out_ptr], %x[out_ptr], #0x80\n"  // 4 columns x 8 rows x 4 bytes written
+      "bge 2b\n"  // repeat while at least 4 columns remain
+      "3:"  // Main loop skip
+      "cbz %x[width], 6f\n"  // no leftover columns
+      "tbz %x[width], #1, 4f\n"  // bit 1 clear: a single leftover column
+      "ldr s29, [x27], #0x4\n"  // two fp16 values per row; ldr zeroes the rest of the register
+      "ldr s28, [x26], #0x4\n"
+      "ldr s27, [x25], #0x4\n"
+      "ldr s26, [x24], #0x4\n"
+      "ldr s25, [x23], #0x4\n"
+      "ldr s24, [x22], #0x4\n"
+      "ldr s23, [x21], #0x4\n"
+      "ldr s22, [x20], #0x4\n"
+      "mov x19, #0x2\n"  // x19 = number of leftover columns to emit
+      "tbz %x[width], #0, 5f\n"
+      "ld1 { v29.h }[2], [x27]\n"  // third fp16 value into halfword lane 2
+      "ld1 { v28.h }[2], [x26]\n"
+      "ld1 { v27.h }[2], [x25]\n"
+      "ld1 { v26.h }[2], [x24]\n"
+      "ld1 { v25.h }[2], [x23]\n"
+      "ld1 { v24.h }[2], [x22]\n"
+      "ld1 { v23.h }[2], [x21]\n"
+      "ld1 { v22.h }[2], [x20]\n"
+      "mov x19, #0x3\n"
+      "b 5f\n"
+      "4:"  // odd_loads_1_0
+      "ldr h29, [x27, #0x0]\n"  // one fp16 value per row
+      "ldr h28, [x26, #0x0]\n"
+      "ldr h27, [x25, #0x0]\n"
+      "ldr h26, [x24, #0x0]\n"
+      "ldr h25, [x23, #0x0]\n"
+      "ldr h24, [x22, #0x0]\n"
+      "ldr h23, [x21, #0x0]\n"
+      "ldr h22, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "5:"  // Odd load end
+      "fcvtl v29.4s, v29.4h\n"  // same fp16 -> fp32 widening as in the main loop
+      "fcvtl v28.4s, v28.4h\n"
+      "fcvtl v27.4s, v27.4h\n"
+      "zip1 v20.4s, v29.4s, v27.4s\n"
+      "fcvtl v26.4s, v26.4h\n"
+      "fcvtl v25.4s, v25.4h\n"
+      "zip1 v19.4s, v28.4s, v26.4s\n"
+      "fcvtl v24.4s, v24.4h\n"
+      "fcvtl v23.4s, v23.4h\n"
+      "zip1 v16.4s, v20.4s, v19.4s\n"
+      "fcvtl v22.4s, v22.4h\n"
+      "zip1 v18.4s, v25.4s, v23.4s\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // first leftover column, rows 0-3
+      "subs x19, x19, #0x1\n"  // flags are consumed by the first "beq 6f"; the zip/str/add in between do not set flags
+      "zip1 v17.4s, v24.4s, v22.4s\n"
+      "zip1 v16.4s, v18.4s, v17.4s\n"
+      "str q16, [%x[out_ptr], #0x10]\n"  // first leftover column, rows 4-7
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"  // one column = 8 rows x 4 bytes
+      "beq 6f\n"  // only one leftover column
+      "zip2 v21.4s, v20.4s, v19.4s\n"
+      "zip2 v16.4s, v18.4s, v17.4s\n"
+      "str q21, [%x[out_ptr], #0x0]\n"  // second leftover column, rows 0-3
+      "str q16, [%x[out_ptr], #0x10]\n"  // second leftover column, rows 4-7
+      "subs x19, x19, #0x1\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "beq 6f\n"  // only two leftover columns
+      "zip2 v18.4s, v29.4s, v27.4s\n"
+      "zip2 v17.4s, v28.4s, v26.4s\n"
+      "zip1 v20.4s, v18.4s, v17.4s\n"
+      "str q20, [%x[out_ptr], #0x0]\n"  // third leftover column, rows 0-3
+      "zip2 v18.4s, v25.4s, v23.4s\n"
+      "zip2 v17.4s, v24.4s, v22.4s\n"
+      "zip1 v16.4s, v18.4s, v17.4s\n"
+      "str q16, [%x[out_ptr], #0x10]\n"  // third leftover column, rows 4-7
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "6:"  // Odds skip
+
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width)  // both are read and modified by the asm
+      : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp
new file mode 100644
index 0000000000..3f38859c1c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<>  // fp32 -> fp32 variant: gathers 8 row pointers and writes the data column by column (8 rows per column), no conversion.
+void interleave_block<8, 1, VLType::None, false>(
+  float * &out_ptr, const float * const * in, size_t width, size_t height,  // out_ptr is a reference and is advanced by the asm ("+r"); in[0..7] are all read, whatever height is
+  size_t row_offset, bool  // row_offset is counted in 4-byte elements (LSL #2 below); the unnamed bool is not referenced
+)
+{
+  __asm__ __volatile__(
+      "ldr x27, [%x[in], #0x0]\n"  // x27..x20 <- in[0]..in[7] (row 0 .. row 7 pointers)
+      "cmp %x[height], #0x8\n"  // flags are consumed by "beq 1f" below; the ldr/add in between do not set flags
+      "ldr x26, [%x[in], #0x8]\n"
+      "add x27, x27, %x[row_offset], LSL #2\n"  // every row pointer is advanced by row_offset * 4 bytes
+      "ldr x25, [%x[in], #0x10]\n"
+      "ldr x24, [%x[in], #0x18]\n"
+      "add x26, x26, %x[row_offset], LSL #2\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x25, x25, %x[row_offset], LSL #2\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset], LSL #2\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset], LSL #2\n"
+      "add x22, x22, %x[row_offset], LSL #2\n"
+      "add x21, x21, %x[row_offset], LSL #2\n"
+      "add x20, x20, %x[row_offset], LSL #2\n"
+      "beq 1f\n"  // height == 8: use all eight pointers as loaded
+      "mov x20, x27\n"  // height != 8: row 7 always aliases row 0
+      "cmp %x[height], #0x2\n"
+      "csel x26, x26, x27, GE\n"  // row 1 kept only if height >= 2, else aliases row 0
+      "csel x25, x25, x27, GT\n"  // row 2 kept only if height > 2
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n"  // row 3 kept only if height >= 4
+      "csel x23, x23, x27, GT\n"  // row 4 kept only if height > 4
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n"  // row 5 kept only if height >= 6
+      "csel x21, x21, x27, GT\n"  // row 6 kept only if height > 6
+      "1:"  // no_pointer_adj
+      "prfm pldl1keep, [x27, #0x0]\n"
+      "cmp %x[width], #0x4\n"  // flags are consumed by "blt 3f" after the prefetches (prfm does not set flags)
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "blt 3f\n"  // fewer than 4 columns: skip the main loop
+      "2:"  // Main loop head
+      "ldr q28, [x27], #0x10\n"  // 4 floats from row 0, pointer post-incremented by 16 bytes
+      "prfm pldl1keep, [x27, #0x70]\n"
+      "ldr q27, [x26], #0x10\n"  // row 1
+      "ldr q26, [x25], #0x10\n"  // row 2
+      "zip1 v23.4s, v28.4s, v26.4s\n"  // 4x8 transpose starts: rows 0/2, columns 0-1
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr q22, [x24], #0x10\n"  // row 3 (note: row 3 lives in v22)
+      "zip2 v26.4s, v28.4s, v26.4s\n"  // rows 0/2, columns 2-3
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr q25, [x23], #0x10\n"  // row 4
+      "zip1 v20.4s, v27.4s, v22.4s\n"  // rows 1/3, columns 0-1
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr q24, [x22], #0x10\n"  // row 5
+      "zip1 v16.4s, v23.4s, v20.4s\n"
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "ldr q19, [x21], #0x10\n"  // row 6
+      "zip2 v23.4s, v23.4s, v20.4s\n"
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "zip2 v22.4s, v27.4s, v22.4s\n"  // rows 1/3, columns 2-3
+      "ldr q21, [x20], #0x10\n"  // row 7
+      "zip1 v18.4s, v25.4s, v19.4s\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // column 0, rows 0-3
+      "zip1 v20.4s, v26.4s, v22.4s\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "zip1 v16.4s, v24.4s, v21.4s\n"
+      "subs %x[width], %x[width], #0x4\n"  // 4 columns consumed
+      "zip1 v17.4s, v18.4s, v16.4s\n"
+      "cmp %x[width], #0x4\n"  // flags are consumed by "bge 2b" at the end of the loop body
+      "zip2 v16.4s, v18.4s, v16.4s\n"
+      "str q17, [%x[out_ptr], #0x10]\n"  // column 0, rows 4-7
+      "zip2 v19.4s, v25.4s, v19.4s\n"
+      "str q23, [%x[out_ptr], #0x20]\n"  // column 1, rows 0-3
+      "zip2 v18.4s, v24.4s, v21.4s\n"
+      "str q16, [%x[out_ptr], #0x30]\n"  // column 1, rows 4-7
+      "zip1 v16.4s, v19.4s, v18.4s\n"
+      "str q20, [%x[out_ptr], #0x40]\n"  // column 2, rows 0-3
+      "zip2 v17.4s, v26.4s, v22.4s\n"
+      "str q16, [%x[out_ptr], #0x50]\n"  // column 2, rows 4-7
+      "zip2 v16.4s, v19.4s, v18.4s\n"
+      "str q17, [%x[out_ptr], #0x60]\n"  // column 3, rows 0-3
+      "str q16, [%x[out_ptr], #0x70]\n"  // column 3, rows 4-7
+      "add %x[out_ptr], %x[out_ptr], #0x80\n"  // 4 columns x 8 rows x 4 bytes written
+      "bge 2b\n"  // repeat while at least 4 columns remain
+      "3:"  // Main loop skip
+      "cbz %x[width], 6f\n"  // no leftover columns
+      "tbz %x[width], #1, 4f\n"  // bit 1 clear: a single leftover column
+      "ldr d28, [x27], #0x8\n"  // two floats per row; ldr zeroes the rest of the register
+      "ldr d27, [x26], #0x8\n"
+      "ldr d26, [x25], #0x8\n"
+      "ldr d22, [x24], #0x8\n"
+      "ldr d25, [x23], #0x8\n"
+      "ldr d24, [x22], #0x8\n"
+      "ldr d19, [x21], #0x8\n"
+      "ldr d21, [x20], #0x8\n"
+      "mov x19, #0x2\n"  // x19 = number of leftover columns to emit
+      "tbz %x[width], #0, 5f\n"
+      "ld1 { v28.s }[2], [x27]\n"  // third float into lane 2
+      "ld1 { v27.s }[2], [x26]\n"
+      "ld1 { v26.s }[2], [x25]\n"
+      "ld1 { v22.s }[2], [x24]\n"
+      "ld1 { v25.s }[2], [x23]\n"
+      "ld1 { v24.s }[2], [x22]\n"
+      "ld1 { v19.s }[2], [x21]\n"
+      "ld1 { v21.s }[2], [x20]\n"
+      "mov x19, #0x3\n"
+      "b 5f\n"
+      "4:"  // odd_loads_1_0
+      "ldr s28, [x27, #0x0]\n"  // one float per row
+      "ldr s27, [x26, #0x0]\n"
+      "ldr s26, [x25, #0x0]\n"
+      "ldr s22, [x24, #0x0]\n"
+      "ldr s25, [x23, #0x0]\n"
+      "ldr s24, [x22, #0x0]\n"
+      "ldr s19, [x21, #0x0]\n"
+      "ldr s21, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "5:"  // Odd load end
+      "zip1 v23.4s, v28.4s, v26.4s\n"  // same transpose as the main loop, emitted one column at a time
+      "subs x19, x19, #0x1\n"  // flags are consumed by the first "beq 6f"; the zip/str/add in between do not set flags
+      "zip1 v20.4s, v27.4s, v22.4s\n"
+      "zip1 v16.4s, v23.4s, v20.4s\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // first leftover column, rows 0-3
+      "zip1 v18.4s, v25.4s, v19.4s\n"
+      "zip1 v16.4s, v24.4s, v21.4s\n"
+      "zip1 v17.4s, v18.4s, v16.4s\n"
+      "str q17, [%x[out_ptr], #0x10]\n"  // first leftover column, rows 4-7
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"  // one column = 8 rows x 4 bytes
+      "beq 6f\n"  // only one leftover column
+      "zip2 v23.4s, v23.4s, v20.4s\n"
+      "zip2 v16.4s, v18.4s, v16.4s\n"
+      "str q23, [%x[out_ptr], #0x0]\n"  // second leftover column, rows 0-3
+      "str q16, [%x[out_ptr], #0x10]\n"  // second leftover column, rows 4-7
+      "subs x19, x19, #0x1\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "beq 6f\n"  // only two leftover columns
+      "zip2 v26.4s, v28.4s, v26.4s\n"
+      "zip2 v22.4s, v27.4s, v22.4s\n"
+      "zip1 v20.4s, v26.4s, v22.4s\n"
+      "str q20, [%x[out_ptr], #0x0]\n"  // third leftover column, rows 0-3
+      "zip2 v19.4s, v25.4s, v19.4s\n"
+      "zip2 v18.4s, v24.4s, v21.4s\n"
+      "zip1 v16.4s, v19.4s, v18.4s\n"
+      "str q16, [%x[out_ptr], #0x10]\n"  // third leftover column, rows 4-7
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "6:"  // Odds skip
+
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width)  // both are read and modified by the asm
+      : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp
new file mode 100644
index 0000000000..03f552a575
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<>
+void interleave_block<8, 1, VLType::None, false>(  // int16_t -> int16_t: interleaves 8 rows one element at a time, no row sums
+  int16_t * &out_ptr, const int16_t * const * in, size_t width, size_t height,  // out_ptr is advanced past everything written
+  size_t row_offset, bool  // row_offset is in elements (scaled by 2 in the asm); the bool argument is unused
+)
+{
+  __asm__ __volatile__(
+      "ldr x27, [%x[in], #0x0]\n"  // x27..x20 = in[0]..in[7]; all eight entries are read regardless of height
+      "cmp %x[height], #0x8\n"  // flags are consumed by the "beq 1f" below
+      "ldr x26, [%x[in], #0x8]\n"
+      "add x27, x27, %x[row_offset], LSL #1\n"  // advance each row pointer by row_offset * 2 bytes
+      "ldr x25, [%x[in], #0x10]\n"
+      "ldr x24, [%x[in], #0x18]\n"
+      "add x26, x26, %x[row_offset], LSL #1\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x25, x25, %x[row_offset], LSL #1\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset], LSL #1\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset], LSL #1\n"
+      "add x22, x22, %x[row_offset], LSL #1\n"
+      "add x21, x21, %x[row_offset], LSL #1\n"
+      "add x20, x20, %x[row_offset], LSL #1\n"
+      "beq 1f\n"  // height == 8: every row pointer is used as loaded
+      "mov x20, x27\n"  // height != 8: row 7 aliases row 0
+      "cmp %x[height], #0x2\n"  // the csel pairs below redirect every row with index >= height to row 0
+      "csel x26, x26, x27, GE\n"
+      "csel x25, x25, x27, GT\n"
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n"
+      "csel x23, x23, x27, GT\n"
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n"
+      "csel x21, x21, x27, GT\n"
+      "1:"  // no_pointer_adj
+      "prfm pldl1keep, [x27, #0x0]\n"  // prefetch the first 128 bytes of each row
+      "cmp %x[width], #0x8\n"  // flags are consumed by the "blt 3f" after the prefetches
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "blt 3f\n"  // fewer than 8 columns: go straight to the tail
+      "2:"  // Main loop head
+      "ldr q30, [x27], #0x10\n"  // 8 halfwords from each row, post-incremented; v30,v29,v28,v27,v24,v25,v23,v22 = rows 0..7
+      "prfm pldl1keep, [x27, #0x70]\n"
+      "ldr q29, [x26], #0x10\n"
+      "ldr q28, [x25], #0x10\n"
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr q27, [x24], #0x10\n"
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr q24, [x23], #0x10\n"
+      "zip1 v26.8h, v30.8h, v24.8h\n"  // three-level zip tree (rows 0/4, 1/5, 2/6, 3/7 first) transposes the 8x8 tile
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr q25, [x22], #0x10\n"
+      "zip2 v24.8h, v30.8h, v24.8h\n"
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "ldr q23, [x21], #0x10\n"
+      "zip1 v21.8h, v29.8h, v25.8h\n"
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "ldr q22, [x20], #0x10\n"
+      "zip1 v18.8h, v28.8h, v23.8h\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "subs %x[width], %x[width], #0x8\n"  // 8 columns consumed
+      "zip1 v20.8h, v26.8h, v18.8h\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "zip1 v19.8h, v27.8h, v22.8h\n"
+      "cmp %x[width], #0x8\n"  // flags are consumed by the "bge 2b" at the bottom of the loop
+      "zip1 v17.8h, v21.8h, v19.8h\n"
+      "zip2 v18.8h, v26.8h, v18.8h\n"
+      "zip1 v16.8h, v20.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // column 0: one halfword from each of rows 0..7
+      "zip2 v16.8h, v20.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x10]\n"  // column 1
+      "zip2 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x20]\n"  // column 2
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x30]\n"  // column 3
+      "zip2 v21.8h, v28.8h, v23.8h\n"
+      "zip1 v18.8h, v24.8h, v21.8h\n"
+      "zip2 v20.8h, v29.8h, v25.8h\n"
+      "zip2 v19.8h, v27.8h, v22.8h\n"
+      "zip1 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x40]\n"  // column 4
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x50]\n"  // column 5
+      "zip2 v18.8h, v24.8h, v21.8h\n"
+      "zip2 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x60]\n"  // column 6
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x70]\n"  // column 7
+      "add %x[out_ptr], %x[out_ptr], #0x80\n"  // 8 columns x 16 bytes written
+      "bge 2b\n"
+      "3:"  // Main loop skip
+      "cbz %x[width], 8f\n"  // no leftover columns
+      "tbz %x[width], #2, 5f\n"  // tail: width is now 1..7; its bits select 4-, 2- and 1-element partial loads
+      "ldr d30, [x27], #0x8\n"  // 4 halfwords per row
+      "ldr d29, [x26], #0x8\n"
+      "ldr d28, [x25], #0x8\n"
+      "ldr d27, [x24], #0x8\n"
+      "ldr d24, [x23], #0x8\n"
+      "ldr d25, [x22], #0x8\n"
+      "ldr d23, [x21], #0x8\n"
+      "ldr d22, [x20], #0x8\n"
+      "tbz %x[width], #1, 4f\n"
+      "ld1 { v30.s }[2], [x27], #0x4\n"  // halfwords 4 and 5
+      "ld1 { v29.s }[2], [x26], #0x4\n"
+      "ld1 { v28.s }[2], [x25], #0x4\n"
+      "ld1 { v27.s }[2], [x24], #0x4\n"
+      "ld1 { v24.s }[2], [x23], #0x4\n"
+      "ld1 { v25.s }[2], [x22], #0x4\n"
+      "ld1 { v23.s }[2], [x21], #0x4\n"
+      "ld1 { v22.s }[2], [x20], #0x4\n"
+      "mov x19, #0x6\n"  // x19 = number of columns to emit in the tail
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v30.h }[6], [x27]\n"  // halfword 6
+      "ld1 { v29.h }[6], [x26]\n"
+      "ld1 { v28.h }[6], [x25]\n"
+      "ld1 { v27.h }[6], [x24]\n"
+      "ld1 { v24.h }[6], [x23]\n"
+      "ld1 { v25.h }[6], [x22]\n"
+      "ld1 { v23.h }[6], [x21]\n"
+      "ld1 { v22.h }[6], [x20]\n"
+      "mov x19, #0x7\n"
+      "b 7f\n"
+      "4:"  // odd_loads_1_4
+      "mov x19, #0x4\n"
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v30.h }[4], [x27]\n"  // halfword 4
+      "ld1 { v29.h }[4], [x26]\n"
+      "ld1 { v28.h }[4], [x25]\n"
+      "ld1 { v27.h }[4], [x24]\n"
+      "ld1 { v24.h }[4], [x23]\n"
+      "ld1 { v25.h }[4], [x22]\n"
+      "ld1 { v23.h }[4], [x21]\n"
+      "ld1 { v22.h }[4], [x20]\n"
+      "mov x19, #0x5\n"
+      "b 7f\n"
+      "5:"  // odd_loads_2_0
+      "tbz %x[width], #1, 6f\n"
+      "ldr s30, [x27], #0x4\n"  // 2 halfwords per row
+      "ldr s29, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
+      "ldr s27, [x24], #0x4\n"
+      "ldr s24, [x23], #0x4\n"
+      "ldr s25, [x22], #0x4\n"
+      "ldr s23, [x21], #0x4\n"
+      "ldr s22, [x20], #0x4\n"
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v30.h }[2], [x27]\n"  // halfword 2
+      "ld1 { v29.h }[2], [x26]\n"
+      "ld1 { v28.h }[2], [x25]\n"
+      "ld1 { v27.h }[2], [x24]\n"
+      "ld1 { v24.h }[2], [x23]\n"
+      "ld1 { v25.h }[2], [x22]\n"
+      "ld1 { v23.h }[2], [x21]\n"
+      "ld1 { v22.h }[2], [x20]\n"
+      "mov x19, #0x3\n"
+      "b 7f\n"
+      "6:"  // odd_loads_1_0
+      "ldr h30, [x27, #0x0]\n"  // exactly one halfword per row remains
+      "ldr h29, [x26, #0x0]\n"
+      "ldr h28, [x25, #0x0]\n"
+      "ldr h27, [x24, #0x0]\n"
+      "ldr h24, [x23, #0x0]\n"
+      "ldr h25, [x22, #0x0]\n"
+      "ldr h23, [x21, #0x0]\n"
+      "ldr h22, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "7:"  // Odd load end
+      "zip1 v26.8h, v30.8h, v24.8h\n"  // same zip tree as the main loop, but only x19 column vectors are stored
+      "subs x19, x19, #0x1\n"  // each subs/beq pair exits once the requested number of columns is written
+      "zip1 v18.8h, v28.8h, v23.8h\n"
+      "zip1 v20.8h, v26.8h, v18.8h\n"
+      "zip1 v21.8h, v29.8h, v25.8h\n"
+      "zip1 v19.8h, v27.8h, v22.8h\n"
+      "zip1 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v20.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // tail column 0
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v16.8h, v20.8h, v17.8h\n"
+      "subs x19, x19, #0x1\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // tail column 1
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v18.8h, v26.8h, v18.8h\n"
+      "zip2 v17.8h, v21.8h, v19.8h\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // tail column 2
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x19, x19, #0x1\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // tail column 3
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v24.8h, v30.8h, v24.8h\n"
+      "zip2 v21.8h, v28.8h, v23.8h\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v18.8h, v24.8h, v21.8h\n"
+      "zip2 v20.8h, v29.8h, v25.8h\n"
+      "zip2 v19.8h, v27.8h, v22.8h\n"
+      "zip1 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // tail column 4
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x19, x19, #0x1\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // tail column 5
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v18.8h, v24.8h, v21.8h\n"
+      "zip2 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // tail column 6 (at most 7 tail columns exist)
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "8:"  // Odds skip
+
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width)  // both are modified by the asm; width is only a local copy
+      : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+    );
+}
+
+template<>
+void interleave_block<8, 1, VLType::None, false>(
+  uint16_t * &out_ptr, const uint16_t * const * in, size_t width, size_t height,
+  size_t row_offset, bool
+)
+{
+  // The int16_t kernel only loads, zips and stores 16-bit lanes, so reuse it for uint16_t via casts.
+  interleave_block<8, 1, VLType::None, false>(
+    reinterpret_cast<int16_t * &>(out_ptr), reinterpret_cast<const int16_t * const *>(in),
+    width, height, row_offset, false);
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp
new file mode 100644
index 0000000000..35c7719de7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<>
+void interleave_block<8, 1, VLType::None, true>(  // int16_t -> int16_t: interleaves 8 rows one element at a time and appends per-row sums
+  int16_t * &out_ptr, const int16_t * const * in, size_t width, size_t height,  // out_ptr is advanced past the data and the 32-byte sum block
+  size_t row_offset, bool first  // row_offset is in elements (scaled by 2 in the asm); first == false resumes from sums already stored before out_ptr
+)
+{
+  __asm__ __volatile__(
+      "movi v1.8h, #0x0\n"  // v1 = 16-bit partial sums, one lane per row
+      "ldr x27, [%x[in], #0x0]\n"  // x27..x20 = in[0]..in[7]; all eight entries are read regardless of height
+      "mov x19, #0x0\n"  // x19 = main-loop iterations since v1 was last flushed
+      "movi v0.4s, #0x0\n"  // v0 = 32-bit sums for rows 0..3
+      "ldr x26, [%x[in], #0x8]\n"
+      "cmp %x[height], #0x8\n"  // flags are consumed by the "beq 1f" below
+      "movi v31.4s, #0x0\n"  // v31 = 32-bit sums for rows 4..7
+      "ldr x25, [%x[in], #0x10]\n"
+      "add x27, x27, %x[row_offset], LSL #1\n"  // advance each row pointer by row_offset * 2 bytes
+      "ldr x24, [%x[in], #0x18]\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x26, x26, %x[row_offset], LSL #1\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "add x25, x25, %x[row_offset], LSL #1\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset], LSL #1\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset], LSL #1\n"
+      "add x22, x22, %x[row_offset], LSL #1\n"
+      "add x21, x21, %x[row_offset], LSL #1\n"
+      "add x20, x20, %x[row_offset], LSL #1\n"
+      "beq 1f\n"  // height == 8: every row pointer is used as loaded
+      "mov x20, x27\n"  // height != 8: row 7 aliases row 0
+      "cmp %x[height], #0x2\n"  // the csel pairs below redirect every row with index >= height to row 0
+      "csel x26, x26, x27, GE\n"
+      "csel x25, x25, x27, GT\n"
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n"
+      "csel x23, x23, x27, GT\n"
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n"
+      "csel x21, x21, x27, GT\n"
+      "1:"  // no_pointer_adj
+      "prfm pldl1keep, [x27, #0x0]\n"  // prefetch the first 128 bytes of each row
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "cbnz %w[first], 2f\n"  // first pass: keep the zeroed sums
+      "sub %x[out_ptr], %x[out_ptr], #0x20\n"  // otherwise step back over the 32-byte sum block before out_ptr...
+      "ld1 { v0.4s }, [%x[out_ptr]]\n"  // ...reload it, and let the new data overwrite it
+      "ldr q31, [%x[out_ptr], #0x10]\n"
+      "2:"  // first_pass
+      "cmp %x[width], #0x8\n"
+      "blt 5f\n"  // fewer than 8 columns: go straight to the tail
+      "3:"  // Main loop head
+      "cmp x19, #0xe\n"  // after 15 iterations, flush the 16-bit partial sums into the 32-bit totals
+      "ble 4f\n"
+      "saddw v0.4s, v0.4s, v1.4h\n"  // rows 0..3: sign-extend and accumulate
+      "saddw2 v31.4s, v31.4s, v1.8h\n"  // rows 4..7
+      "mov x19, #0x0\n"
+      "movi v1.8h, #0x0\n"
+      "4:"  // no_accumulate_16
+      "ldr q30, [x27], #0x10\n"  // 8 halfwords from each row, post-incremented; v30,v29,v28,v27,v24,v25,v23,v22 = rows 0..7
+      "prfm pldl1keep, [x27, #0x70]\n"
+      "ldr q29, [x26], #0x10\n"
+      "ldr q28, [x25], #0x10\n"
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr q27, [x24], #0x10\n"
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr q24, [x23], #0x10\n"
+      "zip1 v26.8h, v30.8h, v24.8h\n"  // three-level zip tree (rows 0/4, 1/5, 2/6, 3/7 first) transposes the 8x8 tile
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr q25, [x22], #0x10\n"
+      "zip2 v24.8h, v30.8h, v24.8h\n"
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "ldr q23, [x21], #0x10\n"
+      "zip1 v21.8h, v29.8h, v25.8h\n"
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "ldr q22, [x20], #0x10\n"
+      "zip1 v18.8h, v28.8h, v23.8h\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "add x19, x19, #0x1\n"  // one more iteration accumulated in v1
+      "zip1 v20.8h, v26.8h, v18.8h\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "zip1 v19.8h, v27.8h, v22.8h\n"
+      "subs %x[width], %x[width], #0x8\n"  // 8 columns consumed
+      "zip1 v17.8h, v21.8h, v19.8h\n"
+      "cmp %x[width], #0x8\n"  // flags are consumed by the "bge 3b" at the bottom of the loop
+      "zip2 v18.8h, v26.8h, v18.8h\n"
+      "zip1 v16.8h, v20.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // column 0: one halfword from each of rows 0..7
+      "add v1.8h, v1.8h, v16.8h\n"  // every stored column is also added lane-wise into v1; NOTE(review): 16-bit lanes may wrap for full-range int16 data - confirm caller's value range
+      "zip2 v17.8h, v20.8h, v17.8h\n"
+      "str q17, [%x[out_ptr], #0x10]\n"  // column 1
+      "zip2 v16.8h, v21.8h, v19.8h\n"
+      "add v1.8h, v1.8h, v17.8h\n"
+      "zip1 v17.8h, v18.8h, v16.8h\n"
+      "str q17, [%x[out_ptr], #0x20]\n"  // column 2
+      "zip2 v16.8h, v18.8h, v16.8h\n"
+      "str q16, [%x[out_ptr], #0x30]\n"  // column 3
+      "add v1.8h, v1.8h, v17.8h\n"
+      "zip2 v21.8h, v28.8h, v23.8h\n"
+      "zip1 v18.8h, v24.8h, v21.8h\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v20.8h, v29.8h, v25.8h\n"
+      "zip2 v19.8h, v27.8h, v22.8h\n"
+      "zip1 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x40]\n"  // column 4
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x50]\n"  // column 5
+      "zip2 v18.8h, v24.8h, v21.8h\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x60]\n"  // column 6
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x70]\n"  // column 7
+      "add %x[out_ptr], %x[out_ptr], #0x80\n"  // 8 columns x 16 bytes written
+      "add v1.8h, v1.8h, v16.8h\n"
+      "bge 3b\n"
+      "5:"  // Main loop skip
+      "cbz %x[width], 10f\n"  // no leftover columns: just write the sums
+      "tbz %x[width], #2, 7f\n"  // tail: width is now 1..7; its bits select 4-, 2- and 1-element partial loads
+      "ldr d30, [x27], #0x8\n"  // 4 halfwords per row
+      "ldr d29, [x26], #0x8\n"
+      "ldr d28, [x25], #0x8\n"
+      "ldr d27, [x24], #0x8\n"
+      "ldr d24, [x23], #0x8\n"
+      "ldr d25, [x22], #0x8\n"
+      "ldr d23, [x21], #0x8\n"
+      "ldr d22, [x20], #0x8\n"
+      "tbz %x[width], #1, 6f\n"
+      "ld1 { v30.s }[2], [x27], #0x4\n"  // halfwords 4 and 5
+      "ld1 { v29.s }[2], [x26], #0x4\n"
+      "ld1 { v28.s }[2], [x25], #0x4\n"
+      "ld1 { v27.s }[2], [x24], #0x4\n"
+      "ld1 { v24.s }[2], [x23], #0x4\n"
+      "ld1 { v25.s }[2], [x22], #0x4\n"
+      "ld1 { v23.s }[2], [x21], #0x4\n"
+      "ld1 { v22.s }[2], [x20], #0x4\n"
+      "mov x19, #0x6\n"  // from here on x19 is reused as the number of tail columns to emit
+      "tbz %x[width], #0, 9f\n"
+      "ld1 { v30.h }[6], [x27]\n"  // halfword 6
+      "ld1 { v29.h }[6], [x26]\n"
+      "ld1 { v28.h }[6], [x25]\n"
+      "ld1 { v27.h }[6], [x24]\n"
+      "ld1 { v24.h }[6], [x23]\n"
+      "ld1 { v25.h }[6], [x22]\n"
+      "ld1 { v23.h }[6], [x21]\n"
+      "ld1 { v22.h }[6], [x20]\n"
+      "mov x19, #0x7\n"
+      "b 9f\n"
+      "6:"  // odd_loads_1_4
+      "mov x19, #0x4\n"
+      "tbz %x[width], #0, 9f\n"
+      "ld1 { v30.h }[4], [x27]\n"  // halfword 4
+      "ld1 { v29.h }[4], [x26]\n"
+      "ld1 { v28.h }[4], [x25]\n"
+      "ld1 { v27.h }[4], [x24]\n"
+      "ld1 { v24.h }[4], [x23]\n"
+      "ld1 { v25.h }[4], [x22]\n"
+      "ld1 { v23.h }[4], [x21]\n"
+      "ld1 { v22.h }[4], [x20]\n"
+      "mov x19, #0x5\n"
+      "b 9f\n"
+      "7:"  // odd_loads_2_0
+      "tbz %x[width], #1, 8f\n"
+      "ldr s30, [x27], #0x4\n"  // 2 halfwords per row
+      "ldr s29, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
+      "ldr s27, [x24], #0x4\n"
+      "ldr s24, [x23], #0x4\n"
+      "ldr s25, [x22], #0x4\n"
+      "ldr s23, [x21], #0x4\n"
+      "ldr s22, [x20], #0x4\n"
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 9f\n"
+      "ld1 { v30.h }[2], [x27]\n"  // halfword 2
+      "ld1 { v29.h }[2], [x26]\n"
+      "ld1 { v28.h }[2], [x25]\n"
+      "ld1 { v27.h }[2], [x24]\n"
+      "ld1 { v24.h }[2], [x23]\n"
+      "ld1 { v25.h }[2], [x22]\n"
+      "ld1 { v23.h }[2], [x21]\n"
+      "ld1 { v22.h }[2], [x20]\n"
+      "mov x19, #0x3\n"
+      "b 9f\n"
+      "8:"  // odd_loads_1_0
+      "ldr h30, [x27, #0x0]\n"  // exactly one halfword per row remains
+      "ldr h29, [x26, #0x0]\n"
+      "ldr h28, [x25, #0x0]\n"
+      "ldr h27, [x24, #0x0]\n"
+      "ldr h24, [x23, #0x0]\n"
+      "ldr h25, [x22, #0x0]\n"
+      "ldr h23, [x21, #0x0]\n"
+      "ldr h22, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "9:"  // Odd load end
+      "zip1 v26.8h, v30.8h, v24.8h\n"  // same zip tree as the main loop, but only x19 column vectors are stored and summed
+      "subs x19, x19, #0x1\n"  // each subs/beq pair exits once the requested number of columns is written
+      "zip1 v18.8h, v28.8h, v23.8h\n"
+      "zip1 v20.8h, v26.8h, v18.8h\n"
+      "zip1 v21.8h, v29.8h, v25.8h\n"
+      "zip1 v19.8h, v27.8h, v22.8h\n"
+      "zip1 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v20.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // tail column 0
+      "add v1.8h, v1.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v17.8h, v20.8h, v17.8h\n"
+      "subs x19, x19, #0x1\n"
+      "add v1.8h, v1.8h, v17.8h\n"
+      "str q17, [%x[out_ptr], #0x0]\n"  // tail column 1
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v18.8h, v26.8h, v18.8h\n"
+      "zip2 v16.8h, v21.8h, v19.8h\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v17.8h, v18.8h, v16.8h\n"
+      "str q17, [%x[out_ptr], #0x0]\n"  // tail column 2
+      "add v1.8h, v1.8h, v17.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v16.8h, v18.8h, v16.8h\n"
+      "subs x19, x19, #0x1\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // tail column 3
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v24.8h, v30.8h, v24.8h\n"
+      "zip2 v21.8h, v28.8h, v23.8h\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v18.8h, v24.8h, v21.8h\n"
+      "zip2 v20.8h, v29.8h, v25.8h\n"
+      "zip2 v19.8h, v27.8h, v22.8h\n"
+      "zip1 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // tail column 4
+      "add v1.8h, v1.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x19, x19, #0x1\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // tail column 5
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v18.8h, v24.8h, v21.8h\n"
+      "zip2 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // tail column 6 (at most 7 tail columns exist)
+      "add v1.8h, v1.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "10:"  // Odds skip
+      "saddw v0.4s, v0.4s, v1.4h\n"  // final flush of the 16-bit partial sums into the 32-bit totals
+      "str q0, [%x[out_ptr], #0x0]\n"  // sums for rows 0..3 follow the interleaved data
+      "saddw2 v31.4s, v31.4s, v1.8h\n"
+      "str q31, [%x[out_ptr], #0x10]\n"  // sums for rows 4..7
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"  // leave out_ptr just past the 32-byte sum block
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width)  // both are modified by the asm; width is only a local copy
+      : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp
new file mode 100644
index 0000000000..582836fe67
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<>
+void interleave_block<8, 1, VLType::None, false>(  // int8_t -> int16_t: sign-extends and interleaves 8 rows one element at a time, no row sums
+  int16_t * &out_ptr, const int8_t * const * in, size_t width, size_t height,  // out_ptr is advanced past everything written
+  size_t row_offset, bool  // row_offset is in elements (one byte each, added unscaled); the bool argument is unused
+)
+{
+  __asm__ __volatile__(
+      "ldr x27, [%x[in], #0x0]\n"  // x27..x20 = in[0]..in[7]; all eight entries are read regardless of height
+      "cmp %x[height], #0x8\n"  // flags are consumed by the "beq 1f" below
+      "ldr x26, [%x[in], #0x8]\n"
+      "add x27, x27, %x[row_offset]\n"  // advance each row pointer by row_offset bytes
+      "ldr x25, [%x[in], #0x10]\n"
+      "ldr x24, [%x[in], #0x18]\n"
+      "add x26, x26, %x[row_offset]\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x25, x25, %x[row_offset]\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset]\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset]\n"
+      "add x22, x22, %x[row_offset]\n"
+      "add x21, x21, %x[row_offset]\n"
+      "add x20, x20, %x[row_offset]\n"
+      "beq 1f\n"  // height == 8: every row pointer is used as loaded
+      "mov x20, x27\n"  // height != 8: row 7 aliases row 0
+      "cmp %x[height], #0x2\n"  // the csel pairs below redirect every row with index >= height to row 0
+      "csel x26, x26, x27, GE\n"
+      "csel x25, x25, x27, GT\n"
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n"
+      "csel x23, x23, x27, GT\n"
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n"
+      "csel x21, x21, x27, GT\n"
+      "1:"  // no_pointer_adj
+      "prfm pldl1keep, [x27, #0x0]\n"  // prefetch the first 128 bytes of each row
+      "cmp %x[width], #0x8\n"  // flags are consumed by the "blt 3f" after the prefetches
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "blt 3f\n"  // fewer than 8 columns: go straight to the tail
+      "2:"  // Main loop head
+      "ldr d30, [x27], #0x8\n"  // 8 bytes from each row, post-incremented; v30,v29,v28,v27,v23,v21,v26,v25 = rows 0..7
+      "prfm pldl1keep, [x27, #0x70]\n"
+      "ldr d29, [x26], #0x8\n"
+      "ldr d28, [x25], #0x8\n"
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr d27, [x24], #0x8\n"
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr d23, [x23], #0x8\n"
+      "ldr d21, [x22], #0x8\n"
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr d26, [x21], #0x8\n"
+      "ldr d25, [x20], #0x8\n"
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "sshll v30.8h, v30.8b, #0x0\n"  // shift-by-zero sshll = sign-extend 8 x int8 to 8 x int16
+      "sshll v29.8h, v29.8b, #0x0\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "sshll v28.8h, v28.8b, #0x0\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "sshll v27.8h, v27.8b, #0x0\n"
+      "sshll v23.8h, v23.8b, #0x0\n"
+      "zip1 v24.8h, v30.8h, v23.8h\n"  // three-level zip tree (rows 0/4, 1/5, 2/6, 3/7 first) transposes the 8x8 tile
+      "sshll v21.8h, v21.8b, #0x0\n"
+      "zip2 v23.8h, v30.8h, v23.8h\n"
+      "sshll v26.8h, v26.8b, #0x0\n"
+      "sshll v25.8h, v25.8b, #0x0\n"
+      "zip1 v22.8h, v29.8h, v21.8h\n"
+      "subs %x[width], %x[width], #0x8\n"  // 8 columns consumed
+      "zip2 v21.8h, v29.8h, v21.8h\n"
+      "cmp %x[width], #0x8\n"  // flags are consumed by the "bge 2b" at the bottom of the loop
+      "zip1 v20.8h, v28.8h, v26.8h\n"
+      "zip1 v18.8h, v24.8h, v20.8h\n"
+      "zip1 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v22.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // column 0: one widened element from each of rows 0..7
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x10]\n"  // column 1
+      "zip2 v18.8h, v24.8h, v20.8h\n"
+      "zip2 v17.8h, v22.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x20]\n"  // column 2
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x30]\n"  // column 3
+      "zip2 v20.8h, v28.8h, v26.8h\n"
+      "zip1 v18.8h, v23.8h, v20.8h\n"
+      "zip2 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x40]\n"  // column 4
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x50]\n"  // column 5
+      "zip2 v18.8h, v23.8h, v20.8h\n"
+      "zip2 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x60]\n"  // column 6
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x70]\n"  // column 7
+      "add %x[out_ptr], %x[out_ptr], #0x80\n"  // 8 columns x 16 bytes written
+      "bge 2b\n"
+      "3:"  // Main loop skip
+      "cbz %x[width], 8f\n"  // no leftover columns
+      "tbz %x[width], #2, 5f\n"  // tail: width is now 1..7; its bits select 4-, 2- and 1-element partial loads
+      "ldr s30, [x27], #0x4\n"  // 4 bytes per row
+      "ldr s29, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
+      "ldr s27, [x24], #0x4\n"
+      "ldr s23, [x23], #0x4\n"
+      "ldr s21, [x22], #0x4\n"
+      "ldr s26, [x21], #0x4\n"
+      "ldr s25, [x20], #0x4\n"
+      "tbz %x[width], #1, 4f\n"
+      "ld1 { v30.h }[2], [x27], #0x2\n"  // bytes 4 and 5
+      "ld1 { v29.h }[2], [x26], #0x2\n"
+      "ld1 { v28.h }[2], [x25], #0x2\n"
+      "ld1 { v27.h }[2], [x24], #0x2\n"
+      "ld1 { v23.h }[2], [x23], #0x2\n"
+      "ld1 { v21.h }[2], [x22], #0x2\n"
+      "ld1 { v26.h }[2], [x21], #0x2\n"
+      "ld1 { v25.h }[2], [x20], #0x2\n"
+      "mov x19, #0x6\n"  // x19 = number of columns to emit in the tail
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v30.b }[6], [x27]\n"  // byte 6
+      "ld1 { v29.b }[6], [x26]\n"
+      "ld1 { v28.b }[6], [x25]\n"
+      "ld1 { v27.b }[6], [x24]\n"
+      "ld1 { v23.b }[6], [x23]\n"
+      "ld1 { v21.b }[6], [x22]\n"
+      "ld1 { v26.b }[6], [x21]\n"
+      "ld1 { v25.b }[6], [x20]\n"
+      "mov x19, #0x7\n"
+      "b 7f\n"
+      "4:"  // odd_loads_1_4
+      "mov x19, #0x4\n"
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v30.b }[4], [x27]\n"  // byte 4
+      "ld1 { v29.b }[4], [x26]\n"
+      "ld1 { v28.b }[4], [x25]\n"
+      "ld1 { v27.b }[4], [x24]\n"
+      "ld1 { v23.b }[4], [x23]\n"
+      "ld1 { v21.b }[4], [x22]\n"
+      "ld1 { v26.b }[4], [x21]\n"
+      "ld1 { v25.b }[4], [x20]\n"
+      "mov x19, #0x5\n"
+      "b 7f\n"
+      "5:"  // odd_loads_2_0
+      "tbz %x[width], #1, 6f\n"
+      "ldr h30, [x27], #0x2\n"  // 2 bytes per row
+      "ldr h29, [x26], #0x2\n"
+      "ldr h28, [x25], #0x2\n"
+      "ldr h27, [x24], #0x2\n"
+      "ldr h23, [x23], #0x2\n"
+      "ldr h21, [x22], #0x2\n"
+      "ldr h26, [x21], #0x2\n"
+      "ldr h25, [x20], #0x2\n"
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v30.b }[2], [x27]\n"  // byte 2
+      "ld1 { v29.b }[2], [x26]\n"
+      "ld1 { v28.b }[2], [x25]\n"
+      "ld1 { v27.b }[2], [x24]\n"
+      "ld1 { v23.b }[2], [x23]\n"
+      "ld1 { v21.b }[2], [x22]\n"
+      "ld1 { v26.b }[2], [x21]\n"
+      "ld1 { v25.b }[2], [x20]\n"
+      "mov x19, #0x3\n"
+      "b 7f\n"
+      "6:"  // odd_loads_1_0
+      "ldr b30, [x27, #0x0]\n"  // exactly one byte per row remains
+      "ldr b29, [x26, #0x0]\n"
+      "ldr b28, [x25, #0x0]\n"
+      "ldr b27, [x24, #0x0]\n"
+      "ldr b23, [x23, #0x0]\n"
+      "ldr b21, [x22, #0x0]\n"
+      "ldr b26, [x21, #0x0]\n"
+      "ldr b25, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "7:"  // Odd load end
+      "sshll v30.8h, v30.8b, #0x0\n"  // widen the partial rows, then run the same zip tree; only x19 column vectors are stored
+      "sshll v29.8h, v29.8b, #0x0\n"
+      "sshll v28.8h, v28.8b, #0x0\n"
+      "sshll v27.8h, v27.8b, #0x0\n"
+      "sshll v23.8h, v23.8b, #0x0\n"
+      "zip1 v24.8h, v30.8h, v23.8h\n"
+      "sshll v21.8h, v21.8b, #0x0\n"
+      "sshll v26.8h, v26.8b, #0x0\n"
+      "zip1 v20.8h, v28.8h, v26.8h\n"
+      "sshll v25.8h, v25.8b, #0x0\n"
+      "zip1 v22.8h, v29.8h, v21.8h\n"
+      "subs x19, x19, #0x1\n"  // each subs/beq pair exits once the requested number of columns is written
+      "zip1 v18.8h, v24.8h, v20.8h\n"
+      "zip1 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v22.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // tail column 0
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x19, x19, #0x1\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // tail column 1
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v18.8h, v24.8h, v20.8h\n"
+      "zip2 v17.8h, v22.8h, v19.8h\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // tail column 2
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x19, x19, #0x1\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // tail column 3
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v23.8h, v30.8h, v23.8h\n"
+      "zip2 v20.8h, v28.8h, v26.8h\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v18.8h, v23.8h, v20.8h\n"
+      "zip2 v21.8h, v29.8h, v21.8h\n"
+      "zip2 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // tail column 4
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x19, x19, #0x1\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // tail column 5
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v18.8h, v23.8h, v20.8h\n"
+      "zip2 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // tail column 6 (at most 7 tail columns exist)
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "8:"  // Odds skip
+
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width)  // both are modified by the asm; width is only a local copy
+      : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp
new file mode 100644
index 0000000000..35dc3dc0d4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<>  // Specialisation: sign-extends 8 rows of int8 to int16, stores them one column (8 rows) per 16-byte vector, then appends 8 per-row 32-bit sums.
+void interleave_block<8, 1, VLType::None, true>(
+  int16_t * &out_ptr, const int8_t * const * in, size_t width, size_t height,  // out_ptr: advanced past everything written; in: eight row-pointer slots are read; width: elements per row; height: rows actually present
+  size_t row_offset, bool first  // row_offset: added (in bytes) to every row pointer; first: non-zero starts the sums from zero, zero reloads them from just below out_ptr
+)
+{
+  __asm__ __volatile__(
+      "movi v1.8h, #0x0\n"  // v1: 16-bit partial sums, lane i belongs to row i
+      "ldr x27, [%x[in], #0x0]\n"  // x27..x20 receive the pointers for rows 0..7; NOTE(review): all eight slots are read before height is examined - confirm callers always provide eight entries
+      "mov x19, #0x0\n"  // x19: main-loop iterations since v1 was last widened
+      "movi v0.4s, #0x0\n"  // v0: 32-bit sums for rows 0-3
+      "ldr x26, [%x[in], #0x8]\n"
+      "cmp %x[height], #0x8\n"  // flags are consumed by the beq below; the ldr/add/movi in between do not set flags
+      "movi v31.4s, #0x0\n"  // v31: 32-bit sums for rows 4-7
+      "ldr x25, [%x[in], #0x10]\n"
+      "add x27, x27, %x[row_offset]\n"  // one byte per element, so the offset is applied unscaled
+      "ldr x24, [%x[in], #0x18]\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x26, x26, %x[row_offset]\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "add x25, x25, %x[row_offset]\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset]\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset]\n"
+      "add x22, x22, %x[row_offset]\n"
+      "add x21, x21, %x[row_offset]\n"
+      "add x20, x20, %x[row_offset]\n"
+      "beq 1f\n"  // height == 8: use the eight pointers as loaded
+      "mov x20, x27\n"  // height != 8: row 7 reads through row 0's pointer
+      "cmp %x[height], #0x2\n"
+      "csel x26, x26, x27, GE\n"  // row 1 keeps its pointer only if height >= 2, otherwise it aliases row 0
+      "csel x25, x25, x27, GT\n"  // row 2 keeps its pointer only if height > 2
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n"  // row 3: height >= 4
+      "csel x23, x23, x27, GT\n"  // row 4: height > 4
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n"  // row 5: height >= 6
+      "csel x21, x21, x27, GT\n"  // row 6: height > 6
+      "1:"  // no_pointer_adj: x27..x20 are the eight addresses the loads below will use
+      "prfm pldl1keep, [x27, #0x0]\n"  // prefetch hints for the start of every row
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"  // and for 0x40 bytes further into every row
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "cbnz %w[first], 2f\n"  // first != 0: keep the zeroed sums. NOTE(review): tests the low 32 bits of a register bound to a bool - confirm the upper bits are guaranteed zero
+      "sub %x[out_ptr], %x[out_ptr], #0x20\n"  // step back over 0x20 bytes - presumably the sums stored at the end of a previous call; confirm against caller
+      "ld1 { v0.4s }, [%x[out_ptr]]\n"  // reload sums for rows 0-3
+      "ldr q31, [%x[out_ptr], #0x10]\n"  // reload sums for rows 4-7
+      "2:"  // first_pass: v0/v31 now hold the starting sums
+      "cmp %x[width], #0x8\n"
+      "blt 5f\n"  // fewer than 8 columns: only the remainder path runs
+      "3:"  // Main loop head: one iteration consumes 8 columns from each of the 8 rows
+      "cmp x19, #0xe\n"
+      "ble 4f\n"  // at most 14 iterations accumulated in v1: no need to widen yet
+      "saddw v0.4s, v0.4s, v1.4h\n"  // fold the low four 16-bit partial sums into the 32-bit sums
+      "saddw2 v31.4s, v31.4s, v1.8h\n"  // fold the high four
+      "mov x19, #0x0\n"
+      "movi v1.8h, #0x0\n"
+      "4:"  // no_accumulate_16
+      "ldr d30, [x27], #0x8\n"  // 8 bytes of row 0, pointer post-incremented
+      "prfm pldl1keep, [x27, #0x70]\n"
+      "ldr d29, [x26], #0x8\n"  // row 1
+      "ldr d28, [x25], #0x8\n"  // row 2
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr d27, [x24], #0x8\n"  // row 3
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr d23, [x23], #0x8\n"  // row 4
+      "ldr d21, [x22], #0x8\n"  // row 5
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr d26, [x21], #0x8\n"  // row 6
+      "ldr d25, [x20], #0x8\n"  // row 7
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "sshll v30.8h, v30.8b, #0x0\n"  // shift-by-zero widening: sign-extend int8 to int16
+      "sshll v29.8h, v29.8b, #0x0\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "sshll v28.8h, v28.8b, #0x0\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "sshll v27.8h, v27.8b, #0x0\n"
+      "sshll v23.8h, v23.8b, #0x0\n"
+      "zip1 v24.8h, v30.8h, v23.8h\n"  // three levels of zip transpose the 8x8 tile; this level pairs rows 0/4, 1/5, 2/6, 3/7
+      "sshll v21.8h, v21.8b, #0x0\n"
+      "zip2 v23.8h, v30.8h, v23.8h\n"  // upper halves (columns 4-7) are kept for the second half of the tile
+      "sshll v26.8h, v26.8b, #0x0\n"
+      "sshll v25.8h, v25.8b, #0x0\n"
+      "zip1 v22.8h, v29.8h, v21.8h\n"
+      "add x19, x19, #0x1\n"  // one more iteration accumulated into v1
+      "zip2 v21.8h, v29.8h, v21.8h\n"
+      "subs %x[width], %x[width], #0x8\n"  // 8 columns consumed
+      "zip1 v20.8h, v28.8h, v26.8h\n"
+      "cmp %x[width], #0x8\n"  // these flags feed the bge at the bottom; nothing in between sets flags
+      "zip1 v18.8h, v24.8h, v20.8h\n"
+      "zip1 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v22.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"  // v16 = column 0 for rows 0..7
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v16.8h\n"  // every stored column is also added to the per-row partial sums
+      "zip2 v16.8h, v18.8h, v17.8h\n"  // column 1
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "zip2 v18.8h, v24.8h, v20.8h\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v17.8h, v22.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"  // column 2
+      "str q16, [%x[out_ptr], #0x20]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"  // column 3
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "zip2 v20.8h, v28.8h, v26.8h\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip1 v18.8h, v23.8h, v20.8h\n"
+      "zip2 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"  // column 4
+      "str q16, [%x[out_ptr], #0x40]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"  // column 5
+      "str q16, [%x[out_ptr], #0x50]\n"
+      "zip2 v18.8h, v23.8h, v20.8h\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"  // column 6
+      "str q16, [%x[out_ptr], #0x60]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"  // column 7
+      "str q16, [%x[out_ptr], #0x70]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x80\n"  // 8 columns x 16 bytes written
+      "add v1.8h, v1.8h, v16.8h\n"
+      "bge 3b\n"  // at least 8 columns still remain
+      "5:"  // Main loop skip: handle the remaining columns, if any
+      "cbz %x[width], 10f\n"  // nothing left: go straight to writing the sums
+      "tbz %x[width], #2, 7f\n"  // bit 2 clear: fewer than 4 columns remain
+      "ldr s30, [x27], #0x4\n"  // first 4 bytes of each row; the rest of each register is zeroed
+      "ldr s29, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
+      "ldr s27, [x24], #0x4\n"
+      "ldr s23, [x23], #0x4\n"
+      "ldr s21, [x22], #0x4\n"
+      "ldr s26, [x21], #0x4\n"
+      "ldr s25, [x20], #0x4\n"
+      "tbz %x[width], #1, 6f\n"  // bit 1 clear: 4 or 5 columns
+      "ld1 { v30.h }[2], [x27], #0x2\n"  // bytes 4-5 of each row
+      "ld1 { v29.h }[2], [x26], #0x2\n"
+      "ld1 { v28.h }[2], [x25], #0x2\n"
+      "ld1 { v27.h }[2], [x24], #0x2\n"
+      "ld1 { v23.h }[2], [x23], #0x2\n"
+      "ld1 { v21.h }[2], [x22], #0x2\n"
+      "ld1 { v26.h }[2], [x21], #0x2\n"
+      "ld1 { v25.h }[2], [x20], #0x2\n"
+      "mov x19, #0x6\n"  // from here x19 is the number of remainder columns to store
+      "tbz %x[width], #0, 9f\n"
+      "ld1 { v30.b }[6], [x27]\n"  // byte 6: the seventh column
+      "ld1 { v29.b }[6], [x26]\n"
+      "ld1 { v28.b }[6], [x25]\n"
+      "ld1 { v27.b }[6], [x24]\n"
+      "ld1 { v23.b }[6], [x23]\n"
+      "ld1 { v21.b }[6], [x22]\n"
+      "ld1 { v26.b }[6], [x21]\n"
+      "ld1 { v25.b }[6], [x20]\n"
+      "mov x19, #0x7\n"
+      "b 9f\n"
+      "6:"  // odd_loads_1_4: four columns loaded, optionally a fifth
+      "mov x19, #0x4\n"
+      "tbz %x[width], #0, 9f\n"
+      "ld1 { v30.b }[4], [x27]\n"  // byte 4: the fifth column
+      "ld1 { v29.b }[4], [x26]\n"
+      "ld1 { v28.b }[4], [x25]\n"
+      "ld1 { v27.b }[4], [x24]\n"
+      "ld1 { v23.b }[4], [x23]\n"
+      "ld1 { v21.b }[4], [x22]\n"
+      "ld1 { v26.b }[4], [x21]\n"
+      "ld1 { v25.b }[4], [x20]\n"
+      "mov x19, #0x5\n"
+      "b 9f\n"
+      "7:"  // odd_loads_2_0: fewer than four columns remain
+      "tbz %x[width], #1, 8f\n"  // bit 1 clear: exactly one column
+      "ldr h30, [x27], #0x2\n"  // first 2 bytes of each row
+      "ldr h29, [x26], #0x2\n"
+      "ldr h28, [x25], #0x2\n"
+      "ldr h27, [x24], #0x2\n"
+      "ldr h23, [x23], #0x2\n"
+      "ldr h21, [x22], #0x2\n"
+      "ldr h26, [x21], #0x2\n"
+      "ldr h25, [x20], #0x2\n"
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 9f\n"
+      "ld1 { v30.b }[2], [x27]\n"  // byte 2: the third column
+      "ld1 { v29.b }[2], [x26]\n"
+      "ld1 { v28.b }[2], [x25]\n"
+      "ld1 { v27.b }[2], [x24]\n"
+      "ld1 { v23.b }[2], [x23]\n"
+      "ld1 { v21.b }[2], [x22]\n"
+      "ld1 { v26.b }[2], [x21]\n"
+      "ld1 { v25.b }[2], [x20]\n"
+      "mov x19, #0x3\n"
+      "b 9f\n"
+      "8:"  // odd_loads_1_0: width is non-zero with bits 2 and 1 clear, so a single column is loaded
+      "ldr b30, [x27, #0x0]\n"
+      "ldr b29, [x26, #0x0]\n"
+      "ldr b28, [x25, #0x0]\n"
+      "ldr b27, [x24, #0x0]\n"
+      "ldr b23, [x23, #0x0]\n"
+      "ldr b21, [x22, #0x0]\n"
+      "ldr b26, [x21, #0x0]\n"
+      "ldr b25, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "9:"  // Odd load end: widen and transpose as in the main loop, but store only x19 columns
+      "sshll v30.8h, v30.8b, #0x0\n"
+      "sshll v29.8h, v29.8b, #0x0\n"
+      "sshll v28.8h, v28.8b, #0x0\n"
+      "sshll v27.8h, v27.8b, #0x0\n"
+      "sshll v23.8h, v23.8b, #0x0\n"
+      "zip1 v24.8h, v30.8h, v23.8h\n"
+      "sshll v21.8h, v21.8b, #0x0\n"
+      "sshll v26.8h, v26.8b, #0x0\n"
+      "zip1 v20.8h, v28.8h, v26.8h\n"
+      "sshll v25.8h, v25.8b, #0x0\n"
+      "zip1 v22.8h, v29.8h, v21.8h\n"
+      "subs x19, x19, #0x1\n"  // flags survive the zips/str/adds below and are tested by the next beq
+      "zip1 v18.8h, v24.8h, v20.8h\n"
+      "zip1 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v22.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"  // remainder column 0
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"  // that was the last remainder column
+      "zip2 v16.8h, v18.8h, v17.8h\n"  // remainder column 1
+      "subs x19, x19, #0x1\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v18.8h, v24.8h, v20.8h\n"
+      "zip2 v17.8h, v22.8h, v19.8h\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"  // remainder column 2
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"  // remainder column 3
+      "subs x19, x19, #0x1\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v23.8h, v30.8h, v23.8h\n"
+      "zip2 v20.8h, v28.8h, v26.8h\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v18.8h, v23.8h, v20.8h\n"
+      "zip2 v21.8h, v29.8h, v21.8h\n"
+      "zip2 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"  // remainder column 4
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"  // remainder column 5
+      "subs x19, x19, #0x1\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v18.8h, v23.8h, v20.8h\n"
+      "zip2 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"  // remainder column 6; x19 was at most 7, so no further test follows
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "10:"  // Odds skip: fold the outstanding partial sums and append the sums to the output
+      "saddw v0.4s, v0.4s, v1.4h\n"
+      "str q0, [%x[out_ptr], #0x0]\n"  // sums for rows 0-3
+      "saddw2 v31.4s, v31.4s, v1.8h\n"
+      "str q31, [%x[out_ptr], #0x10]\n"  // sums for rows 4-7
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"  // out_ptr ends just past the 0x20 bytes of sums
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width)  // both are read and modified by the asm
+      : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)  // read-only inputs
+      : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"  // flags, memory and every scratch register used above
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp
new file mode 100644
index 0000000000..bfa8989a4d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<>  // Specialisation: stores 8 rows of uint16 one column (8 rows) per 16-byte vector, without widening, then appends 8 per-row 32-bit sums.
+void interleave_block<8, 1, VLType::None, true>(
+  uint16_t * &out_ptr, const uint16_t * const * in, size_t width, size_t height,  // out_ptr: advanced past everything written; in: eight row-pointer slots are read; width: elements per row; height: rows actually present
+  size_t row_offset, bool first  // row_offset: shifted left by 1 before being added to each row pointer; first: non-zero starts the sums from zero, zero reloads them from just below out_ptr
+)
+{
+  __asm__ __volatile__(
+      "movi v1.8h, #0x0\n"  // v1: 16-bit partial sums, lane i belongs to row i. NOTE(review): the inputs are themselves 16-bit, so the add v1.8h instructions below can wrap before v1 is widened - confirm the input range makes this safe
+      "ldr x27, [%x[in], #0x0]\n"  // x27..x20 receive the pointers for rows 0..7; NOTE(review): all eight slots are read before height is examined - confirm callers always provide eight entries
+      "mov x19, #0x0\n"  // x19: main-loop iterations since v1 was last widened
+      "movi v0.4s, #0x0\n"  // v0: 32-bit sums for rows 0-3
+      "ldr x26, [%x[in], #0x8]\n"
+      "cmp %x[height], #0x8\n"  // flags are consumed by the beq below; the ldr/add/movi in between do not set flags
+      "movi v31.4s, #0x0\n"  // v31: 32-bit sums for rows 4-7
+      "ldr x25, [%x[in], #0x10]\n"
+      "add x27, x27, %x[row_offset], LSL #1\n"  // offset is doubled: two bytes per uint16 element
+      "ldr x24, [%x[in], #0x18]\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x26, x26, %x[row_offset], LSL #1\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "add x25, x25, %x[row_offset], LSL #1\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset], LSL #1\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset], LSL #1\n"
+      "add x22, x22, %x[row_offset], LSL #1\n"
+      "add x21, x21, %x[row_offset], LSL #1\n"
+      "add x20, x20, %x[row_offset], LSL #1\n"
+      "beq 1f\n"  // height == 8: use the eight pointers as loaded
+      "mov x20, x27\n"  // height != 8: row 7 reads through row 0's pointer
+      "cmp %x[height], #0x2\n"
+      "csel x26, x26, x27, GE\n"  // row 1 keeps its pointer only if height >= 2, otherwise it aliases row 0
+      "csel x25, x25, x27, GT\n"  // row 2 keeps its pointer only if height > 2
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n"  // row 3: height >= 4
+      "csel x23, x23, x27, GT\n"  // row 4: height > 4
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n"  // row 5: height >= 6
+      "csel x21, x21, x27, GT\n"  // row 6: height > 6
+      "1:"  // no_pointer_adj: x27..x20 are the eight addresses the loads below will use
+      "prfm pldl1keep, [x27, #0x0]\n"  // prefetch hints for the start of every row
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"  // and for 0x40 bytes further into every row
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "cbnz %w[first], 2f\n"  // first != 0: keep the zeroed sums. NOTE(review): tests the low 32 bits of a register bound to a bool - confirm the upper bits are guaranteed zero
+      "sub %x[out_ptr], %x[out_ptr], #0x20\n"  // step back over 0x20 bytes - presumably the sums stored at the end of a previous call; confirm against caller
+      "ld1 { v0.4s }, [%x[out_ptr]]\n"  // reload sums for rows 0-3
+      "ldr q31, [%x[out_ptr], #0x10]\n"  // reload sums for rows 4-7
+      "2:"  // first_pass: v0/v31 now hold the starting sums
+      "cmp %x[width], #0x8\n"
+      "blt 5f\n"  // fewer than 8 columns: only the remainder path runs
+      "3:"  // Main loop head: one iteration consumes 8 columns from each of the 8 rows
+      "cmp x19, #0xe\n"
+      "ble 4f\n"  // at most 14 iterations accumulated in v1: no need to widen yet
+      "uaddw v0.4s, v0.4s, v1.4h\n"  // fold the low four 16-bit partial sums into the 32-bit sums (zero-extending)
+      "uaddw2 v31.4s, v31.4s, v1.8h\n"  // fold the high four
+      "mov x19, #0x0\n"
+      "movi v1.8h, #0x0\n"
+      "4:"  // no_accumulate_16
+      "ldr q30, [x27], #0x10\n"  // 8 uint16 elements of row 0, pointer post-incremented
+      "prfm pldl1keep, [x27, #0x70]\n"
+      "ldr q29, [x26], #0x10\n"  // row 1
+      "ldr q28, [x25], #0x10\n"  // row 2
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr q27, [x24], #0x10\n"  // row 3
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr q24, [x23], #0x10\n"  // row 4
+      "zip1 v26.8h, v30.8h, v24.8h\n"  // three levels of zip transpose the 8x8 tile; this level pairs rows 0/4, 1/5, 2/6, 3/7
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr q25, [x22], #0x10\n"  // row 5
+      "zip2 v24.8h, v30.8h, v24.8h\n"  // upper halves (columns 4-7) are kept for the second half of the tile
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "ldr q23, [x21], #0x10\n"  // row 6
+      "zip1 v21.8h, v29.8h, v25.8h\n"
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "ldr q22, [x20], #0x10\n"  // row 7
+      "zip1 v18.8h, v28.8h, v23.8h\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "add x19, x19, #0x1\n"  // one more iteration accumulated into v1
+      "zip1 v20.8h, v26.8h, v18.8h\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "zip1 v19.8h, v27.8h, v22.8h\n"
+      "subs %x[width], %x[width], #0x8\n"  // 8 columns consumed
+      "zip1 v17.8h, v21.8h, v19.8h\n"
+      "cmp %x[width], #0x8\n"  // these flags feed the bge at the bottom; nothing in between sets flags
+      "zip2 v18.8h, v26.8h, v18.8h\n"
+      "zip1 v16.8h, v20.8h, v17.8h\n"  // v16 = column 0 for rows 0..7
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v16.8h\n"  // every stored column is also added to the per-row partial sums (16-bit, wrapping add)
+      "zip2 v17.8h, v20.8h, v17.8h\n"  // column 1
+      "str q17, [%x[out_ptr], #0x10]\n"
+      "zip2 v16.8h, v21.8h, v19.8h\n"
+      "add v1.8h, v1.8h, v17.8h\n"
+      "zip1 v17.8h, v18.8h, v16.8h\n"  // column 2
+      "str q17, [%x[out_ptr], #0x20]\n"
+      "zip2 v16.8h, v18.8h, v16.8h\n"  // column 3
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "add v1.8h, v1.8h, v17.8h\n"
+      "zip2 v21.8h, v28.8h, v23.8h\n"
+      "zip1 v18.8h, v24.8h, v21.8h\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v20.8h, v29.8h, v25.8h\n"
+      "zip2 v19.8h, v27.8h, v22.8h\n"
+      "zip1 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"  // column 4
+      "str q16, [%x[out_ptr], #0x40]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"  // column 5
+      "str q16, [%x[out_ptr], #0x50]\n"
+      "zip2 v18.8h, v24.8h, v21.8h\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"  // column 6
+      "str q16, [%x[out_ptr], #0x60]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"  // column 7
+      "str q16, [%x[out_ptr], #0x70]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x80\n"  // 8 columns x 16 bytes written
+      "add v1.8h, v1.8h, v16.8h\n"
+      "bge 3b\n"  // at least 8 columns still remain
+      "5:"  // Main loop skip: handle the remaining columns, if any
+      "cbz %x[width], 10f\n"  // nothing left: go straight to writing the sums
+      "tbz %x[width], #2, 7f\n"  // bit 2 clear: fewer than 4 columns remain
+      "ldr d30, [x27], #0x8\n"  // first 4 elements of each row; the rest of each register is zeroed
+      "ldr d29, [x26], #0x8\n"
+      "ldr d28, [x25], #0x8\n"
+      "ldr d27, [x24], #0x8\n"
+      "ldr d24, [x23], #0x8\n"
+      "ldr d25, [x22], #0x8\n"
+      "ldr d23, [x21], #0x8\n"
+      "ldr d22, [x20], #0x8\n"
+      "tbz %x[width], #1, 6f\n"  // bit 1 clear: 4 or 5 columns
+      "ld1 { v30.s }[2], [x27], #0x4\n"  // elements 4-5 of each row
+      "ld1 { v29.s }[2], [x26], #0x4\n"
+      "ld1 { v28.s }[2], [x25], #0x4\n"
+      "ld1 { v27.s }[2], [x24], #0x4\n"
+      "ld1 { v24.s }[2], [x23], #0x4\n"
+      "ld1 { v25.s }[2], [x22], #0x4\n"
+      "ld1 { v23.s }[2], [x21], #0x4\n"
+      "ld1 { v22.s }[2], [x20], #0x4\n"
+      "mov x19, #0x6\n"  // from here x19 is the number of remainder columns to store
+      "tbz %x[width], #0, 9f\n"
+      "ld1 { v30.h }[6], [x27]\n"  // element 6: the seventh column
+      "ld1 { v29.h }[6], [x26]\n"
+      "ld1 { v28.h }[6], [x25]\n"
+      "ld1 { v27.h }[6], [x24]\n"
+      "ld1 { v24.h }[6], [x23]\n"
+      "ld1 { v25.h }[6], [x22]\n"
+      "ld1 { v23.h }[6], [x21]\n"
+      "ld1 { v22.h }[6], [x20]\n"
+      "mov x19, #0x7\n"
+      "b 9f\n"
+      "6:"  // odd_loads_1_4: four columns loaded, optionally a fifth
+      "mov x19, #0x4\n"
+      "tbz %x[width], #0, 9f\n"
+      "ld1 { v30.h }[4], [x27]\n"  // element 4: the fifth column
+      "ld1 { v29.h }[4], [x26]\n"
+      "ld1 { v28.h }[4], [x25]\n"
+      "ld1 { v27.h }[4], [x24]\n"
+      "ld1 { v24.h }[4], [x23]\n"
+      "ld1 { v25.h }[4], [x22]\n"
+      "ld1 { v23.h }[4], [x21]\n"
+      "ld1 { v22.h }[4], [x20]\n"
+      "mov x19, #0x5\n"
+      "b 9f\n"
+      "7:"  // odd_loads_2_0: fewer than four columns remain
+      "tbz %x[width], #1, 8f\n"  // bit 1 clear: exactly one column
+      "ldr s30, [x27], #0x4\n"  // first 2 elements of each row
+      "ldr s29, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
+      "ldr s27, [x24], #0x4\n"
+      "ldr s24, [x23], #0x4\n"
+      "ldr s25, [x22], #0x4\n"
+      "ldr s23, [x21], #0x4\n"
+      "ldr s22, [x20], #0x4\n"
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 9f\n"
+      "ld1 { v30.h }[2], [x27]\n"  // element 2: the third column
+      "ld1 { v29.h }[2], [x26]\n"
+      "ld1 { v28.h }[2], [x25]\n"
+      "ld1 { v27.h }[2], [x24]\n"
+      "ld1 { v24.h }[2], [x23]\n"
+      "ld1 { v25.h }[2], [x22]\n"
+      "ld1 { v23.h }[2], [x21]\n"
+      "ld1 { v22.h }[2], [x20]\n"
+      "mov x19, #0x3\n"
+      "b 9f\n"
+      "8:"  // odd_loads_1_0: width is non-zero with bits 2 and 1 clear, so a single column is loaded
+      "ldr h30, [x27, #0x0]\n"
+      "ldr h29, [x26, #0x0]\n"
+      "ldr h28, [x25, #0x0]\n"
+      "ldr h27, [x24, #0x0]\n"
+      "ldr h24, [x23, #0x0]\n"
+      "ldr h25, [x22, #0x0]\n"
+      "ldr h23, [x21, #0x0]\n"
+      "ldr h22, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "9:"  // Odd load end: transpose as in the main loop, but store only x19 columns
+      "zip1 v26.8h, v30.8h, v24.8h\n"
+      "subs x19, x19, #0x1\n"  // flags survive the zips/str/adds below and are tested by the next beq
+      "zip1 v18.8h, v28.8h, v23.8h\n"
+      "zip1 v20.8h, v26.8h, v18.8h\n"
+      "zip1 v21.8h, v29.8h, v25.8h\n"
+      "zip1 v19.8h, v27.8h, v22.8h\n"
+      "zip1 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v20.8h, v17.8h\n"  // remainder column 0
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"  // that was the last remainder column
+      "zip2 v17.8h, v20.8h, v17.8h\n"  // remainder column 1
+      "subs x19, x19, #0x1\n"
+      "add v1.8h, v1.8h, v17.8h\n"
+      "str q17, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v18.8h, v26.8h, v18.8h\n"
+      "zip2 v16.8h, v21.8h, v19.8h\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v17.8h, v18.8h, v16.8h\n"  // remainder column 2
+      "str q17, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v17.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v16.8h, v18.8h, v16.8h\n"  // remainder column 3
+      "subs x19, x19, #0x1\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v24.8h, v30.8h, v24.8h\n"
+      "zip2 v21.8h, v28.8h, v23.8h\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v18.8h, v24.8h, v21.8h\n"
+      "zip2 v20.8h, v29.8h, v25.8h\n"
+      "zip2 v19.8h, v27.8h, v22.8h\n"
+      "zip1 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"  // remainder column 4
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"  // remainder column 5
+      "subs x19, x19, #0x1\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v18.8h, v24.8h, v21.8h\n"
+      "zip2 v17.8h, v20.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"  // remainder column 6; x19 was at most 7, so no further test follows
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "10:"  // Odds skip: fold the outstanding partial sums and append the sums to the output
+      "uaddw v0.4s, v0.4s, v1.4h\n"
+      "str q0, [%x[out_ptr], #0x0]\n"  // sums for rows 0-3
+      "uaddw2 v31.4s, v31.4s, v1.8h\n"
+      "str q31, [%x[out_ptr], #0x10]\n"  // sums for rows 4-7
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"  // out_ptr ends just past the 0x20 bytes of sums
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width)  // both are read and modified by the asm
+      : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)  // read-only inputs
+      : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"  // flags, memory and every scratch register used above
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp
new file mode 100644
index 0000000000..86b90f1898
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<>  // Specialisation: zero-extends 8 rows of uint8 to uint16 and stores them one column (8 rows) per 16-byte vector; no row sums are produced.
+void interleave_block<8, 1, VLType::None, false>(
+  uint16_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height,  // out_ptr: advanced past everything written; in: eight row-pointer slots are read; width: elements per row; height: rows actually present
+  size_t row_offset, bool  // row_offset: added (in bytes) to every row pointer; the trailing bool is unnamed and unused here
+)
+{
+  __asm__ __volatile__(
+      "ldr x27, [%x[in], #0x0]\n"  // x27..x20 receive the pointers for rows 0..7; NOTE(review): all eight slots are read before height is examined - confirm callers always provide eight entries
+      "cmp %x[height], #0x8\n"  // flags are consumed by the beq below; the ldr/add in between do not set flags
+      "ldr x26, [%x[in], #0x8]\n"
+      "add x27, x27, %x[row_offset]\n"  // one byte per element, so the offset is applied unscaled
+      "ldr x25, [%x[in], #0x10]\n"
+      "ldr x24, [%x[in], #0x18]\n"
+      "add x26, x26, %x[row_offset]\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x25, x25, %x[row_offset]\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset]\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset]\n"
+      "add x22, x22, %x[row_offset]\n"
+      "add x21, x21, %x[row_offset]\n"
+      "add x20, x20, %x[row_offset]\n"
+      "beq 1f\n"  // height == 8: use the eight pointers as loaded
+      "mov x20, x27\n"  // height != 8: row 7 reads through row 0's pointer
+      "cmp %x[height], #0x2\n"
+      "csel x26, x26, x27, GE\n"  // row 1 keeps its pointer only if height >= 2, otherwise it aliases row 0
+      "csel x25, x25, x27, GT\n"  // row 2 keeps its pointer only if height > 2
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n"  // row 3: height >= 4
+      "csel x23, x23, x27, GT\n"  // row 4: height > 4
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n"  // row 5: height >= 6
+      "csel x21, x21, x27, GT\n"  // row 6: height > 6
+      "1:"  // no_pointer_adj: x27..x20 are the eight addresses the loads below will use
+      "prfm pldl1keep, [x27, #0x0]\n"  // prefetch hints for the start of every row
+      "cmp %x[width], #0x8\n"  // flags are consumed by the blt after the prefetches, which leave flags untouched
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"  // and for 0x40 bytes further into every row
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "blt 3f\n"  // fewer than 8 columns: only the remainder path runs
+      "2:"  // Main loop head: one iteration consumes 8 columns from each of the 8 rows
+      "ldr d30, [x27], #0x8\n"  // 8 bytes of row 0, pointer post-incremented
+      "prfm pldl1keep, [x27, #0x70]\n"
+      "ldr d29, [x26], #0x8\n"  // row 1
+      "ldr d28, [x25], #0x8\n"  // row 2
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr d27, [x24], #0x8\n"  // row 3
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr d23, [x23], #0x8\n"  // row 4
+      "ldr d21, [x22], #0x8\n"  // row 5
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr d26, [x21], #0x8\n"  // row 6
+      "ldr d25, [x20], #0x8\n"  // row 7
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "ushll v30.8h, v30.8b, #0x0\n"  // shift-by-zero widening: zero-extend uint8 to uint16
+      "ushll v29.8h, v29.8b, #0x0\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "ushll v28.8h, v28.8b, #0x0\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "ushll v27.8h, v27.8b, #0x0\n"
+      "ushll v23.8h, v23.8b, #0x0\n"
+      "zip1 v24.8h, v30.8h, v23.8h\n"  // three levels of zip transpose the 8x8 tile; this level pairs rows 0/4, 1/5, 2/6, 3/7
+      "ushll v21.8h, v21.8b, #0x0\n"
+      "zip2 v23.8h, v30.8h, v23.8h\n"  // upper halves (columns 4-7) are kept for the second half of the tile
+      "ushll v26.8h, v26.8b, #0x0\n"
+      "ushll v25.8h, v25.8b, #0x0\n"
+      "zip1 v22.8h, v29.8h, v21.8h\n"
+      "subs %x[width], %x[width], #0x8\n"  // 8 columns consumed
+      "zip2 v21.8h, v29.8h, v21.8h\n"
+      "cmp %x[width], #0x8\n"  // these flags feed the bge at the bottom; nothing in between sets flags
+      "zip1 v20.8h, v28.8h, v26.8h\n"
+      "zip1 v18.8h, v24.8h, v20.8h\n"
+      "zip1 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v22.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"  // v16 = column 0 for rows 0..7
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"  // column 1
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "zip2 v18.8h, v24.8h, v20.8h\n"
+      "zip2 v17.8h, v22.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"  // column 2
+      "str q16, [%x[out_ptr], #0x20]\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"  // column 3
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "zip2 v20.8h, v28.8h, v26.8h\n"
+      "zip1 v18.8h, v23.8h, v20.8h\n"
+      "zip2 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"  // column 4
+      "str q16, [%x[out_ptr], #0x40]\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"  // column 5
+      "str q16, [%x[out_ptr], #0x50]\n"
+      "zip2 v18.8h, v23.8h, v20.8h\n"
+      "zip2 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"  // column 6
+      "str q16, [%x[out_ptr], #0x60]\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"  // column 7
+      "str q16, [%x[out_ptr], #0x70]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x80\n"  // 8 columns x 16 bytes written
+      "bge 2b\n"  // at least 8 columns still remain
+      "3:"  // Main loop skip: handle the remaining columns, if any
+      "cbz %x[width], 8f\n"  // nothing left: done
+      "tbz %x[width], #2, 5f\n"  // bit 2 clear: fewer than 4 columns remain
+      "ldr s30, [x27], #0x4\n"  // first 4 bytes of each row; the rest of each register is zeroed
+      "ldr s29, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
+      "ldr s27, [x24], #0x4\n"
+      "ldr s23, [x23], #0x4\n"
+      "ldr s21, [x22], #0x4\n"
+      "ldr s26, [x21], #0x4\n"
+      "ldr s25, [x20], #0x4\n"
+      "tbz %x[width], #1, 4f\n"  // bit 1 clear: 4 or 5 columns
+      "ld1 { v30.h }[2], [x27], #0x2\n"  // bytes 4-5 of each row
+      "ld1 { v29.h }[2], [x26], #0x2\n"
+      "ld1 { v28.h }[2], [x25], #0x2\n"
+      "ld1 { v27.h }[2], [x24], #0x2\n"
+      "ld1 { v23.h }[2], [x23], #0x2\n"
+      "ld1 { v21.h }[2], [x22], #0x2\n"
+      "ld1 { v26.h }[2], [x21], #0x2\n"
+      "ld1 { v25.h }[2], [x20], #0x2\n"
+      "mov x19, #0x6\n"  // x19 is the number of remainder columns to store
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v30.b }[6], [x27]\n"  // byte 6: the seventh column
+      "ld1 { v29.b }[6], [x26]\n"
+      "ld1 { v28.b }[6], [x25]\n"
+      "ld1 { v27.b }[6], [x24]\n"
+      "ld1 { v23.b }[6], [x23]\n"
+      "ld1 { v21.b }[6], [x22]\n"
+      "ld1 { v26.b }[6], [x21]\n"
+      "ld1 { v25.b }[6], [x20]\n"
+      "mov x19, #0x7\n"
+      "b 7f\n"
+      "4:"  // odd_loads_1_4: four columns loaded, optionally a fifth
+      "mov x19, #0x4\n"
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v30.b }[4], [x27]\n"  // byte 4: the fifth column
+      "ld1 { v29.b }[4], [x26]\n"
+      "ld1 { v28.b }[4], [x25]\n"
+      "ld1 { v27.b }[4], [x24]\n"
+      "ld1 { v23.b }[4], [x23]\n"
+      "ld1 { v21.b }[4], [x22]\n"
+      "ld1 { v26.b }[4], [x21]\n"
+      "ld1 { v25.b }[4], [x20]\n"
+      "mov x19, #0x5\n"
+      "b 7f\n"
+      "5:"  // odd_loads_2_0: fewer than four columns remain
+      "tbz %x[width], #1, 6f\n"  // bit 1 clear: exactly one column
+      "ldr h30, [x27], #0x2\n"  // first 2 bytes of each row
+      "ldr h29, [x26], #0x2\n"
+      "ldr h28, [x25], #0x2\n"
+      "ldr h27, [x24], #0x2\n"
+      "ldr h23, [x23], #0x2\n"
+      "ldr h21, [x22], #0x2\n"
+      "ldr h26, [x21], #0x2\n"
+      "ldr h25, [x20], #0x2\n"
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v30.b }[2], [x27]\n"  // byte 2: the third column
+      "ld1 { v29.b }[2], [x26]\n"
+      "ld1 { v28.b }[2], [x25]\n"
+      "ld1 { v27.b }[2], [x24]\n"
+      "ld1 { v23.b }[2], [x23]\n"
+      "ld1 { v21.b }[2], [x22]\n"
+      "ld1 { v26.b }[2], [x21]\n"
+      "ld1 { v25.b }[2], [x20]\n"
+      "mov x19, #0x3\n"
+      "b 7f\n"
+      "6:"  // odd_loads_1_0: width is non-zero with bits 2 and 1 clear, so a single column is loaded
+      "ldr b30, [x27, #0x0]\n"
+      "ldr b29, [x26, #0x0]\n"
+      "ldr b28, [x25, #0x0]\n"
+      "ldr b27, [x24, #0x0]\n"
+      "ldr b23, [x23, #0x0]\n"
+      "ldr b21, [x22, #0x0]\n"
+      "ldr b26, [x21, #0x0]\n"
+      "ldr b25, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "7:"  // Odd load end: widen and transpose as in the main loop, but store only x19 columns
+      "ushll v30.8h, v30.8b, #0x0\n"
+      "ushll v29.8h, v29.8b, #0x0\n"
+      "ushll v28.8h, v28.8b, #0x0\n"
+      "ushll v27.8h, v27.8b, #0x0\n"
+      "ushll v23.8h, v23.8b, #0x0\n"
+      "zip1 v24.8h, v30.8h, v23.8h\n"
+      "ushll v21.8h, v21.8b, #0x0\n"
+      "ushll v26.8h, v26.8b, #0x0\n"
+      "zip1 v20.8h, v28.8h, v26.8h\n"
+      "ushll v25.8h, v25.8b, #0x0\n"
+      "zip1 v22.8h, v29.8h, v21.8h\n"
+      "subs x19, x19, #0x1\n"  // flags survive the zips/str/add below and are tested by the next beq
+      "zip1 v18.8h, v24.8h, v20.8h\n"
+      "zip1 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v22.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"  // remainder column 0
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"  // that was the last remainder column
+      "zip2 v16.8h, v18.8h, v17.8h\n"  // remainder column 1
+      "subs x19, x19, #0x1\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v18.8h, v24.8h, v20.8h\n"
+      "zip2 v17.8h, v22.8h, v19.8h\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"  // remainder column 2
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"  // remainder column 3
+      "subs x19, x19, #0x1\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v23.8h, v30.8h, v23.8h\n"
+      "zip2 v20.8h, v28.8h, v26.8h\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v18.8h, v23.8h, v20.8h\n"
+      "zip2 v21.8h, v29.8h, v21.8h\n"
+      "zip2 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"  // remainder column 4
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"  // remainder column 5
+      "subs x19, x19, #0x1\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 8f\n"
+      "zip2 v18.8h, v23.8h, v20.8h\n"
+      "zip2 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"  // remainder column 6; x19 was at most 7, so no further test follows
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "8:"  // Odds skip: out_ptr now points just past the last stored column
+
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width)  // both are read and modified by the asm
+      : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)  // read-only inputs
+      : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"  // flags, memory and every scratch register used above
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp
new file mode 100644
index 0000000000..cefb70c57b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<>  // 8-row, block-1 interleave: widens uint8 input to uint16, writes it column-major, and appends eight 32-bit per-row sums.
+void interleave_block<8, 1, VLType::None, true>(
+  uint16_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height,  // out_ptr: advanced past everything written; in: array of row pointers; width: elements consumed per row; height: compared against 8 below
+  size_t row_offset, bool first  // row_offset: added (unscaled, 1-byte elements) to every row pointer; first: nonzero => sums start from zero
+)
+{
+  __asm__ __volatile__(
+      "movi v1.8h, #0x0\n"  // v1: eight 16-bit partial sums, one lane per row
+      "ldr x27, [%x[in], #0x0]\n"  // x27..x20 = in[0]..in[7]; NOTE(review): all 8 entries are read regardless of height - confirm the array always has 8 readable slots
+      "mov x19, #0x0\n"  // x19: main-loop iterations since v1 was last folded into v0/v31
+      "movi v0.4s, #0x0\n"  // v0: 32-bit sums, lanes 0-3
+      "ldr x26, [%x[in], #0x8]\n"
+      "cmp %x[height], #0x8\n"  // flags consumed by the "beq 1f" below
+      "movi v31.4s, #0x0\n"  // v31: 32-bit sums, lanes 4-7
+      "ldr x25, [%x[in], #0x10]\n"
+      "add x27, x27, %x[row_offset]\n"
+      "ldr x24, [%x[in], #0x18]\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x26, x26, %x[row_offset]\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "add x25, x25, %x[row_offset]\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset]\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset]\n"
+      "add x22, x22, %x[row_offset]\n"
+      "add x21, x21, %x[row_offset]\n"
+      "add x20, x20, %x[row_offset]\n"
+      "beq 1f\n"  // height == 8: keep all eight pointers as loaded
+      "mov x20, x27\n"  // height != 8: pointer 7 becomes a copy of pointer 0
+      "cmp %x[height], #0x2\n"
+      "csel x26, x26, x27, GE\n"  // pointer 1 kept only if height >= 2, otherwise replaced by pointer 0
+      "csel x25, x25, x27, GT\n"  // pointer 2 kept only if height > 2
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n"  // pointer 3 kept only if height >= 4
+      "csel x23, x23, x27, GT\n"  // pointer 4 kept only if height > 4
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n"  // pointer 5 kept only if height >= 6
+      "csel x21, x21, x27, GT\n"  // pointer 6 kept only if height > 6
+      "1:"  // no_pointer_adj
+      "prfm pldl1keep, [x27, #0x0]\n"  // prefetch offsets 0x0 and 0x40 of every row
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "cbnz %w[first], 2f\n"  // first != 0: keep the zeroed sums. NOTE(review): tests 32 bits of a bool operand - confirm bits 8-31 are guaranteed zero
+      "sub %x[out_ptr], %x[out_ptr], #0x20\n"  // otherwise step back 32 bytes and reload the sums stored there (presumably by the previous call; they are overwritten by the data written next)
+      "ld1 { v0.4s }, [%x[out_ptr]]\n"
+      "ldr q31, [%x[out_ptr], #0x10]\n"
+      "2:"  // first_pass
+      "cmp %x[width], #0x8\n"
+      "blt 5f\n"  // fewer than 8 columns: go straight to the tail handling
+      "3:"  // Main loop head
+      "cmp x19, #0xe\n"
+      "ble 4f\n"  // at most 14 iterations since the last fold: keep accumulating in v1
+      "uaddw v0.4s, v0.4s, v1.4h\n"  // fold the 16-bit partial sums into the 32-bit sums (presumably to keep the 16-bit lanes from overflowing), then restart v1 and x19
+      "uaddw2 v31.4s, v31.4s, v1.8h\n"
+      "mov x19, #0x0\n"
+      "movi v1.8h, #0x0\n"
+      "4:"  // no_accumulate_16
+      "ldr d30, [x27], #0x8\n"  // 8 bytes from each row, post-incrementing: v30,v29,v28,v27,v23,v21,v26,v25 hold rows 0..7
+      "prfm pldl1keep, [x27, #0x70]\n"
+      "ldr d29, [x26], #0x8\n"
+      "ldr d28, [x25], #0x8\n"
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr d27, [x24], #0x8\n"
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr d23, [x23], #0x8\n"
+      "ldr d21, [x22], #0x8\n"
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr d26, [x21], #0x8\n"
+      "ldr d25, [x20], #0x8\n"
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "ushll v30.8h, v30.8b, #0x0\n"  // zero-extend each byte to 16 bits (shift amount 0)
+      "ushll v29.8h, v29.8b, #0x0\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "ushll v28.8h, v28.8b, #0x0\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "ushll v27.8h, v27.8b, #0x0\n"
+      "ushll v23.8h, v23.8b, #0x0\n"
+      "zip1 v24.8h, v30.8h, v23.8h\n"  // three zip levels (rows 0/4, 1/5, 2/6, 3/7, then pairs of those) transpose the 8x8 tile
+      "ushll v21.8h, v21.8b, #0x0\n"
+      "zip2 v23.8h, v30.8h, v23.8h\n"
+      "ushll v26.8h, v26.8b, #0x0\n"
+      "ushll v25.8h, v25.8b, #0x0\n"
+      "zip1 v22.8h, v29.8h, v21.8h\n"
+      "add x19, x19, #0x1\n"
+      "zip2 v21.8h, v29.8h, v21.8h\n"
+      "subs %x[width], %x[width], #0x8\n"
+      "zip1 v20.8h, v28.8h, v26.8h\n"
+      "cmp %x[width], #0x8\n"  // flags for the "bge 3b" at the bottom; nothing in between writes the flags
+      "zip1 v18.8h, v24.8h, v20.8h\n"
+      "zip1 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v22.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"  // v16 = column 0 of rows 0..7
+      "str q16, [%x[out_ptr], #0x0]\n"  // each 16-byte store is one column across the 8 rows
+      "add v1.8h, v1.8h, v16.8h\n"  // lane i of v1 accumulates row i
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "zip2 v18.8h, v24.8h, v20.8h\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v17.8h, v22.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x20]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "zip2 v20.8h, v28.8h, v26.8h\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip1 v18.8h, v23.8h, v20.8h\n"
+      "zip2 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x40]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x50]\n"
+      "zip2 v18.8h, v23.8h, v20.8h\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x60]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x70]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x80\n"  // 8 columns x 16 bytes written this iteration
+      "add v1.8h, v1.8h, v16.8h\n"
+      "bge 3b\n"  // loop while at least 8 columns remain
+      "5:"  // Main loop skip
+      "cbz %x[width], 10f\n"  // nothing left over: only the sums remain to be written
+      "tbz %x[width], #2, 7f\n"  // leftover columns are loaded in 4/2/1-byte pieces selected by bits 2/1/0 of width
+      "ldr s30, [x27], #0x4\n"  // scalar loads zero the rest of the vector register
+      "ldr s29, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
+      "ldr s27, [x24], #0x4\n"
+      "ldr s23, [x23], #0x4\n"
+      "ldr s21, [x22], #0x4\n"
+      "ldr s26, [x21], #0x4\n"
+      "ldr s25, [x20], #0x4\n"
+      "tbz %x[width], #1, 6f\n"
+      "ld1 { v30.h }[2], [x27], #0x2\n"
+      "ld1 { v29.h }[2], [x26], #0x2\n"
+      "ld1 { v28.h }[2], [x25], #0x2\n"
+      "ld1 { v27.h }[2], [x24], #0x2\n"
+      "ld1 { v23.h }[2], [x23], #0x2\n"
+      "ld1 { v21.h }[2], [x22], #0x2\n"
+      "ld1 { v26.h }[2], [x21], #0x2\n"
+      "ld1 { v25.h }[2], [x20], #0x2\n"
+      "mov x19, #0x6\n"  // from here on x19 = number of leftover columns to store
+      "tbz %x[width], #0, 9f\n"
+      "ld1 { v30.b }[6], [x27]\n"
+      "ld1 { v29.b }[6], [x26]\n"
+      "ld1 { v28.b }[6], [x25]\n"
+      "ld1 { v27.b }[6], [x24]\n"
+      "ld1 { v23.b }[6], [x23]\n"
+      "ld1 { v21.b }[6], [x22]\n"
+      "ld1 { v26.b }[6], [x21]\n"
+      "ld1 { v25.b }[6], [x20]\n"
+      "mov x19, #0x7\n"
+      "b 9f\n"
+      "6:"  // odd_loads_1_4
+      "mov x19, #0x4\n"
+      "tbz %x[width], #0, 9f\n"
+      "ld1 { v30.b }[4], [x27]\n"
+      "ld1 { v29.b }[4], [x26]\n"
+      "ld1 { v28.b }[4], [x25]\n"
+      "ld1 { v27.b }[4], [x24]\n"
+      "ld1 { v23.b }[4], [x23]\n"
+      "ld1 { v21.b }[4], [x22]\n"
+      "ld1 { v26.b }[4], [x21]\n"
+      "ld1 { v25.b }[4], [x20]\n"
+      "mov x19, #0x5\n"
+      "b 9f\n"
+      "7:"  // odd_loads_2_0
+      "tbz %x[width], #1, 8f\n"
+      "ldr h30, [x27], #0x2\n"
+      "ldr h29, [x26], #0x2\n"
+      "ldr h28, [x25], #0x2\n"
+      "ldr h27, [x24], #0x2\n"
+      "ldr h23, [x23], #0x2\n"
+      "ldr h21, [x22], #0x2\n"
+      "ldr h26, [x21], #0x2\n"
+      "ldr h25, [x20], #0x2\n"
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 9f\n"
+      "ld1 { v30.b }[2], [x27]\n"
+      "ld1 { v29.b }[2], [x26]\n"
+      "ld1 { v28.b }[2], [x25]\n"
+      "ld1 { v27.b }[2], [x24]\n"
+      "ld1 { v23.b }[2], [x23]\n"
+      "ld1 { v21.b }[2], [x22]\n"
+      "ld1 { v26.b }[2], [x21]\n"
+      "ld1 { v25.b }[2], [x20]\n"
+      "mov x19, #0x3\n"
+      "b 9f\n"
+      "8:"  // odd_loads_1_0
+      "ldr b30, [x27, #0x0]\n"
+      "ldr b29, [x26, #0x0]\n"
+      "ldr b28, [x25, #0x0]\n"
+      "ldr b27, [x24, #0x0]\n"
+      "ldr b23, [x23, #0x0]\n"
+      "ldr b21, [x22, #0x0]\n"
+      "ldr b26, [x21, #0x0]\n"
+      "ldr b25, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "9:"  // Odd load end
+      "ushll v30.8h, v30.8b, #0x0\n"  // same widen + transpose as the main loop, but one column is stored per step
+      "ushll v29.8h, v29.8b, #0x0\n"
+      "ushll v28.8h, v28.8b, #0x0\n"
+      "ushll v27.8h, v27.8b, #0x0\n"
+      "ushll v23.8h, v23.8b, #0x0\n"
+      "zip1 v24.8h, v30.8h, v23.8h\n"
+      "ushll v21.8h, v21.8b, #0x0\n"
+      "ushll v26.8h, v26.8b, #0x0\n"
+      "zip1 v20.8h, v28.8h, v26.8h\n"
+      "ushll v25.8h, v25.8b, #0x0\n"
+      "zip1 v22.8h, v29.8h, v21.8h\n"
+      "subs x19, x19, #0x1\n"  // count down leftover columns; each following "beq 10f" exits once the count reaches zero
+      "zip1 v18.8h, v24.8h, v20.8h\n"
+      "zip1 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v22.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x19, x19, #0x1\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v18.8h, v24.8h, v20.8h\n"
+      "zip2 v17.8h, v22.8h, v19.8h\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x19, x19, #0x1\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v23.8h, v30.8h, v23.8h\n"
+      "zip2 v20.8h, v28.8h, v26.8h\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v18.8h, v23.8h, v20.8h\n"
+      "zip2 v21.8h, v29.8h, v21.8h\n"
+      "zip2 v19.8h, v27.8h, v25.8h\n"
+      "zip1 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v16.8h, v18.8h, v17.8h\n"
+      "subs x19, x19, #0x1\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "beq 10f\n"
+      "zip2 v18.8h, v23.8h, v20.8h\n"
+      "zip2 v17.8h, v21.8h, v19.8h\n"
+      "zip1 v16.8h, v18.8h, v17.8h\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "add v1.8h, v1.8h, v16.8h\n"
+      "add %x[out_ptr], %x[out_ptr], #0x10\n"
+      "10:"  // Odds skip
+      "uaddw v0.4s, v0.4s, v1.4h\n"  // fold the remaining 16-bit partial sums into the 32-bit sums
+      "str q0, [%x[out_ptr], #0x0]\n"  // write the eight 32-bit sums (32 bytes) directly after the interleaved data
+      "uaddw2 v31.4s, v31.4s, v1.8h\n"
+      "str q31, [%x[out_ptr], #0x10]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"  // out_ptr ends past the sums; the non-first path above steps back by this same amount
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+      : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp
new file mode 100644
index 0000000000..5377edc1e1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<>  // 8-row, block-2 interleave of bfloat16: pairs of adjacent elements (32 bits) from each row are written row after row.
+void interleave_block<8, 2, VLType::None, false>(
+  bfloat16 * &out_ptr, const bfloat16 * const * in, size_t width, size_t height,  // out_ptr: advanced past everything written; in: array of row pointers; width: elements consumed per row; height: compared against 8 below
+  size_t row_offset, bool  // row_offset: element offset added to every row pointer (scaled by 2 bytes below); the unnamed bool is unused
+)
+{
+  __asm__ __volatile__(
+      "ldr x27, [%x[in], #0x0]\n"  // x27..x20 = in[0]..in[7]; NOTE(review): all 8 entries are read regardless of height - confirm the array always has 8 readable slots
+      "cmp %x[height], #0x8\n"  // flags consumed by the "beq 1f" below
+      "ldr x26, [%x[in], #0x8]\n"
+      "add x27, x27, %x[row_offset], LSL #1\n"  // LSL #1: row_offset is in 2-byte elements
+      "ldr x25, [%x[in], #0x10]\n"
+      "ldr x24, [%x[in], #0x18]\n"
+      "add x26, x26, %x[row_offset], LSL #1\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x25, x25, %x[row_offset], LSL #1\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset], LSL #1\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset], LSL #1\n"
+      "add x22, x22, %x[row_offset], LSL #1\n"
+      "add x21, x21, %x[row_offset], LSL #1\n"
+      "add x20, x20, %x[row_offset], LSL #1\n"
+      "beq 1f\n"  // height == 8: keep all eight pointers as loaded
+      "mov x20, x27\n"  // height != 8: pointer 7 becomes a copy of pointer 0
+      "cmp %x[height], #0x2\n"
+      "csel x26, x26, x27, GE\n"  // pointer 1 kept only if height >= 2, otherwise replaced by pointer 0
+      "csel x25, x25, x27, GT\n"  // pointer 2 kept only if height > 2
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n"  // pointer 3 kept only if height >= 4
+      "csel x23, x23, x27, GT\n"  // pointer 4 kept only if height > 4
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n"  // pointer 5 kept only if height >= 6
+      "csel x21, x21, x27, GT\n"  // pointer 6 kept only if height > 6
+      "1:"  // no_pointer_adj
+      "prfm pldl1keep, [x27, #0x0]\n"  // prefetch offsets 0x0 and 0x40 of every row
+      "cmp %x[width], #0x8\n"  // flags for the "blt 3f" after the prefetches (prfm leaves the flags untouched)
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "blt 3f\n"  // fewer than 8 elements per row: go straight to the tail handling
+      "2:"  // Main loop head
+      "ldr q28, [x27], #0x10\n"  // 16 bytes (8 elements) from each row, post-incrementing: v28,v27,v26,v22,v25,v24,v19,v21 hold rows 0..7
+      "prfm pldl1keep, [x27, #0x70]\n"
+      "ldr q27, [x26], #0x10\n"
+      "ldr q26, [x25], #0x10\n"
+      "zip1 v23.4s, v28.4s, v26.4s\n"  // zips work on 32-bit lanes, so each element pair moves as one unit
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr q22, [x24], #0x10\n"
+      "zip2 v26.4s, v28.4s, v26.4s\n"
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr q25, [x23], #0x10\n"
+      "zip1 v20.4s, v27.4s, v22.4s\n"
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr q24, [x22], #0x10\n"
+      "zip1 v16.4s, v23.4s, v20.4s\n"  // v16 = first pair of rows 0,1,2,3
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "ldr q19, [x21], #0x10\n"
+      "zip2 v23.4s, v23.4s, v20.4s\n"
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "zip2 v22.4s, v27.4s, v22.4s\n"
+      "ldr q21, [x20], #0x10\n"
+      "zip1 v18.4s, v25.4s, v19.4s\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "str q16, [%x[out_ptr], #0x0]\n"  // every 32 output bytes hold one element pair for rows 0..7
+      "zip1 v20.4s, v26.4s, v22.4s\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "zip1 v16.4s, v24.4s, v21.4s\n"
+      "subs %x[width], %x[width], #0x8\n"
+      "zip1 v17.4s, v18.4s, v16.4s\n"  // v17 = first pair of rows 4,5,6,7
+      "cmp %x[width], #0x8\n"  // flags for the "bge 2b" at the bottom; nothing in between writes the flags
+      "zip2 v16.4s, v18.4s, v16.4s\n"
+      "str q17, [%x[out_ptr], #0x10]\n"
+      "zip2 v19.4s, v25.4s, v19.4s\n"
+      "str q23, [%x[out_ptr], #0x20]\n"
+      "zip2 v18.4s, v24.4s, v21.4s\n"
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "zip1 v16.4s, v19.4s, v18.4s\n"
+      "str q20, [%x[out_ptr], #0x40]\n"
+      "zip2 v17.4s, v26.4s, v22.4s\n"
+      "str q16, [%x[out_ptr], #0x50]\n"
+      "zip2 v16.4s, v19.4s, v18.4s\n"
+      "str q17, [%x[out_ptr], #0x60]\n"
+      "str q16, [%x[out_ptr], #0x70]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x80\n"  // 4 pairs x 8 rows x 4 bytes written this iteration
+      "bge 2b\n"  // loop while at least 8 elements per row remain
+      "3:"  // Main loop skip
+      "cbz %x[width], 8f\n"  // nothing left over
+      "tbz %x[width], #2, 5f\n"  // leftover elements are loaded in 4/2/1-element pieces selected by bits 2/1/0 of width
+      "ldr d28, [x27], #0x8\n"  // scalar loads zero the rest of the vector register, so an unpaired last element ends up paired with zero
+      "ldr d27, [x26], #0x8\n"
+      "ldr d26, [x25], #0x8\n"
+      "ldr d22, [x24], #0x8\n"
+      "ldr d25, [x23], #0x8\n"
+      "ldr d24, [x22], #0x8\n"
+      "ldr d19, [x21], #0x8\n"
+      "ldr d21, [x20], #0x8\n"
+      "tbz %x[width], #1, 4f\n"
+      "ld1 { v28.s }[2], [x27], #0x4\n"
+      "ld1 { v27.s }[2], [x26], #0x4\n"
+      "ld1 { v26.s }[2], [x25], #0x4\n"
+      "ld1 { v22.s }[2], [x24], #0x4\n"
+      "ld1 { v25.s }[2], [x23], #0x4\n"
+      "ld1 { v24.s }[2], [x22], #0x4\n"
+      "ld1 { v19.s }[2], [x21], #0x4\n"
+      "ld1 { v21.s }[2], [x20], #0x4\n"
+      "mov x19, #0x3\n"  // from here on x19 = number of 2-element blocks to store (leftover width rounded up to a pair)
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v28.h }[6], [x27]\n"
+      "ld1 { v27.h }[6], [x26]\n"
+      "ld1 { v26.h }[6], [x25]\n"
+      "ld1 { v22.h }[6], [x24]\n"
+      "ld1 { v25.h }[6], [x23]\n"
+      "ld1 { v24.h }[6], [x22]\n"
+      "ld1 { v19.h }[6], [x21]\n"
+      "ld1 { v21.h }[6], [x20]\n"
+      "mov x19, #0x4\n"
+      "b 7f\n"
+      "4:"  // odd_loads_1_4
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v28.h }[4], [x27]\n"
+      "ld1 { v27.h }[4], [x26]\n"
+      "ld1 { v26.h }[4], [x25]\n"
+      "ld1 { v22.h }[4], [x24]\n"
+      "ld1 { v25.h }[4], [x23]\n"
+      "ld1 { v24.h }[4], [x22]\n"
+      "ld1 { v19.h }[4], [x21]\n"
+      "ld1 { v21.h }[4], [x20]\n"
+      "mov x19, #0x3\n"
+      "b 7f\n"
+      "5:"  // odd_loads_2_0
+      "tbz %x[width], #1, 6f\n"
+      "ldr s28, [x27], #0x4\n"
+      "ldr s27, [x26], #0x4\n"
+      "ldr s26, [x25], #0x4\n"
+      "ldr s22, [x24], #0x4\n"
+      "ldr s25, [x23], #0x4\n"
+      "ldr s24, [x22], #0x4\n"
+      "ldr s19, [x21], #0x4\n"
+      "ldr s21, [x20], #0x4\n"
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v28.h }[2], [x27]\n"
+      "ld1 { v27.h }[2], [x26]\n"
+      "ld1 { v26.h }[2], [x25]\n"
+      "ld1 { v22.h }[2], [x24]\n"
+      "ld1 { v25.h }[2], [x23]\n"
+      "ld1 { v24.h }[2], [x22]\n"
+      "ld1 { v19.h }[2], [x21]\n"
+      "ld1 { v21.h }[2], [x20]\n"
+      "mov x19, #0x2\n"
+      "b 7f\n"
+      "6:"  // odd_loads_1_0
+      "ldr h28, [x27, #0x0]\n"
+      "ldr h27, [x26, #0x0]\n"
+      "ldr h26, [x25, #0x0]\n"
+      "ldr h22, [x24, #0x0]\n"
+      "ldr h25, [x23, #0x0]\n"
+      "ldr h24, [x22, #0x0]\n"
+      "ldr h19, [x21, #0x0]\n"
+      "ldr h21, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "7:"  // Odd load end
+      "zip1 v23.4s, v28.4s, v26.4s\n"  // same transpose as the main loop, but one 32-byte block is stored per step
+      "subs x19, x19, #0x1\n"  // count down leftover blocks; each following "beq 8f" exits once the count reaches zero
+      "zip1 v20.4s, v27.4s, v22.4s\n"
+      "zip1 v16.4s, v23.4s, v20.4s\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "zip1 v18.4s, v25.4s, v19.4s\n"
+      "zip1 v16.4s, v24.4s, v21.4s\n"
+      "zip1 v17.4s, v18.4s, v16.4s\n"
+      "str q17, [%x[out_ptr], #0x10]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "beq 8f\n"
+      "zip2 v23.4s, v23.4s, v20.4s\n"
+      "zip2 v16.4s, v18.4s, v16.4s\n"
+      "str q23, [%x[out_ptr], #0x0]\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "subs x19, x19, #0x1\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "beq 8f\n"
+      "zip2 v26.4s, v28.4s, v26.4s\n"
+      "zip2 v22.4s, v27.4s, v22.4s\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v20.4s, v26.4s, v22.4s\n"
+      "str q20, [%x[out_ptr], #0x0]\n"
+      "zip2 v19.4s, v25.4s, v19.4s\n"
+      "zip2 v18.4s, v24.4s, v21.4s\n"
+      "zip1 v16.4s, v19.4s, v18.4s\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "beq 8f\n"
+      "zip2 v17.4s, v26.4s, v22.4s\n"
+      "zip2 v16.4s, v19.4s, v18.4s\n"
+      "str q17, [%x[out_ptr], #0x0]\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "8:"  // Odds skip
+
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+      : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp
new file mode 100644
index 0000000000..3aea6a8999
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<>  // 8-row, block-2 interleave of float: pairs of adjacent elements (64 bits) from each row are written row after row.
+void interleave_block<8, 2, VLType::None, false>(
+  float * &out_ptr, const float * const * in, size_t width, size_t height,  // out_ptr: advanced past everything written; in: array of row pointers; width: elements consumed per row; height: compared against 8 below
+  size_t row_offset, bool  // row_offset: element offset added to every row pointer (scaled by 4 bytes below); the unnamed bool is unused
+)
+{
+  __asm__ __volatile__(
+      "ldr x27, [%x[in], #0x0]\n"  // x27..x20 = in[0]..in[7]; NOTE(review): all 8 entries are read regardless of height - confirm the array always has 8 readable slots
+      "cmp %x[height], #0x8\n"  // flags consumed by the "beq 1f" below
+      "ldr x26, [%x[in], #0x8]\n"
+      "add x27, x27, %x[row_offset], LSL #2\n"  // LSL #2: row_offset is in 4-byte elements
+      "ldr x25, [%x[in], #0x10]\n"
+      "ldr x24, [%x[in], #0x18]\n"
+      "add x26, x26, %x[row_offset], LSL #2\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x25, x25, %x[row_offset], LSL #2\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset], LSL #2\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset], LSL #2\n"
+      "add x22, x22, %x[row_offset], LSL #2\n"
+      "add x21, x21, %x[row_offset], LSL #2\n"
+      "add x20, x20, %x[row_offset], LSL #2\n"
+      "beq 1f\n"  // height == 8: keep all eight pointers as loaded
+      "mov x20, x27\n"  // height != 8: pointer 7 becomes a copy of pointer 0
+      "cmp %x[height], #0x2\n"
+      "csel x26, x26, x27, GE\n"  // pointer 1 kept only if height >= 2, otherwise replaced by pointer 0
+      "csel x25, x25, x27, GT\n"  // pointer 2 kept only if height > 2
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n"  // pointer 3 kept only if height >= 4
+      "csel x23, x23, x27, GT\n"  // pointer 4 kept only if height > 4
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n"  // pointer 5 kept only if height >= 6
+      "csel x21, x21, x27, GT\n"  // pointer 6 kept only if height > 6
+      "1:"  // no_pointer_adj
+      "prfm pldl1keep, [x27, #0x0]\n"  // prefetch offsets 0x0 and 0x40 of every row
+      "cmp %x[width], #0x4\n"  // flags for the "blt 3f" after the prefetches (prfm leaves the flags untouched)
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "blt 3f\n"  // fewer than 4 elements per row: go straight to the tail handling
+      "2:"  // Main loop head
+      "ldr q27, [x27], #0x10\n"  // 16 bytes (4 floats) from each row, post-incrementing: v27,v24,v25,v21,v22,v18,v19,v16 hold rows 0..7
+      "prfm pldl1keep, [x27, #0x70]\n"
+      "ldr q24, [x26], #0x10\n"
+      "zip1 v26.2d, v27.2d, v24.2d\n"  // zips work on 64-bit lanes: v26 = first float pair of row 0, then of row 1
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr q25, [x25], #0x10\n"
+      "zip2 v24.2d, v27.2d, v24.2d\n"  // v24 = second float pair of rows 0 and 1
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr q21, [x24], #0x10\n"
+      "zip1 v23.2d, v25.2d, v21.2d\n"
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr q22, [x23], #0x10\n"
+      "zip2 v21.2d, v25.2d, v21.2d\n"
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "ldr q18, [x22], #0x10\n"
+      "zip1 v20.2d, v22.2d, v18.2d\n"
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "ldr q19, [x21], #0x10\n"
+      "zip2 v18.2d, v22.2d, v18.2d\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "ldr q16, [x20], #0x10\n"
+      "zip1 v17.2d, v19.2d, v16.2d\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "str q26, [%x[out_ptr], #0x0]\n"  // bytes 0x00-0x3f: first float pair of rows 0..7
+      "zip2 v16.2d, v19.2d, v16.2d\n"
+      "str q23, [%x[out_ptr], #0x10]\n"
+      "str q20, [%x[out_ptr], #0x20]\n"
+      "str q17, [%x[out_ptr], #0x30]\n"
+      "str q24, [%x[out_ptr], #0x40]\n"  // bytes 0x40-0x7f: second float pair of rows 0..7
+      "str q21, [%x[out_ptr], #0x50]\n"
+      "str q18, [%x[out_ptr], #0x60]\n"
+      "str q16, [%x[out_ptr], #0x70]\n"
+      "subs %x[width], %x[width], #0x4\n"
+      "cmp %x[width], #0x4\n"  // flags for the "bge 2b" below; the plain add in between does not write the flags
+      "add %x[out_ptr], %x[out_ptr], #0x80\n"
+      "bge 2b\n"  // loop while at least 4 elements per row remain
+      "3:"  // Main loop skip
+      "cbz %x[width], 6f\n"  // nothing left over
+      "tbz %x[width], #1, 4f\n"  // leftover elements are loaded in 2/1-element pieces selected by bits 1/0 of width
+      "ldr d27, [x27], #0x8\n"  // scalar loads zero the rest of the vector register, so an unpaired last element ends up paired with zero
+      "ldr d24, [x26], #0x8\n"
+      "ldr d25, [x25], #0x8\n"
+      "ldr d21, [x24], #0x8\n"
+      "ldr d22, [x23], #0x8\n"
+      "ldr d18, [x22], #0x8\n"
+      "ldr d19, [x21], #0x8\n"
+      "ldr d16, [x20], #0x8\n"
+      "mov x19, #0x1\n"  // from here on x19 = number of 2-element blocks to store (leftover width rounded up to a pair)
+      "tbz %x[width], #0, 5f\n"
+      "ld1 { v27.s }[2], [x27]\n"
+      "ld1 { v24.s }[2], [x26]\n"
+      "ld1 { v25.s }[2], [x25]\n"
+      "ld1 { v21.s }[2], [x24]\n"
+      "ld1 { v22.s }[2], [x23]\n"
+      "ld1 { v18.s }[2], [x22]\n"
+      "ld1 { v19.s }[2], [x21]\n"
+      "ld1 { v16.s }[2], [x20]\n"
+      "mov x19, #0x2\n"
+      "b 5f\n"
+      "4:"  // odd_loads_1_0
+      "ldr s27, [x27, #0x0]\n"
+      "ldr s24, [x26, #0x0]\n"
+      "ldr s25, [x25, #0x0]\n"
+      "ldr s21, [x24, #0x0]\n"
+      "ldr s22, [x23, #0x0]\n"
+      "ldr s18, [x22, #0x0]\n"
+      "ldr s19, [x21, #0x0]\n"
+      "ldr s16, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "5:"  // Odd load end
+      "zip1 v26.2d, v27.2d, v24.2d\n"  // same layout as the main loop, but one 64-byte block is stored per step
+      "subs x19, x19, #0x1\n"  // count down leftover blocks; the "beq 6f" below exits when only one block was needed
+      "zip1 v23.2d, v25.2d, v21.2d\n"
+      "str q26, [%x[out_ptr], #0x0]\n"
+      "zip1 v20.2d, v22.2d, v18.2d\n"
+      "str q23, [%x[out_ptr], #0x10]\n"
+      "zip1 v17.2d, v19.2d, v16.2d\n"
+      "str q20, [%x[out_ptr], #0x20]\n"
+      "str q17, [%x[out_ptr], #0x30]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x40\n"
+      "beq 6f\n"
+      "zip2 v24.2d, v27.2d, v24.2d\n"
+      "zip2 v21.2d, v25.2d, v21.2d\n"
+      "str q24, [%x[out_ptr], #0x0]\n"
+      "zip2 v18.2d, v22.2d, v18.2d\n"
+      "str q21, [%x[out_ptr], #0x10]\n"
+      "zip2 v16.2d, v19.2d, v16.2d\n"
+      "str q18, [%x[out_ptr], #0x20]\n"
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x40\n"
+      "6:"  // Odds skip
+
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+      : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp
new file mode 100644
index 0000000000..4780b77a4a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<>
+void interleave_block<8, 4, VLType::None, false>(
+  bfloat16 * &out_ptr, const bfloat16 * const * in, size_t width, size_t height,
+  size_t row_offset, bool
+)
+{ // Packs 8 bf16 rows into out_ptr: every 0x40 output bytes hold one 4-element block from each of rows 0..7, in row order.
+  __asm__ __volatile__( // Rows with index >= height reuse the row-0 pointer, so their output slots repeat row-0 data.
+      "ldr x27, [%x[in], #0x0]\n"  // x27..x20 <- in[0]..in[7] (all eight slots of 'in' are read, whatever height is)
+      "cmp %x[height], #0x8\n"  // flags are consumed by the "beq 1f" below
+      "ldr x26, [%x[in], #0x8]\n"
+      "add x27, x27, %x[row_offset], LSL #1\n"  // advance each row pointer by row_offset elements (2 bytes each)
+      "ldr x25, [%x[in], #0x10]\n"
+      "ldr x24, [%x[in], #0x18]\n"
+      "add x26, x26, %x[row_offset], LSL #1\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x25, x25, %x[row_offset], LSL #1\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset], LSL #1\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset], LSL #1\n"
+      "add x22, x22, %x[row_offset], LSL #1\n"
+      "add x21, x21, %x[row_offset], LSL #1\n"
+      "add x20, x20, %x[row_offset], LSL #1\n"
+      "beq 1f\n"  // height == 8: keep all eight row pointers
+      "mov x20, x27\n"  // height != 8: row 7 always falls back to row 0
+      "cmp %x[height], #0x2\n"
+      "csel x26, x26, x27, GE\n"  // keep row 1 only if height >= 2
+      "csel x25, x25, x27, GT\n"  // keep row 2 only if height > 2
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n"  // keep row 3 only if height >= 4
+      "csel x23, x23, x27, GT\n"  // keep row 4 only if height > 4
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n"  // keep row 5 only if height >= 6
+      "csel x21, x21, x27, GT\n"  // keep row 6 only if height > 6
+      "1:"  // no_pointer_adj: x27..x20 now hold the eight row pointers actually used
+      "prfm pldl1keep, [x27, #0x0]\n"
+      "cmp %x[width], #0x8\n"  // flags are consumed by "blt 3f" after the prefetches
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "blt 3f\n"  // fewer than 8 elements in total: go straight to the tail
+      "2:"  // Main loop head: consumes 8 elements (16 bytes) per row and emits 0x80 bytes per iteration
+      "ldr q27, [x27], #0x10\n"
+      "prfm pldl1keep, [x27, #0x70]\n"
+      "ldr q24, [x26], #0x10\n"
+      "zip1 v26.2d, v27.2d, v24.2d\n"  // v26 = { row0[0..3], row1[0..3] }
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr q25, [x25], #0x10\n"
+      "zip2 v24.2d, v27.2d, v24.2d\n"  // v24 = { row0[4..7], row1[4..7] }
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr q21, [x24], #0x10\n"
+      "zip1 v23.2d, v25.2d, v21.2d\n"
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr q22, [x23], #0x10\n"
+      "zip2 v21.2d, v25.2d, v21.2d\n"
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "ldr q18, [x22], #0x10\n"
+      "zip1 v20.2d, v22.2d, v18.2d\n"
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "ldr q19, [x21], #0x10\n"
+      "zip2 v18.2d, v22.2d, v18.2d\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "ldr q16, [x20], #0x10\n"
+      "zip1 v17.2d, v19.2d, v16.2d\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "str q26, [%x[out_ptr], #0x0]\n"  // bytes 0x00-0x3f: elements 0..3 of rows 0..7
+      "zip2 v16.2d, v19.2d, v16.2d\n"
+      "str q23, [%x[out_ptr], #0x10]\n"
+      "str q20, [%x[out_ptr], #0x20]\n"
+      "str q17, [%x[out_ptr], #0x30]\n"
+      "str q24, [%x[out_ptr], #0x40]\n"  // bytes 0x40-0x7f: elements 4..7 of rows 0..7
+      "str q21, [%x[out_ptr], #0x50]\n"
+      "str q18, [%x[out_ptr], #0x60]\n"
+      "str q16, [%x[out_ptr], #0x70]\n"
+      "subs %x[width], %x[width], #0x8\n"
+      "cmp %x[width], #0x8\n"  // this compare, not the subs above, sets the flags tested by "bge 2b"
+      "add %x[out_ptr], %x[out_ptr], #0x80\n"
+      "bge 2b\n"
+      "3:"  // Main loop skip: fewer than 8 elements remain
+      "cbz %x[width], 8f\n"  // nothing left: no tail output
+      "tbz %x[width], #2, 5f\n"  // fewer than 4 elements left
+      "ldr d27, [x27], #0x8\n"  // 4 elements per row; NOTE(review): relies on scalar LDR clearing the remaining vector bits
+      "ldr d24, [x26], #0x8\n"
+      "ldr d25, [x25], #0x8\n"
+      "ldr d21, [x24], #0x8\n"
+      "ldr d22, [x23], #0x8\n"
+      "ldr d18, [x22], #0x8\n"
+      "ldr d19, [x21], #0x8\n"
+      "ldr d16, [x20], #0x8\n"
+      "tbz %x[width], #1, 4f\n"
+      "ld1 { v27.s }[2], [x27], #0x4\n"  // elements 4..5
+      "ld1 { v24.s }[2], [x26], #0x4\n"
+      "ld1 { v25.s }[2], [x25], #0x4\n"
+      "ld1 { v21.s }[2], [x24], #0x4\n"
+      "ld1 { v22.s }[2], [x23], #0x4\n"
+      "ld1 { v18.s }[2], [x22], #0x4\n"
+      "ld1 { v19.s }[2], [x21], #0x4\n"
+      "ld1 { v16.s }[2], [x20], #0x4\n"
+      "mov x19, #0x2\n"  // two 4-element blocks to store
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v27.h }[6], [x27]\n"  // element 6
+      "ld1 { v24.h }[6], [x26]\n"
+      "ld1 { v25.h }[6], [x25]\n"
+      "ld1 { v21.h }[6], [x24]\n"
+      "ld1 { v22.h }[6], [x23]\n"
+      "ld1 { v18.h }[6], [x22]\n"
+      "ld1 { v19.h }[6], [x21]\n"
+      "ld1 { v16.h }[6], [x20]\n"
+      "b 7f\n"
+      "4:"  // odd_loads_1_4: 4 elements loaded so far, at most one more to fetch
+      "mov x19, #0x1\n"  // exactly 4 left: a single block
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v27.h }[4], [x27]\n"  // element 4
+      "ld1 { v24.h }[4], [x26]\n"
+      "ld1 { v25.h }[4], [x25]\n"
+      "ld1 { v21.h }[4], [x24]\n"
+      "ld1 { v22.h }[4], [x23]\n"
+      "ld1 { v18.h }[4], [x22]\n"
+      "ld1 { v19.h }[4], [x21]\n"
+      "ld1 { v16.h }[4], [x20]\n"
+      "mov x19, #0x2\n"  // 5 elements: a second, partially filled block is needed
+      "b 7f\n"
+      "5:"  // odd_loads_2_0: between 1 and 3 elements remain
+      "tbz %x[width], #1, 6f\n"
+      "ldr s27, [x27], #0x4\n"  // elements 0..1
+      "ldr s24, [x26], #0x4\n"
+      "ldr s25, [x25], #0x4\n"
+      "ldr s21, [x24], #0x4\n"
+      "ldr s22, [x23], #0x4\n"
+      "ldr s18, [x22], #0x4\n"
+      "ldr s19, [x21], #0x4\n"
+      "ldr s16, [x20], #0x4\n"
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 7f\n"
+      "ld1 { v27.h }[2], [x27]\n"  // element 2
+      "ld1 { v24.h }[2], [x26]\n"
+      "ld1 { v25.h }[2], [x25]\n"
+      "ld1 { v21.h }[2], [x24]\n"
+      "ld1 { v22.h }[2], [x23]\n"
+      "ld1 { v18.h }[2], [x22]\n"
+      "ld1 { v19.h }[2], [x21]\n"
+      "ld1 { v16.h }[2], [x20]\n"
+      "b 7f\n"
+      "6:"  // odd_loads_1_0: reached only with width non-zero and bits 1 and 2 clear, i.e. one element left
+      "ldr h27, [x27, #0x0]\n"
+      "ldr h24, [x26, #0x0]\n"
+      "ldr h25, [x25, #0x0]\n"
+      "ldr h21, [x24, #0x0]\n"
+      "ldr h22, [x23, #0x0]\n"
+      "ldr h18, [x22, #0x0]\n"
+      "ldr h19, [x21, #0x0]\n"
+      "ldr h16, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "7:"  // Odd load end: x19 = number of 4-element blocks to store (1 or 2)
+      "zip1 v26.2d, v27.2d, v24.2d\n"
+      "subs x19, x19, #0x1\n"  // flags are consumed by "beq 8f" below
+      "zip1 v23.2d, v25.2d, v21.2d\n"
+      "str q26, [%x[out_ptr], #0x0]\n"
+      "zip1 v20.2d, v22.2d, v18.2d\n"
+      "str q23, [%x[out_ptr], #0x10]\n"
+      "zip1 v17.2d, v19.2d, v16.2d\n"
+      "str q20, [%x[out_ptr], #0x20]\n"
+      "str q17, [%x[out_ptr], #0x30]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x40\n"
+      "beq 8f\n"  // only one block was needed
+      "zip2 v24.2d, v27.2d, v24.2d\n"
+      "zip2 v21.2d, v25.2d, v21.2d\n"
+      "str q24, [%x[out_ptr], #0x0]\n"
+      "zip2 v18.2d, v22.2d, v18.2d\n"
+      "str q21, [%x[out_ptr], #0x10]\n"
+      "zip2 v16.2d, v19.2d, v16.2d\n"
+      "str q18, [%x[out_ptr], #0x20]\n"
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x40\n"
+      "8:"  // Odds skip
+      // out_ptr and width are read-write operands (updated in place); height, in and row_offset are inputs only.
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+      : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp
new file mode 100644
index 0000000000..a9034f5742
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<>
+void interleave_block<8, 4, VLType::None, false>(
+  int8_t * &out_ptr, const int8_t * const * in, size_t width, size_t height,
+  size_t row_offset, bool
+)
+{ // Packs 8 int8 rows into out_ptr: every 0x20 output bytes hold one 4-byte block from each of rows 0..7, in row order.
+  __asm__ __volatile__( // Rows with index >= height reuse the row-0 pointer, so their output slots repeat row-0 data.
+      "ldr x27, [%x[in], #0x0]\n"  // x27..x20 <- in[0]..in[7] (all eight slots of 'in' are read, whatever height is)
+      "cmp %x[height], #0x8\n"  // flags are consumed by the "beq 1f" below
+      "ldr x26, [%x[in], #0x8]\n"
+      "add x27, x27, %x[row_offset]\n"  // advance each row pointer by row_offset bytes
+      "ldr x25, [%x[in], #0x10]\n"
+      "ldr x24, [%x[in], #0x18]\n"
+      "add x26, x26, %x[row_offset]\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x25, x25, %x[row_offset]\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset]\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset]\n"
+      "add x22, x22, %x[row_offset]\n"
+      "add x21, x21, %x[row_offset]\n"
+      "add x20, x20, %x[row_offset]\n"
+      "beq 1f\n"  // height == 8: keep all eight row pointers
+      "mov x20, x27\n"  // height != 8: row 7 always falls back to row 0
+      "cmp %x[height], #0x2\n"
+      "csel x26, x26, x27, GE\n"  // keep row 1 only if height >= 2
+      "csel x25, x25, x27, GT\n"  // keep row 2 only if height > 2
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n"  // keep row 3 only if height >= 4
+      "csel x23, x23, x27, GT\n"  // keep row 4 only if height > 4
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n"  // keep row 5 only if height >= 6
+      "csel x21, x21, x27, GT\n"  // keep row 6 only if height > 6
+      "1:"  // no_pointer_adj: x27..x20 now hold the eight row pointers actually used
+      "prfm pldl1keep, [x27, #0x0]\n"
+      "cmp %x[width], #0x10\n"  // flags are consumed by "blt 3f" after the prefetches
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "blt 3f\n"  // fewer than 16 bytes in total: go straight to the tail
+      "2:"  // Main loop head: consumes 16 bytes per row and emits 0x80 bytes per iteration
+      "ldr q28, [x27], #0x10\n"
+      "prfm pldl1keep, [x27, #0x70]\n"
+      "ldr q27, [x26], #0x10\n"
+      "ldr q26, [x25], #0x10\n"
+      "zip1 v23.4s, v28.4s, v26.4s\n"  // rows 0 and 2, blocks 0..1 interleaved
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr q22, [x24], #0x10\n"
+      "zip2 v26.4s, v28.4s, v26.4s\n"  // rows 0 and 2, blocks 2..3 interleaved
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr q25, [x23], #0x10\n"
+      "zip1 v20.4s, v27.4s, v22.4s\n"  // rows 1 and 3, blocks 0..1 interleaved
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr q24, [x22], #0x10\n"
+      "zip1 v16.4s, v23.4s, v20.4s\n"  // v16 = block 0 of rows 0,1,2,3
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "ldr q19, [x21], #0x10\n"
+      "zip2 v23.4s, v23.4s, v20.4s\n"  // v23 = block 1 of rows 0,1,2,3
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "zip2 v22.4s, v27.4s, v22.4s\n"
+      "ldr q21, [x20], #0x10\n"
+      "zip1 v18.4s, v25.4s, v19.4s\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "zip1 v20.4s, v26.4s, v22.4s\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "zip1 v16.4s, v24.4s, v21.4s\n"
+      "subs %x[width], %x[width], #0x10\n"
+      "zip1 v17.4s, v18.4s, v16.4s\n"  // v17 = block 0 of rows 4,5,6,7
+      "cmp %x[width], #0x10\n"  // this compare sets the flags tested by "bge 2b"
+      "zip2 v16.4s, v18.4s, v16.4s\n"
+      "str q17, [%x[out_ptr], #0x10]\n"
+      "zip2 v19.4s, v25.4s, v19.4s\n"
+      "str q23, [%x[out_ptr], #0x20]\n"
+      "zip2 v18.4s, v24.4s, v21.4s\n"
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "zip1 v16.4s, v19.4s, v18.4s\n"
+      "str q20, [%x[out_ptr], #0x40]\n"
+      "zip2 v17.4s, v26.4s, v22.4s\n"
+      "str q16, [%x[out_ptr], #0x50]\n"
+      "zip2 v16.4s, v19.4s, v18.4s\n"
+      "str q17, [%x[out_ptr], #0x60]\n"
+      "str q16, [%x[out_ptr], #0x70]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x80\n"
+      "bge 2b\n"
+      "3:"  // Main loop skip: fewer than 16 bytes remain
+      "cbz %x[width], 12f\n"  // nothing left: no tail output
+      "tbz %x[width], #3, 7f\n"  // fewer than 8 bytes left
+      "ldr d28, [x27], #0x8\n"  // 8 bytes per row; NOTE(review): relies on scalar LDR clearing the remaining vector bits
+      "ldr d27, [x26], #0x8\n"
+      "ldr d26, [x25], #0x8\n"
+      "ldr d22, [x24], #0x8\n"
+      "ldr d25, [x23], #0x8\n"
+      "ldr d24, [x22], #0x8\n"
+      "ldr d19, [x21], #0x8\n"
+      "ldr d21, [x20], #0x8\n"
+      "tbz %x[width], #2, 5f\n"
+      "ld1 { v28.s }[2], [x27], #0x4\n"  // bytes 8..11
+      "ld1 { v27.s }[2], [x26], #0x4\n"
+      "ld1 { v26.s }[2], [x25], #0x4\n"
+      "ld1 { v22.s }[2], [x24], #0x4\n"
+      "ld1 { v25.s }[2], [x23], #0x4\n"
+      "ld1 { v24.s }[2], [x22], #0x4\n"
+      "ld1 { v19.s }[2], [x21], #0x4\n"
+      "ld1 { v21.s }[2], [x20], #0x4\n"
+      "tbz %x[width], #1, 4f\n"
+      "ld1 { v28.h }[6], [x27], #0x2\n"  // bytes 12..13
+      "ld1 { v27.h }[6], [x26], #0x2\n"
+      "ld1 { v26.h }[6], [x25], #0x2\n"
+      "ld1 { v22.h }[6], [x24], #0x2\n"
+      "ld1 { v25.h }[6], [x23], #0x2\n"
+      "ld1 { v24.h }[6], [x22], #0x2\n"
+      "ld1 { v19.h }[6], [x21], #0x2\n"
+      "ld1 { v21.h }[6], [x20], #0x2\n"
+      "mov x19, #0x4\n"  // four 4-byte blocks to store
+      "tbz %x[width], #0, 11f\n"
+      "ld1 { v28.b }[14], [x27]\n"  // byte 14
+      "ld1 { v27.b }[14], [x26]\n"
+      "ld1 { v26.b }[14], [x25]\n"
+      "ld1 { v22.b }[14], [x24]\n"
+      "ld1 { v25.b }[14], [x23]\n"
+      "ld1 { v24.b }[14], [x22]\n"
+      "ld1 { v19.b }[14], [x21]\n"
+      "ld1 { v21.b }[14], [x20]\n"
+      "b 11f\n"
+      "4:"  // odd_loads_1_12: 12 bytes loaded so far, at most one more to fetch
+      "mov x19, #0x3\n"
+      "tbz %x[width], #0, 11f\n"
+      "ld1 { v28.b }[12], [x27]\n"
+      "ld1 { v27.b }[12], [x26]\n"
+      "ld1 { v26.b }[12], [x25]\n"
+      "ld1 { v22.b }[12], [x24]\n"
+      "ld1 { v25.b }[12], [x23]\n"
+      "ld1 { v24.b }[12], [x22]\n"
+      "ld1 { v19.b }[12], [x21]\n"
+      "ld1 { v21.b }[12], [x20]\n"
+      "mov x19, #0x4\n"  // 13 bytes: a fourth, partially filled block is needed
+      "b 11f\n"
+      "5:"  // odd_loads_2_8: 8 bytes loaded so far, up to 3 more to fetch
+      "tbz %x[width], #1, 6f\n"
+      "ld1 { v28.h }[4], [x27], #0x2\n"
+      "ld1 { v27.h }[4], [x26], #0x2\n"
+      "ld1 { v26.h }[4], [x25], #0x2\n"
+      "ld1 { v22.h }[4], [x24], #0x2\n"
+      "ld1 { v25.h }[4], [x23], #0x2\n"
+      "ld1 { v24.h }[4], [x22], #0x2\n"
+      "ld1 { v19.h }[4], [x21], #0x2\n"
+      "ld1 { v21.h }[4], [x20], #0x2\n"
+      "mov x19, #0x3\n"
+      "tbz %x[width], #0, 11f\n"
+      "ld1 { v28.b }[10], [x27]\n"
+      "ld1 { v27.b }[10], [x26]\n"
+      "ld1 { v26.b }[10], [x25]\n"
+      "ld1 { v22.b }[10], [x24]\n"
+      "ld1 { v25.b }[10], [x23]\n"
+      "ld1 { v24.b }[10], [x22]\n"
+      "ld1 { v19.b }[10], [x21]\n"
+      "ld1 { v21.b }[10], [x20]\n"
+      "b 11f\n"
+      "6:"  // odd_loads_1_8: 8 bytes loaded so far, at most one more to fetch
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 11f\n"
+      "ld1 { v28.b }[8], [x27]\n"
+      "ld1 { v27.b }[8], [x26]\n"
+      "ld1 { v26.b }[8], [x25]\n"
+      "ld1 { v22.b }[8], [x24]\n"
+      "ld1 { v25.b }[8], [x23]\n"
+      "ld1 { v24.b }[8], [x22]\n"
+      "ld1 { v19.b }[8], [x21]\n"
+      "ld1 { v21.b }[8], [x20]\n"
+      "mov x19, #0x3\n"
+      "b 11f\n"
+      "7:"  // odd_loads_4_0: between 1 and 7 bytes remain
+      "tbz %x[width], #2, 9f\n"
+      "ldr s28, [x27], #0x4\n"
+      "ldr s27, [x26], #0x4\n"
+      "ldr s26, [x25], #0x4\n"
+      "ldr s22, [x24], #0x4\n"
+      "ldr s25, [x23], #0x4\n"
+      "ldr s24, [x22], #0x4\n"
+      "ldr s19, [x21], #0x4\n"
+      "ldr s21, [x20], #0x4\n"
+      "tbz %x[width], #1, 8f\n"
+      "ld1 { v28.h }[2], [x27], #0x2\n"
+      "ld1 { v27.h }[2], [x26], #0x2\n"
+      "ld1 { v26.h }[2], [x25], #0x2\n"
+      "ld1 { v22.h }[2], [x24], #0x2\n"
+      "ld1 { v25.h }[2], [x23], #0x2\n"
+      "ld1 { v24.h }[2], [x22], #0x2\n"
+      "ld1 { v19.h }[2], [x21], #0x2\n"
+      "ld1 { v21.h }[2], [x20], #0x2\n"
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 11f\n"
+      "ld1 { v28.b }[6], [x27]\n"
+      "ld1 { v27.b }[6], [x26]\n"
+      "ld1 { v26.b }[6], [x25]\n"
+      "ld1 { v22.b }[6], [x24]\n"
+      "ld1 { v25.b }[6], [x23]\n"
+      "ld1 { v24.b }[6], [x22]\n"
+      "ld1 { v19.b }[6], [x21]\n"
+      "ld1 { v21.b }[6], [x20]\n"
+      "b 11f\n"
+      "8:"  // odd_loads_1_4: 4 bytes loaded so far, at most one more to fetch
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 11f\n"
+      "ld1 { v28.b }[4], [x27]\n"
+      "ld1 { v27.b }[4], [x26]\n"
+      "ld1 { v26.b }[4], [x25]\n"
+      "ld1 { v22.b }[4], [x24]\n"
+      "ld1 { v25.b }[4], [x23]\n"
+      "ld1 { v24.b }[4], [x22]\n"
+      "ld1 { v19.b }[4], [x21]\n"
+      "ld1 { v21.b }[4], [x20]\n"
+      "mov x19, #0x2\n"
+      "b 11f\n"
+      "9:"  // odd_loads_2_0: between 1 and 3 bytes remain
+      "tbz %x[width], #1, 10f\n"
+      "ldr h28, [x27], #0x2\n"
+      "ldr h27, [x26], #0x2\n"
+      "ldr h26, [x25], #0x2\n"
+      "ldr h22, [x24], #0x2\n"
+      "ldr h25, [x23], #0x2\n"
+      "ldr h24, [x22], #0x2\n"
+      "ldr h19, [x21], #0x2\n"
+      "ldr h21, [x20], #0x2\n"
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 11f\n"
+      "ld1 { v28.b }[2], [x27]\n"
+      "ld1 { v27.b }[2], [x26]\n"
+      "ld1 { v26.b }[2], [x25]\n"
+      "ld1 { v22.b }[2], [x24]\n"
+      "ld1 { v25.b }[2], [x23]\n"
+      "ld1 { v24.b }[2], [x22]\n"
+      "ld1 { v19.b }[2], [x21]\n"
+      "ld1 { v21.b }[2], [x20]\n"
+      "b 11f\n"
+      "10:"  // odd_loads_1_0: reached only with width non-zero and bits 1..3 clear, i.e. one byte left
+      "ldr b28, [x27, #0x0]\n"
+      "ldr b27, [x26, #0x0]\n"
+      "ldr b26, [x25, #0x0]\n"
+      "ldr b22, [x24, #0x0]\n"
+      "ldr b25, [x23, #0x0]\n"
+      "ldr b24, [x22, #0x0]\n"
+      "ldr b19, [x21, #0x0]\n"
+      "ldr b21, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "11:"  // Odd load end: x19 = number of 4-byte blocks to store (1 to 4), 0x20 output bytes each
+      "zip1 v23.4s, v28.4s, v26.4s\n"
+      "subs x19, x19, #0x1\n"  // flags are consumed by the first "beq 12f" below
+      "zip1 v20.4s, v27.4s, v22.4s\n"
+      "zip1 v16.4s, v23.4s, v20.4s\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "zip1 v18.4s, v25.4s, v19.4s\n"
+      "zip1 v16.4s, v24.4s, v21.4s\n"
+      "zip1 v17.4s, v18.4s, v16.4s\n"
+      "str q17, [%x[out_ptr], #0x10]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "beq 12f\n"  // one block was enough
+      "zip2 v23.4s, v23.4s, v20.4s\n"
+      "zip2 v16.4s, v18.4s, v16.4s\n"
+      "str q23, [%x[out_ptr], #0x0]\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "subs x19, x19, #0x1\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "beq 12f\n"  // two blocks were enough
+      "zip2 v26.4s, v28.4s, v26.4s\n"
+      "zip2 v22.4s, v27.4s, v22.4s\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v20.4s, v26.4s, v22.4s\n"
+      "str q20, [%x[out_ptr], #0x0]\n"
+      "zip2 v19.4s, v25.4s, v19.4s\n"
+      "zip2 v18.4s, v24.4s, v21.4s\n"
+      "zip1 v16.4s, v19.4s, v18.4s\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "beq 12f\n"  // three blocks were enough
+      "zip2 v17.4s, v26.4s, v22.4s\n"
+      "zip2 v16.4s, v19.4s, v18.4s\n"
+      "str q17, [%x[out_ptr], #0x0]\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "12:"  // Odds skip
+      // out_ptr and width are read-write operands (updated in place); height, in and row_offset are inputs only.
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+      : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+    );
+}
+
+template<>
+void interleave_block<8, 4, VLType::None, false>(
+  uint8_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height,
+  size_t row_offset, bool
+)
+{
+  // uint8_t variant: forward to the int8_t specialization via a local int8_t* copy (no int8_t*& alias of out_ptr), then publish the advanced pointer.
+  int8_t * out_cast = reinterpret_cast<int8_t *>(out_ptr);
+  interleave_block<8, 4, VLType::None, false>(out_cast, reinterpret_cast<const int8_t * const *>(in), width, height, row_offset, false);
+  out_ptr = reinterpret_cast<uint8_t *>(out_cast);
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp
new file mode 100644
index 0000000000..2831cb79a6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp
@@ -0,0 +1,370 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<>
+void interleave_block<8, 4, VLType::None, true>(
+  int8_t * &out_ptr, const int8_t * const * in, size_t width, size_t height,
+  size_t row_offset, bool first
+)
+{ // Packs 8 int8 rows in 4-byte blocks (0x20 output bytes per block) and appends eight 32-bit per-row sums (0x20 bytes) at the end.
+  __asm__ __volatile__( // Rows with index >= height reuse the row-0 pointer, so their data and their sums repeat row 0.
+      "movi v1.8h, #0x0\n"  // v1 / v0: 16-bit partial sums for rows 0..3 / rows 4..7
+      "ldr x27, [%x[in], #0x0]\n"  // x27..x20 <- in[0]..in[7] (all eight slots of 'in' are read, whatever height is)
+      "mov x19, #0x0\n"  // x19: main-loop iterations since v1/v0 were last folded into v31/v30
+      "movi v0.8h, #0x0\n"
+      "ldr x26, [%x[in], #0x8]\n"
+      "cmp %x[height], #0x8\n"  // flags are consumed by the "beq 1f" below
+      "movi v31.4s, #0x0\n"  // v31 / v30: 32-bit sums for rows 0..3 / rows 4..7
+      "ldr x25, [%x[in], #0x10]\n"
+      "add x27, x27, %x[row_offset]\n"  // advance each row pointer by row_offset bytes
+      "movi v30.4s, #0x0\n"
+      "ldr x24, [%x[in], #0x18]\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x26, x26, %x[row_offset]\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "add x25, x25, %x[row_offset]\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset]\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset]\n"
+      "add x22, x22, %x[row_offset]\n"
+      "add x21, x21, %x[row_offset]\n"
+      "add x20, x20, %x[row_offset]\n"
+      "beq 1f\n"  // height == 8: keep all eight row pointers
+      "mov x20, x27\n"  // height != 8: row 7 always falls back to row 0
+      "cmp %x[height], #0x2\n"
+      "csel x26, x26, x27, GE\n"  // keep row 1 only if height >= 2
+      "csel x25, x25, x27, GT\n"  // keep row 2 only if height > 2
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n"  // keep row 3 only if height >= 4
+      "csel x23, x23, x27, GT\n"  // keep row 4 only if height > 4
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n"  // keep row 5 only if height >= 6
+      "csel x21, x21, x27, GT\n"  // keep row 6 only if height > 6
+      "1:"  // no_pointer_adj: x27..x20 now hold the eight row pointers actually used
+      "prfm pldl1keep, [x27, #0x0]\n"
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "cbnz %w[first], 2f\n"  // first != 0: start from zeroed sums
+      "sub %x[out_ptr], %x[out_ptr], #0x20\n"  // first == 0: step back 0x20 bytes; new output overwrites that region
+      "ld1 { v31.4s }, [%x[out_ptr]]\n"  // resume from the sums stored there (presumably by a previous call - confirm with caller)
+      "ldr q30, [%x[out_ptr], #0x10]\n"
+      "2:"  // first_pass
+      "cmp %x[width], #0x10\n"
+      "blt 5f\n"  // fewer than 16 bytes in total: go straight to the tail
+      "3:"  // Main loop head: consumes 16 bytes per row and emits 0x80 bytes per iteration
+      "cmp x19, #0x1e\n"
+      "ble 4f\n"  // fold v1/v0 into v31/v30 only once more than 30 iterations have accumulated
+      "sadalp v31.4s, v1.8h\n"
+      "movi v1.8h, #0x0\n"
+      "sadalp v30.4s, v0.8h\n"
+      "movi v0.8h, #0x0\n"
+      "mov x19, #0x0\n"
+      "4:"  // no_accumulate_16
+      "ldr q29, [x27], #0x10\n"
+      "prfm pldl1keep, [x27, #0x70]\n"
+      "ldr q28, [x26], #0x10\n"
+      "ldr q27, [x25], #0x10\n"
+      "zip1 v23.4s, v29.4s, v27.4s\n"  // rows 0 and 2, blocks 0..1 interleaved
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr q21, [x24], #0x10\n"
+      "zip2 v27.4s, v29.4s, v27.4s\n"  // rows 0 and 2, blocks 2..3 interleaved
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr q26, [x23], #0x10\n"
+      "zip1 v20.4s, v28.4s, v21.4s\n"  // rows 1 and 3, blocks 0..1 interleaved
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr q25, [x22], #0x10\n"
+      "zip1 v16.4s, v23.4s, v20.4s\n"  // v16 = block 0 of rows 0,1,2,3
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "ldr q19, [x21], #0x10\n"
+      "zip2 v24.4s, v23.4s, v20.4s\n"  // v24 = block 1 of rows 0,1,2,3
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "zip2 v23.4s, v28.4s, v21.4s\n"
+      "ldr q22, [x20], #0x10\n"
+      "zip1 v18.4s, v26.4s, v19.4s\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "zip1 v21.4s, v27.4s, v23.4s\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "zip1 v17.4s, v25.4s, v22.4s\n"
+      "sadalp v1.8h, v16.16b\n"  // add adjacent signed byte pairs of the rows 0..3 vector into v1
+      "zip1 v16.4s, v18.4s, v17.4s\n"  // v16 = block 0 of rows 4,5,6,7
+      "add x19, x19, #0x1\n"
+      "zip2 v20.4s, v18.4s, v17.4s\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "zip2 v19.4s, v26.4s, v19.4s\n"
+      "sadalp v0.8h, v16.16b\n"  // same for the rows 4..7 vector into v0
+      "zip2 v16.4s, v25.4s, v22.4s\n"
+      "str q24, [%x[out_ptr], #0x20]\n"
+      "zip1 v18.4s, v19.4s, v16.4s\n"
+      "sadalp v1.8h, v24.16b\n"
+      "zip2 v17.4s, v27.4s, v23.4s\n"
+      "str q20, [%x[out_ptr], #0x30]\n"
+      "zip2 v16.4s, v19.4s, v16.4s\n"
+      "str q21, [%x[out_ptr], #0x40]\n"
+      "str q18, [%x[out_ptr], #0x50]\n"
+      "sadalp v0.8h, v20.16b\n"
+      "str q17, [%x[out_ptr], #0x60]\n"
+      "sadalp v1.8h, v21.16b\n"
+      "str q16, [%x[out_ptr], #0x70]\n"
+      "subs %x[width], %x[width], #0x10\n"
+      "sadalp v0.8h, v18.16b\n"
+      "cmp %x[width], #0x10\n"  // this compare sets the flags tested by "bge 3b"
+      "sadalp v1.8h, v17.16b\n"
+      "add %x[out_ptr], %x[out_ptr], #0x80\n"
+      "sadalp v0.8h, v16.16b\n"
+      "bge 3b\n"
+      "5:"  // Main loop skip: fewer than 16 bytes remain
+      "cbz %x[width], 14f\n"  // nothing left: only the sums remain to be written
+      "tbz %x[width], #3, 9f\n"  // fewer than 8 bytes left
+      "ldr d29, [x27], #0x8\n"  // 8 bytes per row; NOTE(review): relies on scalar LDR clearing the remaining vector bits
+      "ldr d28, [x26], #0x8\n"
+      "ldr d27, [x25], #0x8\n"
+      "ldr d21, [x24], #0x8\n"
+      "ldr d26, [x23], #0x8\n"
+      "ldr d25, [x22], #0x8\n"
+      "ldr d19, [x21], #0x8\n"
+      "ldr d22, [x20], #0x8\n"
+      "tbz %x[width], #2, 7f\n"
+      "ld1 { v29.s }[2], [x27], #0x4\n"
+      "ld1 { v28.s }[2], [x26], #0x4\n"
+      "ld1 { v27.s }[2], [x25], #0x4\n"
+      "ld1 { v21.s }[2], [x24], #0x4\n"
+      "ld1 { v26.s }[2], [x23], #0x4\n"
+      "ld1 { v25.s }[2], [x22], #0x4\n"
+      "ld1 { v19.s }[2], [x21], #0x4\n"
+      "ld1 { v22.s }[2], [x20], #0x4\n"
+      "tbz %x[width], #1, 6f\n"
+      "ld1 { v29.h }[6], [x27], #0x2\n"
+      "ld1 { v28.h }[6], [x26], #0x2\n"
+      "ld1 { v27.h }[6], [x25], #0x2\n"
+      "ld1 { v21.h }[6], [x24], #0x2\n"
+      "ld1 { v26.h }[6], [x23], #0x2\n"
+      "ld1 { v25.h }[6], [x22], #0x2\n"
+      "ld1 { v19.h }[6], [x21], #0x2\n"
+      "ld1 { v22.h }[6], [x20], #0x2\n"
+      "mov x19, #0x4\n"  // from here on x19 is reused as the number of 4-byte blocks to store
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v29.b }[14], [x27]\n"
+      "ld1 { v28.b }[14], [x26]\n"
+      "ld1 { v27.b }[14], [x25]\n"
+      "ld1 { v21.b }[14], [x24]\n"
+      "ld1 { v26.b }[14], [x23]\n"
+      "ld1 { v25.b }[14], [x22]\n"
+      "ld1 { v19.b }[14], [x21]\n"
+      "ld1 { v22.b }[14], [x20]\n"
+      "b 13f\n"
+      "6:"  // odd_loads_1_12: 12 bytes loaded so far, at most one more to fetch
+      "mov x19, #0x3\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v29.b }[12], [x27]\n"
+      "ld1 { v28.b }[12], [x26]\n"
+      "ld1 { v27.b }[12], [x25]\n"
+      "ld1 { v21.b }[12], [x24]\n"
+      "ld1 { v26.b }[12], [x23]\n"
+      "ld1 { v25.b }[12], [x22]\n"
+      "ld1 { v19.b }[12], [x21]\n"
+      "ld1 { v22.b }[12], [x20]\n"
+      "mov x19, #0x4\n"  // 13 bytes: a fourth, partially filled block is needed
+      "b 13f\n"
+      "7:"  // odd_loads_2_8: 8 bytes loaded so far, up to 3 more to fetch
+      "tbz %x[width], #1, 8f\n"
+      "ld1 { v29.h }[4], [x27], #0x2\n"
+      "ld1 { v28.h }[4], [x26], #0x2\n"
+      "ld1 { v27.h }[4], [x25], #0x2\n"
+      "ld1 { v21.h }[4], [x24], #0x2\n"
+      "ld1 { v26.h }[4], [x23], #0x2\n"
+      "ld1 { v25.h }[4], [x22], #0x2\n"
+      "ld1 { v19.h }[4], [x21], #0x2\n"
+      "ld1 { v22.h }[4], [x20], #0x2\n"
+      "mov x19, #0x3\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v29.b }[10], [x27]\n"
+      "ld1 { v28.b }[10], [x26]\n"
+      "ld1 { v27.b }[10], [x25]\n"
+      "ld1 { v21.b }[10], [x24]\n"
+      "ld1 { v26.b }[10], [x23]\n"
+      "ld1 { v25.b }[10], [x22]\n"
+      "ld1 { v19.b }[10], [x21]\n"
+      "ld1 { v22.b }[10], [x20]\n"
+      "b 13f\n"
+      "8:"  // odd_loads_1_8: 8 bytes loaded so far, at most one more to fetch
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v29.b }[8], [x27]\n"
+      "ld1 { v28.b }[8], [x26]\n"
+      "ld1 { v27.b }[8], [x25]\n"
+      "ld1 { v21.b }[8], [x24]\n"
+      "ld1 { v26.b }[8], [x23]\n"
+      "ld1 { v25.b }[8], [x22]\n"
+      "ld1 { v19.b }[8], [x21]\n"
+      "ld1 { v22.b }[8], [x20]\n"
+      "mov x19, #0x3\n"
+      "b 13f\n"
+      "9:"  // odd_loads_4_0: between 1 and 7 bytes remain
+      "tbz %x[width], #2, 11f\n"
+      "ldr s29, [x27], #0x4\n"
+      "ldr s28, [x26], #0x4\n"
+      "ldr s27, [x25], #0x4\n"
+      "ldr s21, [x24], #0x4\n"
+      "ldr s26, [x23], #0x4\n"
+      "ldr s25, [x22], #0x4\n"
+      "ldr s19, [x21], #0x4\n"
+      "ldr s22, [x20], #0x4\n"
+      "tbz %x[width], #1, 10f\n"
+      "ld1 { v29.h }[2], [x27], #0x2\n"
+      "ld1 { v28.h }[2], [x26], #0x2\n"
+      "ld1 { v27.h }[2], [x25], #0x2\n"
+      "ld1 { v21.h }[2], [x24], #0x2\n"
+      "ld1 { v26.h }[2], [x23], #0x2\n"
+      "ld1 { v25.h }[2], [x22], #0x2\n"
+      "ld1 { v19.h }[2], [x21], #0x2\n"
+      "ld1 { v22.h }[2], [x20], #0x2\n"
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v29.b }[6], [x27]\n"
+      "ld1 { v28.b }[6], [x26]\n"
+      "ld1 { v27.b }[6], [x25]\n"
+      "ld1 { v21.b }[6], [x24]\n"
+      "ld1 { v26.b }[6], [x23]\n"
+      "ld1 { v25.b }[6], [x22]\n"
+      "ld1 { v19.b }[6], [x21]\n"
+      "ld1 { v22.b }[6], [x20]\n"
+      "b 13f\n"
+      "10:"  // odd_loads_1_4: 4 bytes loaded so far, at most one more to fetch
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v29.b }[4], [x27]\n"
+      "ld1 { v28.b }[4], [x26]\n"
+      "ld1 { v27.b }[4], [x25]\n"
+      "ld1 { v21.b }[4], [x24]\n"
+      "ld1 { v26.b }[4], [x23]\n"
+      "ld1 { v25.b }[4], [x22]\n"
+      "ld1 { v19.b }[4], [x21]\n"
+      "ld1 { v22.b }[4], [x20]\n"
+      "mov x19, #0x2\n"
+      "b 13f\n"
+      "11:"  // odd_loads_2_0: between 1 and 3 bytes remain
+      "tbz %x[width], #1, 12f\n"
+      "ldr h29, [x27], #0x2\n"
+      "ldr h28, [x26], #0x2\n"
+      "ldr h27, [x25], #0x2\n"
+      "ldr h21, [x24], #0x2\n"
+      "ldr h26, [x23], #0x2\n"
+      "ldr h25, [x22], #0x2\n"
+      "ldr h19, [x21], #0x2\n"
+      "ldr h22, [x20], #0x2\n"
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v29.b }[2], [x27]\n"
+      "ld1 { v28.b }[2], [x26]\n"
+      "ld1 { v27.b }[2], [x25]\n"
+      "ld1 { v21.b }[2], [x24]\n"
+      "ld1 { v26.b }[2], [x23]\n"
+      "ld1 { v25.b }[2], [x22]\n"
+      "ld1 { v19.b }[2], [x21]\n"
+      "ld1 { v22.b }[2], [x20]\n"
+      "b 13f\n"
+      "12:"  // odd_loads_1_0: reached only with width non-zero and bits 1..3 clear, i.e. one byte left
+      "ldr b29, [x27, #0x0]\n"
+      "ldr b28, [x26, #0x0]\n"
+      "ldr b27, [x25, #0x0]\n"
+      "ldr b21, [x24, #0x0]\n"
+      "ldr b26, [x23, #0x0]\n"
+      "ldr b25, [x22, #0x0]\n"
+      "ldr b19, [x21, #0x0]\n"
+      "ldr b22, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "13:"  // Odd load end: x19 = number of 4-byte blocks to store (1 to 4), 0x20 output bytes each
+      "zip1 v23.4s, v29.4s, v27.4s\n"
+      "subs x19, x19, #0x1\n"  // flags are consumed by the first "beq 14f" below
+      "zip1 v20.4s, v28.4s, v21.4s\n"
+      "zip1 v16.4s, v23.4s, v20.4s\n"
+      "str q16, [%x[out_ptr], #0x0]\n"
+      "zip1 v18.4s, v26.4s, v19.4s\n"
+      "sadalp v1.8h, v16.16b\n"
+      "zip1 v17.4s, v25.4s, v22.4s\n"
+      "zip1 v16.4s, v18.4s, v17.4s\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "sadalp v0.8h, v16.16b\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "beq 14f\n"  // one block was enough
+      "zip2 v24.4s, v23.4s, v20.4s\n"
+      "zip2 v20.4s, v18.4s, v17.4s\n"
+      "str q24, [%x[out_ptr], #0x0]\n"
+      "sadalp v1.8h, v24.16b\n"
+      "str q20, [%x[out_ptr], #0x10]\n"
+      "sadalp v0.8h, v20.16b\n"
+      "subs x19, x19, #0x1\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "beq 14f\n"  // two blocks were enough
+      "zip2 v27.4s, v29.4s, v27.4s\n"
+      "zip2 v23.4s, v28.4s, v21.4s\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v21.4s, v27.4s, v23.4s\n"
+      "str q21, [%x[out_ptr], #0x0]\n"
+      "zip2 v19.4s, v26.4s, v19.4s\n"
+      "sadalp v1.8h, v21.16b\n"
+      "zip2 v16.4s, v25.4s, v22.4s\n"
+      "zip1 v18.4s, v19.4s, v16.4s\n"
+      "str q18, [%x[out_ptr], #0x10]\n"
+      "sadalp v0.8h, v18.16b\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "beq 14f\n"  // three blocks were enough
+      "zip2 v17.4s, v27.4s, v23.4s\n"
+      "zip2 v16.4s, v19.4s, v16.4s\n"
+      "str q17, [%x[out_ptr], #0x0]\n"
+      "sadalp v1.8h, v17.16b\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "sadalp v0.8h, v16.16b\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "14:"  // Odds skip: fold the remaining 16-bit partials and append the eight 32-bit row sums
+      "sadalp v31.4s, v1.8h\n"
+      "sadalp v30.4s, v0.8h\n"
+      "str q31, [%x[out_ptr], #0x0]\n"  // sums for rows 0..3
+      "str q30, [%x[out_ptr], #0x10]\n"  // sums for rows 4..7
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+      : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp
new file mode 100644
index 0000000000..7c7857bcd0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp
@@ -0,0 +1,370 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<> // uint8_t specialisation: interleaves 8 rows in 4-byte blocks and appends eight 32-bit per-row sums.
+void interleave_block<8, 4, VLType::None, true>(
+  uint8_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height, // out_ptr is advanced past every byte written; in[0..7] are all loaded, even when height < 8
+  size_t row_offset, bool first // row_offset is added to each row pointer; first == false resumes from the sums stored just before out_ptr
+)
+{
+  __asm__ __volatile__(
+      "movi v1.8h, #0x0\n" // v1 / v0: 16-bit partial sums for rows 0-3 / rows 4-7
+      "ldr x27, [%x[in], #0x0]\n" // x27..x20 receive the pointers for rows 0..7
+      "mov x19, #0x0\n" // x19: main-loop iterations since the 16-bit sums were last widened
+      "movi v0.8h, #0x0\n"
+      "ldr x26, [%x[in], #0x8]\n"
+      "cmp %x[height], #0x8\n" // flags are consumed by "beq 1f" below; nothing in between sets flags
+      "movi v31.4s, #0x0\n" // v31 / v30: 32-bit sums for rows 0-3 / rows 4-7
+      "ldr x25, [%x[in], #0x10]\n"
+      "add x27, x27, %x[row_offset]\n"
+      "movi v30.4s, #0x0\n"
+      "ldr x24, [%x[in], #0x18]\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x26, x26, %x[row_offset]\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "add x25, x25, %x[row_offset]\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset]\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset]\n"
+      "add x22, x22, %x[row_offset]\n"
+      "add x21, x21, %x[row_offset]\n"
+      "add x20, x20, %x[row_offset]\n"
+      "beq 1f\n" // height == 8: keep all eight pointers as loaded
+      "mov x20, x27\n" // height != 8: row 7 is read from row 0
+      "cmp %x[height], #0x2\n" // the csel pairs keep row i (1..6) only when height > i, otherwise substitute row 0
+      "csel x26, x26, x27, GE\n"
+      "csel x25, x25, x27, GT\n"
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n"
+      "csel x23, x23, x27, GT\n"
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n"
+      "csel x21, x21, x27, GT\n"
+      "1:"  // no_pointer_adj
+      "prfm pldl1keep, [x27, #0x0]\n" // prefetch offsets 0x0 and 0x40 of every row before starting
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "cbnz %w[first], 2f\n" // first != 0: start from the zeroed sums
+      "sub %x[out_ptr], %x[out_ptr], #0x20\n" // otherwise step back over the 32 bytes of sums preceding out_ptr (they get overwritten by new data)
+      "ld1 { v31.4s }, [%x[out_ptr]]\n" // reload those sums as the starting 32-bit totals
+      "ldr q30, [%x[out_ptr], #0x10]\n"
+      "2:"  // first_pass
+      "cmp %x[width], #0x10\n"
+      "blt 5f\n" // fewer than 16 columns: go straight to the tail
+      "3:"  // Main loop head
+      "cmp x19, #0x1e\n" // widen after 31 iterations: 31*4 loop + up to 4 tail uadalp steps of <= 510 each stay <= 65535
+      "ble 4f\n"
+      "uadalp v31.4s, v1.8h\n" // fold the 16-bit partial sums into the 32-bit totals, then restart them
+      "movi v1.8h, #0x0\n"
+      "uadalp v30.4s, v0.8h\n"
+      "movi v0.8h, #0x0\n"
+      "mov x19, #0x0\n"
+      "4:"  // no_accumulate_16
+      "ldr q29, [x27], #0x10\n" // 16 bytes per row, post-incremented: rows 0..7 land in v29, v28, v27, v21, v26, v25, v19, v22
+      "prfm pldl1keep, [x27, #0x70]\n"
+      "ldr q28, [x26], #0x10\n"
+      "ldr q27, [x25], #0x10\n"
+      "zip1 v23.4s, v29.4s, v27.4s\n" // two levels of zip1/zip2 on .4s lanes gather the same 4-byte block of four rows into one q register
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr q21, [x24], #0x10\n"
+      "zip2 v27.4s, v29.4s, v27.4s\n"
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr q26, [x23], #0x10\n"
+      "zip1 v20.4s, v28.4s, v21.4s\n"
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr q25, [x22], #0x10\n"
+      "zip1 v16.4s, v23.4s, v20.4s\n"
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "ldr q19, [x21], #0x10\n"
+      "zip2 v24.4s, v23.4s, v20.4s\n"
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "zip2 v23.4s, v28.4s, v21.4s\n"
+      "ldr q22, [x20], #0x10\n"
+      "zip1 v18.4s, v26.4s, v19.4s\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "str q16, [%x[out_ptr], #0x0]\n" // block 0, rows 0-3
+      "zip1 v21.4s, v27.4s, v23.4s\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "zip1 v17.4s, v25.4s, v22.4s\n"
+      "uadalp v1.8h, v16.16b\n" // add adjacent byte pairs into v1: two 16-bit lanes per row
+      "zip1 v16.4s, v18.4s, v17.4s\n"
+      "add x19, x19, #0x1\n"
+      "zip2 v20.4s, v18.4s, v17.4s\n"
+      "str q16, [%x[out_ptr], #0x10]\n" // block 0, rows 4-7
+      "zip2 v19.4s, v26.4s, v19.4s\n"
+      "uadalp v0.8h, v16.16b\n"
+      "zip2 v16.4s, v25.4s, v22.4s\n"
+      "str q24, [%x[out_ptr], #0x20]\n" // block 1, rows 0-3
+      "zip1 v18.4s, v19.4s, v16.4s\n"
+      "uadalp v1.8h, v24.16b\n"
+      "zip2 v17.4s, v27.4s, v23.4s\n"
+      "str q20, [%x[out_ptr], #0x30]\n" // block 1, rows 4-7
+      "zip2 v16.4s, v19.4s, v16.4s\n"
+      "str q21, [%x[out_ptr], #0x40]\n" // block 2, rows 0-3
+      "str q18, [%x[out_ptr], #0x50]\n" // block 2, rows 4-7
+      "uadalp v0.8h, v20.16b\n"
+      "str q17, [%x[out_ptr], #0x60]\n" // block 3, rows 0-3
+      "uadalp v1.8h, v21.16b\n"
+      "str q16, [%x[out_ptr], #0x70]\n" // block 3, rows 4-7
+      "subs %x[width], %x[width], #0x10\n"
+      "uadalp v0.8h, v18.16b\n"
+      "cmp %x[width], #0x10\n" // this compare (not the subs above) decides the "bge 3b"
+      "uadalp v1.8h, v17.16b\n"
+      "add %x[out_ptr], %x[out_ptr], #0x80\n"
+      "uadalp v0.8h, v16.16b\n"
+      "bge 3b\n" // loop while at least 16 columns remain
+      "5:"  // Main loop skip
+      "cbz %x[width], 14f\n" // no leftover columns: only the sums remain to be written
+      "tbz %x[width], #3, 9f\n" // tail: 1..15 bytes per row are loaded piecewise, steered by bits 3..0 of width
+      "ldr d29, [x27], #0x8\n" // scalar SIMD loads zero the rest of the register, so bytes never loaded stay zero
+      "ldr d28, [x26], #0x8\n"
+      "ldr d27, [x25], #0x8\n"
+      "ldr d21, [x24], #0x8\n"
+      "ldr d26, [x23], #0x8\n"
+      "ldr d25, [x22], #0x8\n"
+      "ldr d19, [x21], #0x8\n"
+      "ldr d22, [x20], #0x8\n"
+      "tbz %x[width], #2, 7f\n"
+      "ld1 { v29.s }[2], [x27], #0x4\n"
+      "ld1 { v28.s }[2], [x26], #0x4\n"
+      "ld1 { v27.s }[2], [x25], #0x4\n"
+      "ld1 { v21.s }[2], [x24], #0x4\n"
+      "ld1 { v26.s }[2], [x23], #0x4\n"
+      "ld1 { v25.s }[2], [x22], #0x4\n"
+      "ld1 { v19.s }[2], [x21], #0x4\n"
+      "ld1 { v22.s }[2], [x20], #0x4\n"
+      "tbz %x[width], #1, 6f\n"
+      "ld1 { v29.h }[6], [x27], #0x2\n"
+      "ld1 { v28.h }[6], [x26], #0x2\n"
+      "ld1 { v27.h }[6], [x25], #0x2\n"
+      "ld1 { v21.h }[6], [x24], #0x2\n"
+      "ld1 { v26.h }[6], [x23], #0x2\n"
+      "ld1 { v25.h }[6], [x22], #0x2\n"
+      "ld1 { v19.h }[6], [x21], #0x2\n"
+      "ld1 { v22.h }[6], [x20], #0x2\n"
+      "mov x19, #0x4\n" // from here on x19 is the number of 4-byte blocks the tail must emit (remaining width / 4, rounded up)
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v29.b }[14], [x27]\n"
+      "ld1 { v28.b }[14], [x26]\n"
+      "ld1 { v27.b }[14], [x25]\n"
+      "ld1 { v21.b }[14], [x24]\n"
+      "ld1 { v26.b }[14], [x23]\n"
+      "ld1 { v25.b }[14], [x22]\n"
+      "ld1 { v19.b }[14], [x21]\n"
+      "ld1 { v22.b }[14], [x20]\n"
+      "b 13f\n"
+      "6:"  // odd_loads_1_12
+      "mov x19, #0x3\n" // 12 bytes loaded so far: 3 blocks, or 4 if one more byte follows
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v29.b }[12], [x27]\n"
+      "ld1 { v28.b }[12], [x26]\n"
+      "ld1 { v27.b }[12], [x25]\n"
+      "ld1 { v21.b }[12], [x24]\n"
+      "ld1 { v26.b }[12], [x23]\n"
+      "ld1 { v25.b }[12], [x22]\n"
+      "ld1 { v19.b }[12], [x21]\n"
+      "ld1 { v22.b }[12], [x20]\n"
+      "mov x19, #0x4\n"
+      "b 13f\n"
+      "7:"  // odd_loads_2_8
+      "tbz %x[width], #1, 8f\n"
+      "ld1 { v29.h }[4], [x27], #0x2\n"
+      "ld1 { v28.h }[4], [x26], #0x2\n"
+      "ld1 { v27.h }[4], [x25], #0x2\n"
+      "ld1 { v21.h }[4], [x24], #0x2\n"
+      "ld1 { v26.h }[4], [x23], #0x2\n"
+      "ld1 { v25.h }[4], [x22], #0x2\n"
+      "ld1 { v19.h }[4], [x21], #0x2\n"
+      "ld1 { v22.h }[4], [x20], #0x2\n"
+      "mov x19, #0x3\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v29.b }[10], [x27]\n"
+      "ld1 { v28.b }[10], [x26]\n"
+      "ld1 { v27.b }[10], [x25]\n"
+      "ld1 { v21.b }[10], [x24]\n"
+      "ld1 { v26.b }[10], [x23]\n"
+      "ld1 { v25.b }[10], [x22]\n"
+      "ld1 { v19.b }[10], [x21]\n"
+      "ld1 { v22.b }[10], [x20]\n"
+      "b 13f\n"
+      "8:"  // odd_loads_1_8
+      "mov x19, #0x2\n" // 8 bytes loaded so far: 2 blocks, or 3 if one more byte follows
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v29.b }[8], [x27]\n"
+      "ld1 { v28.b }[8], [x26]\n"
+      "ld1 { v27.b }[8], [x25]\n"
+      "ld1 { v21.b }[8], [x24]\n"
+      "ld1 { v26.b }[8], [x23]\n"
+      "ld1 { v25.b }[8], [x22]\n"
+      "ld1 { v19.b }[8], [x21]\n"
+      "ld1 { v22.b }[8], [x20]\n"
+      "mov x19, #0x3\n"
+      "b 13f\n"
+      "9:"  // odd_loads_4_0
+      "tbz %x[width], #2, 11f\n"
+      "ldr s29, [x27], #0x4\n"
+      "ldr s28, [x26], #0x4\n"
+      "ldr s27, [x25], #0x4\n"
+      "ldr s21, [x24], #0x4\n"
+      "ldr s26, [x23], #0x4\n"
+      "ldr s25, [x22], #0x4\n"
+      "ldr s19, [x21], #0x4\n"
+      "ldr s22, [x20], #0x4\n"
+      "tbz %x[width], #1, 10f\n"
+      "ld1 { v29.h }[2], [x27], #0x2\n"
+      "ld1 { v28.h }[2], [x26], #0x2\n"
+      "ld1 { v27.h }[2], [x25], #0x2\n"
+      "ld1 { v21.h }[2], [x24], #0x2\n"
+      "ld1 { v26.h }[2], [x23], #0x2\n"
+      "ld1 { v25.h }[2], [x22], #0x2\n"
+      "ld1 { v19.h }[2], [x21], #0x2\n"
+      "ld1 { v22.h }[2], [x20], #0x2\n"
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v29.b }[6], [x27]\n"
+      "ld1 { v28.b }[6], [x26]\n"
+      "ld1 { v27.b }[6], [x25]\n"
+      "ld1 { v21.b }[6], [x24]\n"
+      "ld1 { v26.b }[6], [x23]\n"
+      "ld1 { v25.b }[6], [x22]\n"
+      "ld1 { v19.b }[6], [x21]\n"
+      "ld1 { v22.b }[6], [x20]\n"
+      "b 13f\n"
+      "10:"  // odd_loads_1_4
+      "mov x19, #0x1\n" // 4 bytes loaded so far: 1 block, or 2 if one more byte follows
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v29.b }[4], [x27]\n"
+      "ld1 { v28.b }[4], [x26]\n"
+      "ld1 { v27.b }[4], [x25]\n"
+      "ld1 { v21.b }[4], [x24]\n"
+      "ld1 { v26.b }[4], [x23]\n"
+      "ld1 { v25.b }[4], [x22]\n"
+      "ld1 { v19.b }[4], [x21]\n"
+      "ld1 { v22.b }[4], [x20]\n"
+      "mov x19, #0x2\n"
+      "b 13f\n"
+      "11:"  // odd_loads_2_0
+      "tbz %x[width], #1, 12f\n"
+      "ldr h29, [x27], #0x2\n"
+      "ldr h28, [x26], #0x2\n"
+      "ldr h27, [x25], #0x2\n"
+      "ldr h21, [x24], #0x2\n"
+      "ldr h26, [x23], #0x2\n"
+      "ldr h25, [x22], #0x2\n"
+      "ldr h19, [x21], #0x2\n"
+      "ldr h22, [x20], #0x2\n"
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v29.b }[2], [x27]\n"
+      "ld1 { v28.b }[2], [x26]\n"
+      "ld1 { v27.b }[2], [x25]\n"
+      "ld1 { v21.b }[2], [x24]\n"
+      "ld1 { v26.b }[2], [x23]\n"
+      "ld1 { v25.b }[2], [x22]\n"
+      "ld1 { v19.b }[2], [x21]\n"
+      "ld1 { v22.b }[2], [x20]\n"
+      "b 13f\n"
+      "12:"  // odd_loads_1_0
+      "ldr b29, [x27, #0x0]\n" // bits 3..1 of width are clear and width != 0, so exactly one byte remains
+      "ldr b28, [x26, #0x0]\n"
+      "ldr b27, [x25, #0x0]\n"
+      "ldr b21, [x24, #0x0]\n"
+      "ldr b26, [x23, #0x0]\n"
+      "ldr b25, [x22, #0x0]\n"
+      "ldr b19, [x21, #0x0]\n"
+      "ldr b22, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "13:"  // Odd load end
+      "zip1 v23.4s, v29.4s, v27.4s\n" // same regrouping as the main loop, emitting one 32-byte block at a time
+      "subs x19, x19, #0x1\n" // flags survive until the "beq 14f" after this block's stores
+      "zip1 v20.4s, v28.4s, v21.4s\n"
+      "zip1 v16.4s, v23.4s, v20.4s\n"
+      "str q16, [%x[out_ptr], #0x0]\n" // tail block 0 is always written
+      "zip1 v18.4s, v26.4s, v19.4s\n"
+      "uadalp v1.8h, v16.16b\n"
+      "zip1 v17.4s, v25.4s, v22.4s\n"
+      "zip1 v16.4s, v18.4s, v17.4s\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "uadalp v0.8h, v16.16b\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "beq 14f\n" // stop as soon as the last required block has been written
+      "zip2 v24.4s, v23.4s, v20.4s\n"
+      "zip2 v20.4s, v18.4s, v17.4s\n"
+      "str q24, [%x[out_ptr], #0x0]\n" // tail block 1
+      "uadalp v1.8h, v24.16b\n"
+      "str q20, [%x[out_ptr], #0x10]\n"
+      "uadalp v0.8h, v20.16b\n"
+      "subs x19, x19, #0x1\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "beq 14f\n"
+      "zip2 v27.4s, v29.4s, v27.4s\n"
+      "zip2 v23.4s, v28.4s, v21.4s\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v21.4s, v27.4s, v23.4s\n"
+      "str q21, [%x[out_ptr], #0x0]\n" // tail block 2
+      "zip2 v19.4s, v26.4s, v19.4s\n"
+      "uadalp v1.8h, v21.16b\n"
+      "zip2 v16.4s, v25.4s, v22.4s\n"
+      "zip1 v18.4s, v19.4s, v16.4s\n"
+      "str q18, [%x[out_ptr], #0x10]\n"
+      "uadalp v0.8h, v18.16b\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "beq 14f\n"
+      "zip2 v17.4s, v27.4s, v23.4s\n"
+      "zip2 v16.4s, v19.4s, v16.4s\n"
+      "str q17, [%x[out_ptr], #0x0]\n" // tail block 3
+      "uadalp v1.8h, v17.16b\n"
+      "str q16, [%x[out_ptr], #0x10]\n"
+      "uadalp v0.8h, v16.16b\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      "14:"  // Odds skip
+      "uadalp v31.4s, v1.8h\n" // final widening of the outstanding 16-bit partial sums
+      "uadalp v30.4s, v0.8h\n"
+      "str q31, [%x[out_ptr], #0x0]\n" // append the eight 32-bit sums: rows 0-3 ...
+      "str q30, [%x[out_ptr], #0x10]\n" // ... then rows 4-7
+      "add %x[out_ptr], %x[out_ptr], #0x20\n" // out_ptr ends just past the sums; a later call with first == false steps back over them
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // both are modified in place; width is only the by-value parameter copy
+      : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp
new file mode 100644
index 0000000000..704a4c9210
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp
@@ -0,0 +1,319 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<> // int8_t specialisation: interleaves 8 rows in 8-byte blocks; no row sums are produced.
+void interleave_block<8, 8, VLType::None, false>(
+  int8_t * &out_ptr, const int8_t * const * in, size_t width, size_t height, // out_ptr is advanced past every byte written; in[0..7] are all loaded, even when height < 8
+  size_t row_offset, bool // row_offset is added to each row pointer; the trailing bool is unnamed and unused here
+)
+{
+  __asm__ __volatile__(
+      "ldr x27, [%x[in], #0x0]\n" // x27..x20 receive the pointers for rows 0..7
+      "cmp %x[height], #0x8\n" // flags are consumed by "beq 1f" below; nothing in between sets flags
+      "ldr x26, [%x[in], #0x8]\n"
+      "add x27, x27, %x[row_offset]\n"
+      "ldr x25, [%x[in], #0x10]\n"
+      "ldr x24, [%x[in], #0x18]\n"
+      "add x26, x26, %x[row_offset]\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x25, x25, %x[row_offset]\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset]\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset]\n"
+      "add x22, x22, %x[row_offset]\n"
+      "add x21, x21, %x[row_offset]\n"
+      "add x20, x20, %x[row_offset]\n"
+      "beq 1f\n" // height == 8: keep all eight pointers as loaded
+      "mov x20, x27\n" // height != 8: row 7 is read from row 0
+      "cmp %x[height], #0x2\n" // the csel pairs keep row i (1..6) only when height > i, otherwise substitute row 0
+      "csel x26, x26, x27, GE\n"
+      "csel x25, x25, x27, GT\n"
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n"
+      "csel x23, x23, x27, GT\n"
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n"
+      "csel x21, x21, x27, GT\n"
+      "1:"  // no_pointer_adj
+      "prfm pldl1keep, [x27, #0x0]\n" // prefetch offsets 0x0 and 0x40 of every row before starting
+      "cmp %x[width], #0x10\n" // result is used by "blt 3f" after the prefetches (prfm leaves flags untouched)
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "blt 3f\n" // fewer than 16 columns: go straight to the tail
+      "2:"  // Main loop head
+      "ldr q27, [x27], #0x10\n" // 16 bytes per row, post-incremented: rows 0..7 land in v27, v24, v25, v21, v22, v18, v19, v16
+      "prfm pldl1keep, [x27, #0x70]\n"
+      "ldr q24, [x26], #0x10\n"
+      "zip1 v26.2d, v27.2d, v24.2d\n" // zip1 pairs the low 8 bytes of two adjacent rows, zip2 the high 8 bytes
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr q25, [x25], #0x10\n"
+      "zip2 v24.2d, v27.2d, v24.2d\n"
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr q21, [x24], #0x10\n"
+      "zip1 v23.2d, v25.2d, v21.2d\n"
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr q22, [x23], #0x10\n"
+      "zip2 v21.2d, v25.2d, v21.2d\n"
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "ldr q18, [x22], #0x10\n"
+      "zip1 v20.2d, v22.2d, v18.2d\n"
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "ldr q19, [x21], #0x10\n"
+      "zip2 v18.2d, v22.2d, v18.2d\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "ldr q16, [x20], #0x10\n"
+      "zip1 v17.2d, v19.2d, v16.2d\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "str q26, [%x[out_ptr], #0x0]\n" // block 0: rows 0-1
+      "zip2 v16.2d, v19.2d, v16.2d\n"
+      "str q23, [%x[out_ptr], #0x10]\n" // block 0: rows 2-3
+      "str q20, [%x[out_ptr], #0x20]\n" // block 0: rows 4-5
+      "str q17, [%x[out_ptr], #0x30]\n" // block 0: rows 6-7
+      "str q24, [%x[out_ptr], #0x40]\n" // block 1: rows 0-1
+      "str q21, [%x[out_ptr], #0x50]\n" // block 1: rows 2-3
+      "str q18, [%x[out_ptr], #0x60]\n" // block 1: rows 4-5
+      "str q16, [%x[out_ptr], #0x70]\n" // block 1: rows 6-7
+      "subs %x[width], %x[width], #0x10\n"
+      "cmp %x[width], #0x10\n" // this compare (not the subs above) decides the "bge 2b"
+      "add %x[out_ptr], %x[out_ptr], #0x80\n"
+      "bge 2b\n" // loop while at least 16 columns remain
+      "3:"  // Main loop skip
+      "cbz %x[width], 12f\n" // no leftover columns: nothing more to write
+      "tbz %x[width], #3, 7f\n" // tail: 1..15 bytes per row are loaded piecewise, steered by bits 3..0 of width
+      "ldr d27, [x27], #0x8\n" // scalar SIMD loads zero the rest of the register, so bytes never loaded stay zero
+      "ldr d24, [x26], #0x8\n"
+      "ldr d25, [x25], #0x8\n"
+      "ldr d21, [x24], #0x8\n"
+      "ldr d22, [x23], #0x8\n"
+      "ldr d18, [x22], #0x8\n"
+      "ldr d19, [x21], #0x8\n"
+      "ldr d16, [x20], #0x8\n"
+      "tbz %x[width], #2, 5f\n"
+      "ld1 { v27.s }[2], [x27], #0x4\n"
+      "ld1 { v24.s }[2], [x26], #0x4\n"
+      "ld1 { v25.s }[2], [x25], #0x4\n"
+      "ld1 { v21.s }[2], [x24], #0x4\n"
+      "ld1 { v22.s }[2], [x23], #0x4\n"
+      "ld1 { v18.s }[2], [x22], #0x4\n"
+      "ld1 { v19.s }[2], [x21], #0x4\n"
+      "ld1 { v16.s }[2], [x20], #0x4\n"
+      "tbz %x[width], #1, 4f\n"
+      "ld1 { v27.h }[6], [x27], #0x2\n"
+      "ld1 { v24.h }[6], [x26], #0x2\n"
+      "ld1 { v25.h }[6], [x25], #0x2\n"
+      "ld1 { v21.h }[6], [x24], #0x2\n"
+      "ld1 { v22.h }[6], [x23], #0x2\n"
+      "ld1 { v18.h }[6], [x22], #0x2\n"
+      "ld1 { v19.h }[6], [x21], #0x2\n"
+      "ld1 { v16.h }[6], [x20], #0x2\n"
+      "mov x19, #0x2\n" // x19 is the number of 8-byte blocks the tail must emit (remaining width / 8, rounded up)
+      "tbz %x[width], #0, 11f\n"
+      "ld1 { v27.b }[14], [x27]\n"
+      "ld1 { v24.b }[14], [x26]\n"
+      "ld1 { v25.b }[14], [x25]\n"
+      "ld1 { v21.b }[14], [x24]\n"
+      "ld1 { v22.b }[14], [x23]\n"
+      "ld1 { v18.b }[14], [x22]\n"
+      "ld1 { v19.b }[14], [x21]\n"
+      "ld1 { v16.b }[14], [x20]\n"
+      "b 11f\n"
+      "4:"  // odd_loads_1_12
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 11f\n"
+      "ld1 { v27.b }[12], [x27]\n"
+      "ld1 { v24.b }[12], [x26]\n"
+      "ld1 { v25.b }[12], [x25]\n"
+      "ld1 { v21.b }[12], [x24]\n"
+      "ld1 { v22.b }[12], [x23]\n"
+      "ld1 { v18.b }[12], [x22]\n"
+      "ld1 { v19.b }[12], [x21]\n"
+      "ld1 { v16.b }[12], [x20]\n"
+      "b 11f\n"
+      "5:"  // odd_loads_2_8
+      "tbz %x[width], #1, 6f\n"
+      "ld1 { v27.h }[4], [x27], #0x2\n"
+      "ld1 { v24.h }[4], [x26], #0x2\n"
+      "ld1 { v25.h }[4], [x25], #0x2\n"
+      "ld1 { v21.h }[4], [x24], #0x2\n"
+      "ld1 { v22.h }[4], [x23], #0x2\n"
+      "ld1 { v18.h }[4], [x22], #0x2\n"
+      "ld1 { v19.h }[4], [x21], #0x2\n"
+      "ld1 { v16.h }[4], [x20], #0x2\n"
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 11f\n"
+      "ld1 { v27.b }[10], [x27]\n"
+      "ld1 { v24.b }[10], [x26]\n"
+      "ld1 { v25.b }[10], [x25]\n"
+      "ld1 { v21.b }[10], [x24]\n"
+      "ld1 { v22.b }[10], [x23]\n"
+      "ld1 { v18.b }[10], [x22]\n"
+      "ld1 { v19.b }[10], [x21]\n"
+      "ld1 { v16.b }[10], [x20]\n"
+      "b 11f\n"
+      "6:"  // odd_loads_1_8
+      "mov x19, #0x1\n" // exactly 8 bytes loaded so far: 1 block, or 2 if one more byte follows
+      "tbz %x[width], #0, 11f\n"
+      "ld1 { v27.b }[8], [x27]\n"
+      "ld1 { v24.b }[8], [x26]\n"
+      "ld1 { v25.b }[8], [x25]\n"
+      "ld1 { v21.b }[8], [x24]\n"
+      "ld1 { v22.b }[8], [x23]\n"
+      "ld1 { v18.b }[8], [x22]\n"
+      "ld1 { v19.b }[8], [x21]\n"
+      "ld1 { v16.b }[8], [x20]\n"
+      "mov x19, #0x2\n"
+      "b 11f\n"
+      "7:"  // odd_loads_4_0
+      "tbz %x[width], #2, 9f\n" // bit 3 clear: fewer than 8 bytes remain, so every path below emits a single block
+      "ldr s27, [x27], #0x4\n"
+      "ldr s24, [x26], #0x4\n"
+      "ldr s25, [x25], #0x4\n"
+      "ldr s21, [x24], #0x4\n"
+      "ldr s22, [x23], #0x4\n"
+      "ldr s18, [x22], #0x4\n"
+      "ldr s19, [x21], #0x4\n"
+      "ldr s16, [x20], #0x4\n"
+      "tbz %x[width], #1, 8f\n"
+      "ld1 { v27.h }[2], [x27], #0x2\n"
+      "ld1 { v24.h }[2], [x26], #0x2\n"
+      "ld1 { v25.h }[2], [x25], #0x2\n"
+      "ld1 { v21.h }[2], [x24], #0x2\n"
+      "ld1 { v22.h }[2], [x23], #0x2\n"
+      "ld1 { v18.h }[2], [x22], #0x2\n"
+      "ld1 { v19.h }[2], [x21], #0x2\n"
+      "ld1 { v16.h }[2], [x20], #0x2\n"
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 11f\n"
+      "ld1 { v27.b }[6], [x27]\n"
+      "ld1 { v24.b }[6], [x26]\n"
+      "ld1 { v25.b }[6], [x25]\n"
+      "ld1 { v21.b }[6], [x24]\n"
+      "ld1 { v22.b }[6], [x23]\n"
+      "ld1 { v18.b }[6], [x22]\n"
+      "ld1 { v19.b }[6], [x21]\n"
+      "ld1 { v16.b }[6], [x20]\n"
+      "b 11f\n"
+      "8:"  // odd_loads_1_4
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 11f\n"
+      "ld1 { v27.b }[4], [x27]\n"
+      "ld1 { v24.b }[4], [x26]\n"
+      "ld1 { v25.b }[4], [x25]\n"
+      "ld1 { v21.b }[4], [x24]\n"
+      "ld1 { v22.b }[4], [x23]\n"
+      "ld1 { v18.b }[4], [x22]\n"
+      "ld1 { v19.b }[4], [x21]\n"
+      "ld1 { v16.b }[4], [x20]\n"
+      "b 11f\n"
+      "9:"  // odd_loads_2_0
+      "tbz %x[width], #1, 10f\n"
+      "ldr h27, [x27], #0x2\n"
+      "ldr h24, [x26], #0x2\n"
+      "ldr h25, [x25], #0x2\n"
+      "ldr h21, [x24], #0x2\n"
+      "ldr h22, [x23], #0x2\n"
+      "ldr h18, [x22], #0x2\n"
+      "ldr h19, [x21], #0x2\n"
+      "ldr h16, [x20], #0x2\n"
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 11f\n"
+      "ld1 { v27.b }[2], [x27]\n"
+      "ld1 { v24.b }[2], [x26]\n"
+      "ld1 { v25.b }[2], [x25]\n"
+      "ld1 { v21.b }[2], [x24]\n"
+      "ld1 { v22.b }[2], [x23]\n"
+      "ld1 { v18.b }[2], [x22]\n"
+      "ld1 { v19.b }[2], [x21]\n"
+      "ld1 { v16.b }[2], [x20]\n"
+      "b 11f\n"
+      "10:"  // odd_loads_1_0
+      "ldr b27, [x27, #0x0]\n" // bits 3..1 of width are clear and width != 0, so exactly one byte remains
+      "ldr b24, [x26, #0x0]\n"
+      "ldr b25, [x25, #0x0]\n"
+      "ldr b21, [x24, #0x0]\n"
+      "ldr b22, [x23, #0x0]\n"
+      "ldr b18, [x22, #0x0]\n"
+      "ldr b19, [x21, #0x0]\n"
+      "ldr b16, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "11:"  // Odd load end
+      "zip1 v26.2d, v27.2d, v24.2d\n" // tail block 0 (low 8 bytes of each row) is always written
+      "subs x19, x19, #0x1\n" // flags survive until the "beq 12f" after this block's stores
+      "zip1 v23.2d, v25.2d, v21.2d\n"
+      "str q26, [%x[out_ptr], #0x0]\n"
+      "zip1 v20.2d, v22.2d, v18.2d\n"
+      "str q23, [%x[out_ptr], #0x10]\n"
+      "zip1 v17.2d, v19.2d, v16.2d\n"
+      "str q20, [%x[out_ptr], #0x20]\n"
+      "str q17, [%x[out_ptr], #0x30]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x40\n"
+      "beq 12f\n" // only one block was needed
+      "zip2 v24.2d, v27.2d, v24.2d\n" // tail block 1 (high 8 bytes of each row)
+      "zip2 v21.2d, v25.2d, v21.2d\n"
+      "str q24, [%x[out_ptr], #0x0]\n"
+      "zip2 v18.2d, v22.2d, v18.2d\n"
+      "str q21, [%x[out_ptr], #0x10]\n"
+      "zip2 v16.2d, v19.2d, v16.2d\n"
+      "str q18, [%x[out_ptr], #0x20]\n"
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x40\n"
+      "12:"  // Odds skip
+
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // both are modified in place; width is only the by-value parameter copy
+      : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+    );
+}
+
+template<> // uint8_t variant of the 8x8 non-summing interleave: forwards to the int8_t specialisation, which only moves bytes.
+void interleave_block<8, 8, VLType::None, false>(
+  uint8_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height, // out_ptr is advanced exactly as the int8_t kernel advances its own pointer
+  size_t row_offset, bool // the trailing bool is unnamed and unused; "false" is forwarded
+)
+{
+  // Use a local int8_t* copy instead of rebinding out_ptr as int8_t*&: that would access a uint8_t* object through an unrelated pointer type.
+  int8_t *out_signed = reinterpret_cast<int8_t *>(out_ptr);
+  interleave_block<8, 8, VLType::None, false>(out_signed, reinterpret_cast<const int8_t * const *>(in), width, height, row_offset, false);
+  out_ptr = reinterpret_cast<uint8_t *>(out_signed); // hand the advanced write position back to the caller
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp
new file mode 100644
index 0000000000..2317ece790
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<>
+void interleave_block<8, 8, VLType::None, true>(
+  int8_t * &out_ptr, const int8_t * const * in, size_t width, size_t height,
+  size_t row_offset, bool first
+)
+{
+  __asm__ __volatile__(
+      "movi v5.8h, #0x0\n"
+      "ldr x27, [%x[in], #0x0]\n"
+      "mov x19, #0x0\n"
+      "movi v4.8h, #0x0\n"
+      "ldr x26, [%x[in], #0x8]\n"
+      "cmp %x[height], #0x8\n"
+      "movi v3.8h, #0x0\n"
+      "ldr x25, [%x[in], #0x10]\n"
+      "add x27, x27, %x[row_offset]\n"
+      "movi v2.8h, #0x0\n"
+      "ldr x24, [%x[in], #0x18]\n"
+      "movi v1.4s, #0x0\n"
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x26, x26, %x[row_offset]\n"
+      "movi v0.4s, #0x0\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "add x25, x25, %x[row_offset]\n"
+      "movi v31.4s, #0x0\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset]\n"
+      "movi v30.4s, #0x0\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset]\n"
+      "add x22, x22, %x[row_offset]\n"
+      "add x21, x21, %x[row_offset]\n"
+      "add x20, x20, %x[row_offset]\n"
+      "beq 1f\n"
+      "mov x20, x27\n"
+      "cmp %x[height], #0x2\n"
+      "csel x26, x26, x27, GE\n"
+      "csel x25, x25, x27, GT\n"
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n"
+      "csel x23, x23, x27, GT\n"
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n"
+      "csel x21, x21, x27, GT\n"
+      "1:"  // no_pointer_adj
+      "movi v29.4s, #0x0\n"
+      "prfm pldl1keep, [x27, #0x0]\n"
+      "movi v28.4s, #0x0\n"
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "cbnz %w[first], 2f\n"
+      "sub %x[out_ptr], %x[out_ptr], #0x20\n"
+      "ld1 { v29.4s }, [%x[out_ptr]]\n"
+      "ldr q28, [%x[out_ptr], #0x10]\n"
+      "2:"  // first_pass
+      "cmp %x[width], #0x10\n"
+      "blt 5f\n"
+      "3:"  // Main loop head
+      "cmp x19, #0x3e\n"
+      "ble 4f\n"
+      "sadalp v1.4s, v5.8h\n"
+      "movi v5.8h, #0x0\n"
+      "sadalp v0.4s, v4.8h\n"
+      "movi v4.8h, #0x0\n"
+      "sadalp v31.4s, v3.8h\n"
+      "movi v3.8h, #0x0\n"
+      "sadalp v30.4s, v2.8h\n"
+      "movi v2.8h, #0x0\n"
+      "mov x19, #0x0\n"
+      "4:"  // no_accumulate_16
+      "ldr q27, [x27], #0x10\n"
+      "prfm pldl1keep, [x27, #0x70]\n"
+      "ldr q24, [x26], #0x10\n"
+      "zip1 v26.2d, v27.2d, v24.2d\n"
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr q25, [x25], #0x10\n"
+      "zip2 v24.2d, v27.2d, v24.2d\n"
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr q21, [x24], #0x10\n"
+      "zip1 v23.2d, v25.2d, v21.2d\n"
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr q22, [x23], #0x10\n"
+      "zip2 v21.2d, v25.2d, v21.2d\n"
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "ldr q18, [x22], #0x10\n"
+      "zip1 v20.2d, v22.2d, v18.2d\n"
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "ldr q19, [x21], #0x10\n"
+      "zip2 v18.2d, v22.2d, v18.2d\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "ldr q16, [x20], #0x10\n"
+      "zip1 v17.2d, v19.2d, v16.2d\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "str q26, [%x[out_ptr], #0x0]\n"
+      "zip2 v16.2d, v19.2d, v16.2d\n"
+      "sadalp v5.8h, v26.16b\n"
+      "str q23, [%x[out_ptr], #0x10]\n"
+      "sadalp v4.8h, v23.16b\n"
+      "str q20, [%x[out_ptr], #0x20]\n"
+      "sadalp v3.8h, v20.16b\n"
+      "str q17, [%x[out_ptr], #0x30]\n"
+      "sadalp v2.8h, v17.16b\n"
+      "str q24, [%x[out_ptr], #0x40]\n"
+      "sadalp v5.8h, v24.16b\n"
+      "str q21, [%x[out_ptr], #0x50]\n"
+      "sadalp v4.8h, v21.16b\n"
+      "str q18, [%x[out_ptr], #0x60]\n"
+      "sadalp v3.8h, v18.16b\n"
+      "str q16, [%x[out_ptr], #0x70]\n"
+      "sadalp v2.8h, v16.16b\n"
+      "add x19, x19, #0x1\n"
+      "subs %x[width], %x[width], #0x10\n"
+      "cmp %x[width], #0x10\n"
+      "add %x[out_ptr], %x[out_ptr], #0x80\n"
+      "bge 3b\n"
+      "5:"  // Main loop skip
+      "cbz %x[width], 14f\n"
+      "tbz %x[width], #3, 9f\n"
+      "ldr d27, [x27], #0x8\n"
+      "ldr d24, [x26], #0x8\n"
+      "ldr d25, [x25], #0x8\n"
+      "ldr d21, [x24], #0x8\n"
+      "ldr d22, [x23], #0x8\n"
+      "ldr d18, [x22], #0x8\n"
+      "ldr d19, [x21], #0x8\n"
+      "ldr d16, [x20], #0x8\n"
+      "tbz %x[width], #2, 7f\n"
+      "ld1 { v27.s }[2], [x27], #0x4\n"
+      "ld1 { v24.s }[2], [x26], #0x4\n"
+      "ld1 { v25.s }[2], [x25], #0x4\n"
+      "ld1 { v21.s }[2], [x24], #0x4\n"
+      "ld1 { v22.s }[2], [x23], #0x4\n"
+      "ld1 { v18.s }[2], [x22], #0x4\n"
+      "ld1 { v19.s }[2], [x21], #0x4\n"
+      "ld1 { v16.s }[2], [x20], #0x4\n"
+      "tbz %x[width], #1, 6f\n"
+      "ld1 { v27.h }[6], [x27], #0x2\n"
+      "ld1 { v24.h }[6], [x26], #0x2\n"
+      "ld1 { v25.h }[6], [x25], #0x2\n"
+      "ld1 { v21.h }[6], [x24], #0x2\n"
+      "ld1 { v22.h }[6], [x23], #0x2\n"
+      "ld1 { v18.h }[6], [x22], #0x2\n"
+      "ld1 { v19.h }[6], [x21], #0x2\n"
+      "ld1 { v16.h }[6], [x20], #0x2\n"
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v27.b }[14], [x27]\n"
+      "ld1 { v24.b }[14], [x26]\n"
+      "ld1 { v25.b }[14], [x25]\n"
+      "ld1 { v21.b }[14], [x24]\n"
+      "ld1 { v22.b }[14], [x23]\n"
+      "ld1 { v18.b }[14], [x22]\n"
+      "ld1 { v19.b }[14], [x21]\n"
+      "ld1 { v16.b }[14], [x20]\n"
+      "b 13f\n"
+      "6:"  // odd_loads_1_12
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v27.b }[12], [x27]\n"
+      "ld1 { v24.b }[12], [x26]\n"
+      "ld1 { v25.b }[12], [x25]\n"
+      "ld1 { v21.b }[12], [x24]\n"
+      "ld1 { v22.b }[12], [x23]\n"
+      "ld1 { v18.b }[12], [x22]\n"
+      "ld1 { v19.b }[12], [x21]\n"
+      "ld1 { v16.b }[12], [x20]\n"
+      "b 13f\n"
+      "7:"  // odd_loads_2_8
+      "tbz %x[width], #1, 8f\n"
+      "ld1 { v27.h }[4], [x27], #0x2\n"
+      "ld1 { v24.h }[4], [x26], #0x2\n"
+      "ld1 { v25.h }[4], [x25], #0x2\n"
+      "ld1 { v21.h }[4], [x24], #0x2\n"
+      "ld1 { v22.h }[4], [x23], #0x2\n"
+      "ld1 { v18.h }[4], [x22], #0x2\n"
+      "ld1 { v19.h }[4], [x21], #0x2\n"
+      "ld1 { v16.h }[4], [x20], #0x2\n"
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v27.b }[10], [x27]\n"
+      "ld1 { v24.b }[10], [x26]\n"
+      "ld1 { v25.b }[10], [x25]\n"
+      "ld1 { v21.b }[10], [x24]\n"
+      "ld1 { v22.b }[10], [x23]\n"
+      "ld1 { v18.b }[10], [x22]\n"
+      "ld1 { v19.b }[10], [x21]\n"
+      "ld1 { v16.b }[10], [x20]\n"
+      "b 13f\n"
+      "8:"  // odd_loads_1_8
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v27.b }[8], [x27]\n"
+      "ld1 { v24.b }[8], [x26]\n"
+      "ld1 { v25.b }[8], [x25]\n"
+      "ld1 { v21.b }[8], [x24]\n"
+      "ld1 { v22.b }[8], [x23]\n"
+      "ld1 { v18.b }[8], [x22]\n"
+      "ld1 { v19.b }[8], [x21]\n"
+      "ld1 { v16.b }[8], [x20]\n"
+      "mov x19, #0x2\n"
+      "b 13f\n"
+      "9:"  // odd_loads_4_0
+      "tbz %x[width], #2, 11f\n"
+      "ldr s27, [x27], #0x4\n"
+      "ldr s24, [x26], #0x4\n"
+      "ldr s25, [x25], #0x4\n"
+      "ldr s21, [x24], #0x4\n"
+      "ldr s22, [x23], #0x4\n"
+      "ldr s18, [x22], #0x4\n"
+      "ldr s19, [x21], #0x4\n"
+      "ldr s16, [x20], #0x4\n"
+      "tbz %x[width], #1, 10f\n"
+      "ld1 { v27.h }[2], [x27], #0x2\n"
+      "ld1 { v24.h }[2], [x26], #0x2\n"
+      "ld1 { v25.h }[2], [x25], #0x2\n"
+      "ld1 { v21.h }[2], [x24], #0x2\n"
+      "ld1 { v22.h }[2], [x23], #0x2\n"
+      "ld1 { v18.h }[2], [x22], #0x2\n"
+      "ld1 { v19.h }[2], [x21], #0x2\n"
+      "ld1 { v16.h }[2], [x20], #0x2\n"
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v27.b }[6], [x27]\n"
+      "ld1 { v24.b }[6], [x26]\n"
+      "ld1 { v25.b }[6], [x25]\n"
+      "ld1 { v21.b }[6], [x24]\n"
+      "ld1 { v22.b }[6], [x23]\n"
+      "ld1 { v18.b }[6], [x22]\n"
+      "ld1 { v19.b }[6], [x21]\n"
+      "ld1 { v16.b }[6], [x20]\n"
+      "b 13f\n"
+      "10:"  // odd_loads_1_4
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v27.b }[4], [x27]\n"
+      "ld1 { v24.b }[4], [x26]\n"
+      "ld1 { v25.b }[4], [x25]\n"
+      "ld1 { v21.b }[4], [x24]\n"
+      "ld1 { v22.b }[4], [x23]\n"
+      "ld1 { v18.b }[4], [x22]\n"
+      "ld1 { v19.b }[4], [x21]\n"
+      "ld1 { v16.b }[4], [x20]\n"
+      "b 13f\n"
+      "11:"  // odd_loads_2_0
+      "tbz %x[width], #1, 12f\n"
+      "ldr h27, [x27], #0x2\n"
+      "ldr h24, [x26], #0x2\n"
+      "ldr h25, [x25], #0x2\n"
+      "ldr h21, [x24], #0x2\n"
+      "ldr h22, [x23], #0x2\n"
+      "ldr h18, [x22], #0x2\n"
+      "ldr h19, [x21], #0x2\n"
+      "ldr h16, [x20], #0x2\n"
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v27.b }[2], [x27]\n"
+      "ld1 { v24.b }[2], [x26]\n"
+      "ld1 { v25.b }[2], [x25]\n"
+      "ld1 { v21.b }[2], [x24]\n"
+      "ld1 { v22.b }[2], [x23]\n"
+      "ld1 { v18.b }[2], [x22]\n"
+      "ld1 { v19.b }[2], [x21]\n"
+      "ld1 { v16.b }[2], [x20]\n"
+      "b 13f\n"
+      "12:"  // odd_loads_1_0
+      "ldr b27, [x27, #0x0]\n"
+      "ldr b24, [x26, #0x0]\n"
+      "ldr b25, [x25, #0x0]\n"
+      "ldr b21, [x24, #0x0]\n"
+      "ldr b22, [x23, #0x0]\n"
+      "ldr b18, [x22, #0x0]\n"
+      "ldr b19, [x21, #0x0]\n"
+      "ldr b16, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "13:"  // Odd load end
+      "zip1 v26.2d, v27.2d, v24.2d\n"
+      "subs x19, x19, #0x1\n"
+      "zip1 v23.2d, v25.2d, v21.2d\n"
+      "str q26, [%x[out_ptr], #0x0]\n"
+      "zip1 v20.2d, v22.2d, v18.2d\n"
+      "sadalp v5.8h, v26.16b\n"
+      "zip1 v17.2d, v19.2d, v16.2d\n"
+      "str q23, [%x[out_ptr], #0x10]\n"
+      "sadalp v4.8h, v23.16b\n"
+      "str q20, [%x[out_ptr], #0x20]\n"
+      "sadalp v3.8h, v20.16b\n"
+      "str q17, [%x[out_ptr], #0x30]\n"
+      "sadalp v2.8h, v17.16b\n"
+      "add %x[out_ptr], %x[out_ptr], #0x40\n"
+      "beq 14f\n"
+      "zip2 v24.2d, v27.2d, v24.2d\n"
+      "zip2 v21.2d, v25.2d, v21.2d\n"
+      "str q24, [%x[out_ptr], #0x0]\n"
+      "zip2 v18.2d, v22.2d, v18.2d\n"
+      "sadalp v5.8h, v24.16b\n"
+      "zip2 v16.2d, v19.2d, v16.2d\n"
+      "str q21, [%x[out_ptr], #0x10]\n"
+      "sadalp v4.8h, v21.16b\n"
+      "str q18, [%x[out_ptr], #0x20]\n"
+      "sadalp v3.8h, v18.16b\n"
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "sadalp v2.8h, v16.16b\n"
+      "add %x[out_ptr], %x[out_ptr], #0x40\n"
+      "14:"  // Odds skip
+      "sadalp v1.4s, v5.8h\n"
+      "sadalp v0.4s, v4.8h\n"
+      "addp v1.4s, v1.4s, v0.4s\n"
+      "sadalp v31.4s, v3.8h\n"
+      "sadalp v30.4s, v2.8h\n"
+      "add v1.4s, v1.4s, v29.4s\n"
+      "str q1, [%x[out_ptr], #0x0]\n"
+      "addp v0.4s, v31.4s, v30.4s\n"
+      "add v0.4s, v0.4s, v28.4s\n"
+      "str q0, [%x[out_ptr], #0x10]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+      : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp
new file mode 100644
index 0000000000..07164d6b24
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<>  // AArch64 specialisation: interleaves 8 rows of uint8_t in 8-byte blocks and appends eight 32-bit row sums.
+void interleave_block<8, 8, VLType::None, true>(
+  uint8_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height,  // out_ptr is read-write: it ends up past everything stored
+  size_t row_offset, bool first  // row_offset is added to every row pointer; first == false resumes from the sums stored just before out_ptr
+)
+{
+  __asm__ __volatile__(
+      "movi v5.8h, #0x0\n"  // v2-v5: 16-bit partial sums, two rows per register
+      "ldr x27, [%x[in], #0x0]\n"  // x27, x26, ... x20 receive the row pointers in[0] ... in[7]
+      "mov x19, #0x0\n"  // x19: main-loop iterations since the 16-bit sums were last widened
+      "movi v4.8h, #0x0\n"
+      "ldr x26, [%x[in], #0x8]\n"
+      "cmp %x[height], #0x8\n"  // flags are consumed by the "beq 1f" below (nothing in between sets them)
+      "movi v3.8h, #0x0\n"
+      "ldr x25, [%x[in], #0x10]\n"
+      "add x27, x27, %x[row_offset]\n"
+      "movi v2.8h, #0x0\n"
+      "ldr x24, [%x[in], #0x18]\n"
+      "movi v1.4s, #0x0\n"  // v1, v0, v31, v30: 32-bit sums that the 16-bit partials are widened into
+      "ldr x23, [%x[in], #0x20]\n"
+      "add x26, x26, %x[row_offset]\n"
+      "movi v0.4s, #0x0\n"
+      "ldr x22, [%x[in], #0x28]\n"
+      "add x25, x25, %x[row_offset]\n"
+      "movi v31.4s, #0x0\n"
+      "ldr x21, [%x[in], #0x30]\n"
+      "add x24, x24, %x[row_offset]\n"
+      "movi v30.4s, #0x0\n"
+      "ldr x20, [%x[in], #0x38]\n"
+      "add x23, x23, %x[row_offset]\n"
+      "add x22, x22, %x[row_offset]\n"
+      "add x21, x21, %x[row_offset]\n"
+      "add x20, x20, %x[row_offset]\n"
+      "beq 1f\n"  // height == 8: all eight pointers are usable as loaded
+      "mov x20, x27\n"  // height != 8: every row index >= height is redirected to row 0's pointer,
+      "cmp %x[height], #0x2\n"  // so the pointers read for out of range rows are never dereferenced
+      "csel x26, x26, x27, GE\n"
+      "csel x25, x25, x27, GT\n"
+      "cmp %x[height], #0x4\n"
+      "csel x24, x24, x27, GE\n"
+      "csel x23, x23, x27, GT\n"
+      "cmp %x[height], #0x6\n"
+      "csel x22, x22, x27, GE\n"
+      "csel x21, x21, x27, GT\n"
+      "1:"  // no_pointer_adj
+      "movi v29.4s, #0x0\n"  // v29/v28: sums carried over from the previous pass (zero on the first pass)
+      "prfm pldl1keep, [x27, #0x0]\n"
+      "movi v28.4s, #0x0\n"
+      "prfm pldl1keep, [x26, #0x0]\n"
+      "prfm pldl1keep, [x25, #0x0]\n"
+      "prfm pldl1keep, [x24, #0x0]\n"
+      "prfm pldl1keep, [x23, #0x0]\n"
+      "prfm pldl1keep, [x22, #0x0]\n"
+      "prfm pldl1keep, [x21, #0x0]\n"
+      "prfm pldl1keep, [x20, #0x0]\n"
+      "prfm pldl1keep, [x27, #0x40]\n"
+      "prfm pldl1keep, [x26, #0x40]\n"
+      "prfm pldl1keep, [x25, #0x40]\n"
+      "prfm pldl1keep, [x24, #0x40]\n"
+      "prfm pldl1keep, [x23, #0x40]\n"
+      "prfm pldl1keep, [x22, #0x40]\n"
+      "prfm pldl1keep, [x21, #0x40]\n"
+      "prfm pldl1keep, [x20, #0x40]\n"
+      "cbnz %w[first], 2f\n"  // first pass: start from zero sums
+      "sub %x[out_ptr], %x[out_ptr], #0x20\n"  // later pass: step back over the 32 bytes of sums stored last time,
+      "ld1 { v29.4s }, [%x[out_ptr]]\n"  // reload them, and let the new data overwrite that slot
+      "ldr q28, [%x[out_ptr], #0x10]\n"
+      "2:"  // first_pass
+      "cmp %x[width], #0x10\n"
+      "blt 5f\n"  // fewer than 16 columns: go straight to the tail handling
+      "3:"  // Main loop head
+      "cmp x19, #0x3e\n"  // after 63 iterations, widen the 16-bit partials into the 32-bit sums
+      "ble 4f\n"  // and clear them, which keeps the 16-bit lanes from overflowing
+      "uadalp v1.4s, v5.8h\n"
+      "movi v5.8h, #0x0\n"
+      "uadalp v0.4s, v4.8h\n"
+      "movi v4.8h, #0x0\n"
+      "uadalp v31.4s, v3.8h\n"
+      "movi v3.8h, #0x0\n"
+      "uadalp v30.4s, v2.8h\n"
+      "movi v2.8h, #0x0\n"
+      "mov x19, #0x0\n"
+      "4:"  // no_accumulate_16
+      "ldr q27, [x27], #0x10\n"  // each iteration consumes 16 bytes from every row
+      "prfm pldl1keep, [x27, #0x70]\n"
+      "ldr q24, [x26], #0x10\n"
+      "zip1 v26.2d, v27.2d, v24.2d\n"  // zip1: first 8 bytes of rows 0 and 1 side by side
+      "prfm pldl1keep, [x26, #0x70]\n"
+      "ldr q25, [x25], #0x10\n"
+      "zip2 v24.2d, v27.2d, v24.2d\n"  // zip2: second 8 bytes of rows 0 and 1 side by side
+      "prfm pldl1keep, [x25, #0x70]\n"
+      "ldr q21, [x24], #0x10\n"
+      "zip1 v23.2d, v25.2d, v21.2d\n"
+      "prfm pldl1keep, [x24, #0x70]\n"
+      "ldr q22, [x23], #0x10\n"
+      "zip2 v21.2d, v25.2d, v21.2d\n"
+      "prfm pldl1keep, [x23, #0x70]\n"
+      "ldr q18, [x22], #0x10\n"
+      "zip1 v20.2d, v22.2d, v18.2d\n"
+      "prfm pldl1keep, [x22, #0x70]\n"
+      "ldr q19, [x21], #0x10\n"
+      "zip2 v18.2d, v22.2d, v18.2d\n"
+      "prfm pldl1keep, [x21, #0x70]\n"
+      "ldr q16, [x20], #0x10\n"
+      "zip1 v17.2d, v19.2d, v16.2d\n"
+      "prfm pldl1keep, [x20, #0x70]\n"
+      "str q26, [%x[out_ptr], #0x0]\n"  // first 64 bytes stored: columns 0-7 of rows 0..7
+      "zip2 v16.2d, v19.2d, v16.2d\n"
+      "uadalp v5.8h, v26.16b\n"  // add adjacent byte pairs into the 16-bit partial sums
+      "str q23, [%x[out_ptr], #0x10]\n"
+      "uadalp v4.8h, v23.16b\n"
+      "str q20, [%x[out_ptr], #0x20]\n"
+      "uadalp v3.8h, v20.16b\n"
+      "str q17, [%x[out_ptr], #0x30]\n"
+      "uadalp v2.8h, v17.16b\n"
+      "str q24, [%x[out_ptr], #0x40]\n"  // second 64 bytes stored: columns 8-15 of rows 0..7
+      "uadalp v5.8h, v24.16b\n"
+      "str q21, [%x[out_ptr], #0x50]\n"
+      "uadalp v4.8h, v21.16b\n"
+      "str q18, [%x[out_ptr], #0x60]\n"
+      "uadalp v3.8h, v18.16b\n"
+      "str q16, [%x[out_ptr], #0x70]\n"
+      "uadalp v2.8h, v16.16b\n"
+      "add x19, x19, #0x1\n"
+      "subs %x[width], %x[width], #0x10\n"
+      "cmp %x[width], #0x10\n"
+      "add %x[out_ptr], %x[out_ptr], #0x80\n"
+      "bge 3b\n"  // loop while at least 16 columns remain
+      "5:"  // Main loop skip
+      "cbz %x[width], 14f\n"  // nothing left over: skip the tail
+      "tbz %x[width], #3, 9f\n"  // tail (1..15 columns): decode the remaining width bit by bit
+      "ldr d27, [x27], #0x8\n"  // scalar ldr clears the rest of the register, so unread columns stay zero
+      "ldr d24, [x26], #0x8\n"
+      "ldr d25, [x25], #0x8\n"
+      "ldr d21, [x24], #0x8\n"
+      "ldr d22, [x23], #0x8\n"
+      "ldr d18, [x22], #0x8\n"
+      "ldr d19, [x21], #0x8\n"
+      "ldr d16, [x20], #0x8\n"
+      "tbz %x[width], #2, 7f\n"
+      "ld1 { v27.s }[2], [x27], #0x4\n"
+      "ld1 { v24.s }[2], [x26], #0x4\n"
+      "ld1 { v25.s }[2], [x25], #0x4\n"
+      "ld1 { v21.s }[2], [x24], #0x4\n"
+      "ld1 { v22.s }[2], [x23], #0x4\n"
+      "ld1 { v18.s }[2], [x22], #0x4\n"
+      "ld1 { v19.s }[2], [x21], #0x4\n"
+      "ld1 { v16.s }[2], [x20], #0x4\n"
+      "tbz %x[width], #1, 6f\n"
+      "ld1 { v27.h }[6], [x27], #0x2\n"
+      "ld1 { v24.h }[6], [x26], #0x2\n"
+      "ld1 { v25.h }[6], [x25], #0x2\n"
+      "ld1 { v21.h }[6], [x24], #0x2\n"
+      "ld1 { v22.h }[6], [x23], #0x2\n"
+      "ld1 { v18.h }[6], [x22], #0x2\n"
+      "ld1 { v19.h }[6], [x21], #0x2\n"
+      "ld1 { v16.h }[6], [x20], #0x2\n"
+      "mov x19, #0x2\n"  // from here on x19 is reused: number of 64-byte groups the tail must store (1 or 2)
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v27.b }[14], [x27]\n"
+      "ld1 { v24.b }[14], [x26]\n"
+      "ld1 { v25.b }[14], [x25]\n"
+      "ld1 { v21.b }[14], [x24]\n"
+      "ld1 { v22.b }[14], [x23]\n"
+      "ld1 { v18.b }[14], [x22]\n"
+      "ld1 { v19.b }[14], [x21]\n"
+      "ld1 { v16.b }[14], [x20]\n"
+      "b 13f\n"
+      "6:"  // odd_loads_1_12
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v27.b }[12], [x27]\n"
+      "ld1 { v24.b }[12], [x26]\n"
+      "ld1 { v25.b }[12], [x25]\n"
+      "ld1 { v21.b }[12], [x24]\n"
+      "ld1 { v22.b }[12], [x23]\n"
+      "ld1 { v18.b }[12], [x22]\n"
+      "ld1 { v19.b }[12], [x21]\n"
+      "ld1 { v16.b }[12], [x20]\n"
+      "b 13f\n"
+      "7:"  // odd_loads_2_8
+      "tbz %x[width], #1, 8f\n"
+      "ld1 { v27.h }[4], [x27], #0x2\n"
+      "ld1 { v24.h }[4], [x26], #0x2\n"
+      "ld1 { v25.h }[4], [x25], #0x2\n"
+      "ld1 { v21.h }[4], [x24], #0x2\n"
+      "ld1 { v22.h }[4], [x23], #0x2\n"
+      "ld1 { v18.h }[4], [x22], #0x2\n"
+      "ld1 { v19.h }[4], [x21], #0x2\n"
+      "ld1 { v16.h }[4], [x20], #0x2\n"
+      "mov x19, #0x2\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v27.b }[10], [x27]\n"
+      "ld1 { v24.b }[10], [x26]\n"
+      "ld1 { v25.b }[10], [x25]\n"
+      "ld1 { v21.b }[10], [x24]\n"
+      "ld1 { v22.b }[10], [x23]\n"
+      "ld1 { v18.b }[10], [x22]\n"
+      "ld1 { v19.b }[10], [x21]\n"
+      "ld1 { v16.b }[10], [x20]\n"
+      "b 13f\n"
+      "8:"  // odd_loads_1_8
+      "mov x19, #0x1\n"  // exactly 8 columns left: only the first group is needed
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v27.b }[8], [x27]\n"
+      "ld1 { v24.b }[8], [x26]\n"
+      "ld1 { v25.b }[8], [x25]\n"
+      "ld1 { v21.b }[8], [x24]\n"
+      "ld1 { v22.b }[8], [x23]\n"
+      "ld1 { v18.b }[8], [x22]\n"
+      "ld1 { v19.b }[8], [x21]\n"
+      "ld1 { v16.b }[8], [x20]\n"
+      "mov x19, #0x2\n"  // a ninth column exists, so the second group is needed as well
+      "b 13f\n"
+      "9:"  // odd_loads_4_0
+      "tbz %x[width], #2, 11f\n"
+      "ldr s27, [x27], #0x4\n"
+      "ldr s24, [x26], #0x4\n"
+      "ldr s25, [x25], #0x4\n"
+      "ldr s21, [x24], #0x4\n"
+      "ldr s22, [x23], #0x4\n"
+      "ldr s18, [x22], #0x4\n"
+      "ldr s19, [x21], #0x4\n"
+      "ldr s16, [x20], #0x4\n"
+      "tbz %x[width], #1, 10f\n"
+      "ld1 { v27.h }[2], [x27], #0x2\n"
+      "ld1 { v24.h }[2], [x26], #0x2\n"
+      "ld1 { v25.h }[2], [x25], #0x2\n"
+      "ld1 { v21.h }[2], [x24], #0x2\n"
+      "ld1 { v22.h }[2], [x23], #0x2\n"
+      "ld1 { v18.h }[2], [x22], #0x2\n"
+      "ld1 { v19.h }[2], [x21], #0x2\n"
+      "ld1 { v16.h }[2], [x20], #0x2\n"
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v27.b }[6], [x27]\n"
+      "ld1 { v24.b }[6], [x26]\n"
+      "ld1 { v25.b }[6], [x25]\n"
+      "ld1 { v21.b }[6], [x24]\n"
+      "ld1 { v22.b }[6], [x23]\n"
+      "ld1 { v18.b }[6], [x22]\n"
+      "ld1 { v19.b }[6], [x21]\n"
+      "ld1 { v16.b }[6], [x20]\n"
+      "b 13f\n"
+      "10:"  // odd_loads_1_4
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v27.b }[4], [x27]\n"
+      "ld1 { v24.b }[4], [x26]\n"
+      "ld1 { v25.b }[4], [x25]\n"
+      "ld1 { v21.b }[4], [x24]\n"
+      "ld1 { v22.b }[4], [x23]\n"
+      "ld1 { v18.b }[4], [x22]\n"
+      "ld1 { v19.b }[4], [x21]\n"
+      "ld1 { v16.b }[4], [x20]\n"
+      "b 13f\n"
+      "11:"  // odd_loads_2_0
+      "tbz %x[width], #1, 12f\n"
+      "ldr h27, [x27], #0x2\n"
+      "ldr h24, [x26], #0x2\n"
+      "ldr h25, [x25], #0x2\n"
+      "ldr h21, [x24], #0x2\n"
+      "ldr h22, [x23], #0x2\n"
+      "ldr h18, [x22], #0x2\n"
+      "ldr h19, [x21], #0x2\n"
+      "ldr h16, [x20], #0x2\n"
+      "mov x19, #0x1\n"
+      "tbz %x[width], #0, 13f\n"
+      "ld1 { v27.b }[2], [x27]\n"
+      "ld1 { v24.b }[2], [x26]\n"
+      "ld1 { v25.b }[2], [x25]\n"
+      "ld1 { v21.b }[2], [x24]\n"
+      "ld1 { v22.b }[2], [x23]\n"
+      "ld1 { v18.b }[2], [x22]\n"
+      "ld1 { v19.b }[2], [x21]\n"
+      "ld1 { v16.b }[2], [x20]\n"
+      "b 13f\n"
+      "12:"  // odd_loads_1_0
+      "ldr b27, [x27, #0x0]\n"
+      "ldr b24, [x26, #0x0]\n"
+      "ldr b25, [x25, #0x0]\n"
+      "ldr b21, [x24, #0x0]\n"
+      "ldr b22, [x23, #0x0]\n"
+      "ldr b18, [x22, #0x0]\n"
+      "ldr b19, [x21, #0x0]\n"
+      "ldr b16, [x20, #0x0]\n"
+      "mov x19, #0x1\n"
+      "13:"  // Odd load end
+      "zip1 v26.2d, v27.2d, v24.2d\n"
+      "subs x19, x19, #0x1\n"  // flags are consumed by the "beq 14f" below (nothing in between sets them)
+      "zip1 v23.2d, v25.2d, v21.2d\n"
+      "str q26, [%x[out_ptr], #0x0]\n"
+      "zip1 v20.2d, v22.2d, v18.2d\n"
+      "uadalp v5.8h, v26.16b\n"
+      "zip1 v17.2d, v19.2d, v16.2d\n"
+      "str q23, [%x[out_ptr], #0x10]\n"
+      "uadalp v4.8h, v23.16b\n"
+      "str q20, [%x[out_ptr], #0x20]\n"
+      "uadalp v3.8h, v20.16b\n"
+      "str q17, [%x[out_ptr], #0x30]\n"
+      "uadalp v2.8h, v17.16b\n"
+      "add %x[out_ptr], %x[out_ptr], #0x40\n"
+      "beq 14f\n"  // only one group was needed
+      "zip2 v24.2d, v27.2d, v24.2d\n"
+      "zip2 v21.2d, v25.2d, v21.2d\n"
+      "str q24, [%x[out_ptr], #0x0]\n"
+      "zip2 v18.2d, v22.2d, v18.2d\n"
+      "uadalp v5.8h, v24.16b\n"
+      "zip2 v16.2d, v19.2d, v16.2d\n"
+      "str q21, [%x[out_ptr], #0x10]\n"
+      "uadalp v4.8h, v21.16b\n"
+      "str q18, [%x[out_ptr], #0x20]\n"
+      "uadalp v3.8h, v18.16b\n"
+      "str q16, [%x[out_ptr], #0x30]\n"
+      "uadalp v2.8h, v16.16b\n"
+      "add %x[out_ptr], %x[out_ptr], #0x40\n"
+      "14:"  // Odds skip
+      "uadalp v1.4s, v5.8h\n"  // final widening of the 16-bit partials
+      "uadalp v0.4s, v4.8h\n"
+      "addp v1.4s, v1.4s, v0.4s\n"  // pairwise add leaves one 32-bit total per row: rows 0-3
+      "uadalp v31.4s, v3.8h\n"
+      "uadalp v30.4s, v2.8h\n"
+      "add v1.4s, v1.4s, v29.4s\n"  // fold in the sums carried over from the previous pass
+      "str q1, [%x[out_ptr], #0x0]\n"
+      "addp v0.4s, v31.4s, v30.4s\n"  // rows 4-7
+      "add v0.4s, v0.4s, v28.4s\n"
+      "str q0, [%x[out_ptr], #0x10]\n"
+      "add %x[out_ptr], %x[out_ptr], #0x20\n"  // leave out_ptr just past the 32 bytes of sums
+      : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+      : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+    );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/list.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/list.hpp
new file mode 100644
index 0000000000..52b49c0f0c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/list.hpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "a32_interleave6_block1_fp32_fp32.hpp"
+#include "a64_interleave4_block16_s8_s8.hpp"
+#include "a64_interleave4_block16_s8_s8_summing.hpp"
+#include "a64_interleave4_block16_u8_u8_summing.hpp"
+#include "a64_interleave8_block1_bf16_fp32.hpp"
+#include "a64_interleave8_block1_fp16_fp16.hpp"
+#include "a64_interleave8_block1_fp16_fp32.hpp"
+#include "a64_interleave8_block1_fp32_fp32.hpp"
+#include "a64_interleave8_block1_s16_s16.hpp"
+#include "a64_interleave8_block1_s16_s16_summing.hpp"
+#include "a64_interleave8_block1_s8_s16.hpp"
+#include "a64_interleave8_block1_s8_s16_summing.hpp"
+#include "a64_interleave8_block1_u16_u16_summing.hpp"
+#include "a64_interleave8_block1_u8_u16.hpp"
+#include "a64_interleave8_block1_u8_u16_summing.hpp"
+#include "a64_interleave8_block2_bf16_bf16.hpp"
+#include "a64_interleave8_block2_fp32_fp32.hpp"
+#include "a64_interleave8_block4_bf16_bf16.hpp"
+#include "a64_interleave8_block4_s8_s8.hpp"
+#include "a64_interleave8_block4_s8_s8_summing.hpp"
+#include "a64_interleave8_block4_u8_u8_summing.hpp"
+#include "a64_interleave8_block8_s8_s8.hpp"
+#include "a64_interleave8_block8_s8_s8_summing.hpp"
+#include "a64_interleave8_block8_u8_u8_summing.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp b/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp
new file mode 100644
index 0000000000..2b3e170a3b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp
@@ -0,0 +1,409 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "asmlib.hpp"
+#include "convolution_parameters.hpp"
+#include "convolver.hpp"
+#include "interleave_indirect.hpp"
+#include "bfloat.hpp"
+
+#include <alloca.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <tuple>
+#include <type_traits>
+#include <vector>
+
+#include <arm_neon.h>
+
+#include "utils.hpp"
+
+namespace arm_gemm {
+
+/*
+ * Core function that does heavy lifting - interleave 'int_by' rows of width 'width' together.
+ *
+ * 'height' indicates the actual number of rows to interleave, so if it's less than int_by then the remaining
+ * entries are padded (note that this is "GEMM" padding rather than convolution padding, so there is no need to pad
+ * with a particular value).
+ *
+ * Note that it is not expected for this templated version to ever be used - all cases that matter should be
+ * explicitly specialized with an optimized implementation.
+ */
+template<unsigned int height_vectors, unsigned int block, VLType vlt, bool integrate_sums, typename TIn, typename TOut>
+void interleave_block( TOut * &out, const TIn * const *in, size_t width, size_t height, size_t row_offset, bool first) {
+    // Generic (unoptimised) interleave.  For each step of 'block' columns across the width it
+    // emits 'block' entries for every row of the group, writing zeros for missing rows and
+    // columns, and optionally keeps a running sum per row.  'out' is advanced past all output.
+
+    // Number of rows woven together per output group.  For SVE the template count is scaled by
+    // get_vector_length<TOut>() / block; otherwise it is used as given.
+    const unsigned int rows_per_group = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
+
+    // One running 32-bit total per interleaved row.  Left empty unless sums are being integrated.
+    std::vector<int32_t> row_totals(integrate_sums ? rows_per_group : 0u, 0);
+
+    if (integrate_sums && !first) {
+        // Every pass ends by storing its totals directly after the interleaved data.  That is only
+        // final for the last pass, so a follow-on pass has to:
+        //  - step 'out' back over the totals the previous pass stored,
+        //  - reload them as the starting point for its own accumulation, and
+        //  - let the newly interleaved data overwrite that slot.
+        // The rewind comes first so that the reload reads from the rewound position.
+        out = reinterpret_cast<TOut *>(reinterpret_cast<int32_t *>(out) - rows_per_group);
+
+        memcpy(row_totals.data(), out, rows_per_group * sizeof(int32_t));
+    }
+
+    // Walk the width one 'block' of columns at a time; within each step emit 'block' entries for
+    // every one of the 'rows_per_group' rows, so the output is always fully populated.
+    for (unsigned int k=0; k<width; k+=block) {
+        for (unsigned int r=0; r<rows_per_group; r++) {
+            if (r < height) {
+                // Live row: copy what exists and zero-fill anything past the end of the row.
+                for (unsigned int c=0; c<block; c++) {
+                    if (k + c < width) {
+                        // Bind the source element once; it feeds both the sum and the output.
+                        const TIn &value = in[r][row_offset + k + c];
+
+                        if (integrate_sums) {
+                            row_totals[r] += value;
+                        }
+
+                        *out++ = value;
+                    } else {
+                        // Column out of range - pad a single entry.
+                        *out++ = 0;
+                    }
+                }
+            } else {
+                // Row out of range - pad the whole block.
+                for (unsigned int c=0; c<block; c++) {
+                    *out++ = 0;
+                }
+            }
+        }
+    }
+
+    if (integrate_sums) {
+        // Store this pass's totals right behind the data ...
+        memcpy(out, row_totals.data(), rows_per_group * sizeof(int32_t));
+
+        // ... and leave 'out' pointing just beyond them.
+        out = reinterpret_cast<TOut *>(reinterpret_cast<int32_t *>(out) + rows_per_group);
+    }
+}
+
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TOut>
+inline void FixupRowSums(TOut * &out, const int32_t row_sum_multiplier) {
+    // Number of row sums held per group: the template count, scaled by
+    // get_vector_length<TOut>() / block when targeting SVE.
+    const unsigned int n_sums = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
+
+    // The current output position, viewed as 32-bit sums.
+    int32_t * const here = reinterpret_cast<int32_t *>(out);
+
+    if (row_sum_multiplier == 0) {
+        // Zero multiplier: interleave_block<>() was run without summing, so 'out' still points at
+        // the start of the sum block.  Insert the (zero) sums and step 'out' past them.
+        for (int32_t *p = here; p != here + n_sums; ++p) {
+            *p = 0;
+        }
+
+        out = reinterpret_cast<TOut *>(here + n_sums);
+
+        return;
+    }
+
+    // Non-zero multiplier: interleave_block<>() has already written the sums and moved 'out' beyond
+    // them.  Go back and scale those sums in place; 'out' itself stays where it is.
+    int32_t * const sums = here - n_sums;
+
+    for (unsigned int i=0; i<n_sums; i++) {
+        sums[i] *= row_sum_multiplier;
+    }
+}
+
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
+void IndirectInterleave(TOut *out, const TIn * const * const *ptr, unsigned int stringlen,
+                        unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax,
+                        const unsigned int k0, const unsigned int kmax, bool integrate_sums,
+                        const int32_t row_sum_multiplier) {
+    const unsigned int rows_per_group = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
+
+    // Row sums make no sense for non-integral output types and are never requested for them.
+    // 'integrate_sums' is a run-time argument (to limit code duplication), so fold the type trait
+    // in here to avoid generating pointless code for those types.
+    const bool sums_wanted = std::is_integral<TOut>::value && integrate_sums;
+
+    // An 'interleave_block' implementation may read one pointer per row it handles from the array it
+    // is given, including rows beyond the valid range (it must not dereference those).  That lets
+    // it use row predication, or load every pointer and then override the out of range ones.
+    //
+    // In the "pure" indirect case this could read past the end of the pointer array on the last
+    // rows, so those cases go through this local array instead.  alloca is used because a
+    // std::vector can be expensive in highly threaded scenarios.
+    const TIn **scratch_rows = reinterpret_cast<const TIn **>(alloca(rows_per_group * sizeof(const TIn *)));
+
+    // Where k0 falls: which string, and how far into it (measured in rounded string lengths).
+    const unsigned int first_string = k0 / rounded_stringlen;
+    const unsigned int first_offset = k0 % rounded_stringlen;
+
+    // Work through the rows one group at a time.
+    for (unsigned int ybase = y0; ybase < ymax; ybase+=rows_per_group) {
+        // Rows actually present in this group.
+        const unsigned int live_rows = std::min(ymax - ybase, rows_per_group);
+        const bool partial_group = live_rows < rows_per_group;
+
+        unsigned int string = first_string;
+        unsigned int offset = first_offset;
+        bool first = true;
+
+        // One 'interleave_block' call per string touched by the K range.
+        for (unsigned int remaining = kmax - k0; remaining > 0; string++) {
+            // Columns to read, and columns that will be produced once padding is included.
+            const unsigned int in_width  = std::min(remaining, stringlen - offset);
+            const unsigned int out_width = std::min(remaining, rounded_stringlen - offset);
+
+            const TIn * const *rows = ptr[string] + ybase;
+
+            if (partial_group) {
+                // Not every row is valid: stage the valid pointers in the local array.
+                for (unsigned int i=0; i<live_rows; i++) {
+                    scratch_rows[i] = rows[i];
+                }
+
+                rows = scratch_rows;
+            }
+
+            // Sum while interleaving only when the sums will actually be used (non-zero multiplier).
+            if (sums_wanted && row_sum_multiplier) {
+                interleave_block<height_vectors, block, vlt, true>(out, rows, in_width, live_rows, offset, first);
+            } else {
+                interleave_block<height_vectors, block, vlt, false>(out, rows, in_width, live_rows, offset, first);
+            }
+
+            remaining -= out_width;
+            offset = 0;
+            first = false;
+        }
+
+        // Scale the stored sums, or insert zero sums if none were computed.
+        if (sums_wanted) {
+            FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
+        }
+    }
+}
+
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
+void ConvolutionInterleave(TOut *out, const TIn *in, size_t in_stride, const convolver<TIn> &conv, const unsigned int rounded_stringlen,
+        const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
+    // Rows per output block: for SVE, 'height_vectors' times (vector length / block); otherwise just 'height_vectors'.
+    const unsigned int rows_per_block = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
+
+    // Row sums only apply to integral outputs; they are computed during the interleave only for a non-zero multiplier.
+    const bool want_sums     = std::is_integral<TOut>::value && integrate_sums;
+    const bool sums_in_block = want_sums && row_sum_multiplier;
+
+    auto column_view = conv.process_columns(in, in_stride, k0, kmax, rounded_stringlen);
+
+    // Stack scratch (alloca) for the row pointers: a std::vector can be expensive in highly threaded scenarios.
+    const TIn **rows = reinterpret_cast<const TIn **>(alloca(rows_per_block * sizeof(const TIn *)));
+
+    for (unsigned int row0 = y0; row0 < ymax; row0 += rows_per_block) {
+        // Rows that really exist in this block - interleave_block pads the remainder.
+        const unsigned int live_rows = std::min(ymax - row0, rows_per_block);
+
+        auto row_view = column_view.process_rows(row0, live_rows);
+
+        // 'is_first' is true only for the first chunk of each block of rows.
+        for (bool is_first = true; !row_view.finished(); is_first = false) {
+            unsigned int chunk_width = 0, chunk_offset = 0;
+
+            // Fetch the next chunk's width/offset from next_block(), which also receives the 'rows' scratch array.
+            std::tie(chunk_width, chunk_offset) = row_view.next_block(rows);
+
+            if (sums_in_block) {
+                interleave_block<height_vectors, block, vlt, true>(out, rows, chunk_width, live_rows, chunk_offset, is_first);
+            } else {
+                interleave_block<height_vectors, block, vlt, false>(out, rows, chunk_width, live_rows, chunk_offset, is_first);
+            }
+        }
+
+        if (want_sums) { FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier); }
+    }
+}
+
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
+void Interleave(TOut *out, const TIn *in, size_t in_stride, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
+    // Rows per output block: for SVE, 'height_vectors' times (vector length / block); otherwise just 'height_vectors'.
+    const unsigned int rows_per_block = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
+    const unsigned int k_count        = kmax - k0;
+
+    // Row sums only apply to integral outputs; they are computed during the interleave only for a non-zero multiplier.
+    const bool want_sums     = std::is_integral<TOut>::value && integrate_sums;
+    const bool sums_in_block = want_sums && row_sum_multiplier;
+
+    // Stack scratch (alloca) for the row pointers: a std::vector can be expensive in highly threaded scenarios.
+    const TIn **rows = reinterpret_cast<const TIn **>(alloca(rows_per_block * sizeof(const TIn *)));
+
+    for (unsigned int row0 = y0; row0 < ymax; row0 += rows_per_block) {
+        // Every slot gets an address, including slots at or beyond ymax; only 'live_rows' is passed on as the height.
+        for (unsigned int i = 0; i < rows_per_block; i++) { rows[i] = in + ((row0 + i) * in_stride); }
+
+        const unsigned int live_rows = std::min(rows_per_block, ymax - row0);
+
+        if (sums_in_block) { interleave_block<height_vectors, block, vlt, true>(out, rows, k_count, live_rows, k0, true); }
+        else               { interleave_block<height_vectors, block, vlt, false>(out, rows, k_count, live_rows, k0, true); }
+
+        if (want_sums) { FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier); }
+    }
+}
+
+#include "indirect-interleaves/list.hpp"
+
+/**** Instantiate needed implementations ****/
+
+/* AArch32: explicit instantiations of the three interleave entry points, one trio per (output, input) type pair */
+#ifdef __arm__
+/* FP32 */
+/* NEON implementation (height 6) */
+template void IndirectInterleave<6, 1, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<6, 1, VLType::None>(float *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<6, 1, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* FP16 */
+#if __ARM_FP16_ARGS
+/* NEON implementation using FP32 kernel (height 6) */
+template void IndirectInterleave<6, 1, VLType::None>(float *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<6, 1, VLType::None>(float *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<6, 1, VLType::None>(float *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+#endif /* __ARM_FP16_ARGS */
+
+/* BF16 */
+/* NEON implementation using FP32 kernel (height 6) */
+template void IndirectInterleave<6, 1, VLType::None>(float *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<6, 1, VLType::None>(float *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<6, 1, VLType::None>(float *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+#endif /* __arm__ */
+
+/* AArch64: explicit instantiations of the three interleave entry points, one trio per (output, input) type pair */
+#ifdef __aarch64__
+/* FP64 */
+/* NEON/SVE implementation (height 8) */
+template void IndirectInterleave<8, 1, VLType::None>(double *, const double * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<8, 1, VLType::None>(double *, const double *, size_t, const convolver<double> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 1, VLType::None>(double *, const double *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* FP32 */
+/* NEON/SVE implementation (height 8) */
+template void IndirectInterleave<8, 1, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<8, 1, VLType::None>(float *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 1, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* FMMLA (height 8, block 2) */
+template void IndirectInterleave<8, 2, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<8, 2, VLType::None>(float *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 2, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* FP16 */
+template void IndirectInterleave<8, 1, VLType::None>(__fp16 *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<8, 1, VLType::None>(__fp16 *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 1, VLType::None>(__fp16 *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+/* FP16 input written out as float (height 8) - same type pairing the AArch32 section labels "using FP32 kernel" */
+template void IndirectInterleave<8, 1, VLType::None>(float *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<8, 1, VLType::None>(float *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 1, VLType::None>(float *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* BF16 */
+/* NEON/SVE BFDOT */
+template void IndirectInterleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+template void IndirectInterleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* NEON/SVE using FP32 kernel */
+template void IndirectInterleave<8, 1, VLType::None>(float *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<8, 1, VLType::None>(float *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 1, VLType::None>(float *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* INT16 (signed, then unsigned; height 8, block 1) */
+template void IndirectInterleave<8, 1, VLType::None>(int16_t *, const int16_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
+template void ConvolutionInterleave<8, 1, VLType::None>(int16_t *, const int16_t *, size_t, const convolver<int16_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 1, VLType::None>(int16_t *, const int16_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+template void IndirectInterleave<8, 1, VLType::None>(uint16_t *, const uint16_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
+template void ConvolutionInterleave<8, 1, VLType::None>(uint16_t *, const uint16_t *, size_t, const convolver<uint16_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 1, VLType::None>(uint16_t *, const uint16_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* INT8 */
+/* NEON SMLA/SMLAL (height 4, block 16) */
+template void IndirectInterleave<4, 16, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<4, 16, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<4, 16, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* NEON SDOT (height 8, block 4) */
+template void IndirectInterleave<8, 4, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
+template void ConvolutionInterleave<8, 4, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 4, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* MMLA SMMLA (height 8, block 8) */
+template void IndirectInterleave<8, 8, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
+template void ConvolutionInterleave<8, 8, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 8, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* NEON 16-bit (height 8, block 1): int8_t input written out as int16_t */
+template void IndirectInterleave<8, 1, VLType::None>(int16_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
+template void ConvolutionInterleave<8, 1, VLType::None>(int16_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 1, VLType::None>(int16_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* UINT8: unsigned counterpart of the NEON SMLA/SMLAL variant (height 4, block 16) */
+template void IndirectInterleave<4, 16, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<4, 16, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<4, 16, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* NEON UDOT (height 8, block 4) */
+template void IndirectInterleave<8, 4, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
+template void ConvolutionInterleave<8, 4, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 4, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* MMLA, unsigned variant (height 8, block 8) */
+template void IndirectInterleave<8, 8, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
+template void ConvolutionInterleave<8, 8, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 8, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* NEON 16-bit (height 8, block 1) */
+template void IndirectInterleave<8, 1, VLType::None>(uint16_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
+template void ConvolutionInterleave<8, 1, VLType::None>(uint16_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 1, VLType::None>(uint16_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+#endif // __aarch64__
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/interleave_indirect.hpp b/src/core/NEON/kernels/arm_gemm/interleave_indirect.hpp
new file mode 100644
index 0000000000..660577f0e3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/interleave_indirect.hpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#include "convolution_parameters.hpp"
+#include "convolver.hpp"
+#include "utils.hpp"
+
+namespace arm_gemm {
+// Interleave rows [y0, ymax) over K range [k0, kmax), reading each row through the pointer table 'ptr' (indexed by string, then row). The unnamed (bool, int32_t) are 'integrate_sums' and 'row_sum_multiplier' in the definition.
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
+void IndirectInterleave(TOut *out, const TIn * const * const *ptr, unsigned int stringlen, unsigned int rounded_stringlen, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
+// Interleave rows [y0, ymax) of 'in', with the row pointers for each chunk supplied by the convolver 'conv'. The unnamed (bool, int32_t) are 'integrate_sums' and 'row_sum_multiplier' in the definition.
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
+void ConvolutionInterleave(TOut *out, const TIn *in, size_t in_stride, const convolver<TIn> &conv, const unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool, int32_t);
+// Interleave rows [y0, ymax) of 'in', whose rows are 'in_stride' elements apart, over K range [k0, kmax). The unnamed (bool, int32_t) are 'integrate_sums' and 'row_sum_multiplier' in the definition.
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
+void Interleave(TOut *out, const TIn *in, size_t in_stride, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool, int32_t);
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12.hpp
index 0f0e5a7ed4..8bf8d8442e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12.hpp
@@ -30,9 +30,9 @@
 namespace arm_gemm {
 
 // Actual kernel implementations
-void a64_gemm_s16_asimd_12x8(const int16_t *, const int16_t *, int32_t *, int, int, int);
+void a64_gemm_s16_asimd_8x12(const int16_t *, const int16_t *, int32_t *, int, int, int);
 
-// 12x8 SGEMM "strategy" class.
+// 8x12 s16 GEMM "strategy" class.
 //
 // This describes the characteristics of a family of kernels, in terms of
 // the required interleave properties and the output block size.
@@ -40,7 +40,7 @@ void a64_gemm_s16_asimd_12x8(const int16_t *, const int16_t *, int32_t *, int, i
 // All kernels in the family must share these characteristics.  The actual
 // kernel to be used can be chosen at runtime, based on the CPU_type
 // structure.
-class gemm_s16_12x8 {
+class cls_a64_gemm_s16_8x12 {
 public:
     typedef int16_t operand_type;
     typedef int32_t result_type;
@@ -62,10 +62,11 @@ public:
 
     // Use the standard fixed size transforms.
     StdTransformsFixed<operand_type, result_type, 8, 12> transforms = {};
+    StdTransformsFixed<operand_type, result_type, 8, 12, 1, true> transforms_quantized = {};
 
-    kern_type kernel = a64_gemm_s16_asimd_12x8;
+    kern_type kernel = a64_gemm_s16_asimd_8x12;
 
-    gemm_s16_12x8(const CPUInfo *) { }
+    cls_a64_gemm_s16_8x12(const CPUInfo *) { }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12/generic.cpp
index 7052f83a3d..a77938ffa7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12/generic.cpp
@@ -29,7 +29,7 @@
 
 namespace arm_gemm {
 
-void a64_gemm_s16_asimd_12x8(const int16_t *Apanel, const int16_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K)
+void a64_gemm_s16_asimd_8x12(const int16_t *Apanel, const int16_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K)
 {
   const int16_t *a_ptr = Apanel;
   int32_t *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
index 256acc4c65..b68a5f518a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
@@ -34,7 +34,7 @@ void a64_gemm_s8_4x4(const int8_t *, const int8_t *, int32_t *, int, int, int);
 
 #include "arm_gemm.hpp"
 
-class gemm_s8_4x4 {
+class cls_a64_gemm_s8_4x4 {
 public:
     typedef int8_t operand_type;
     typedef int32_t result_type;
@@ -56,10 +56,11 @@ public:
 
     // Use the standard fixed size transforms.
     StdTransformsFixed<operand_type, result_type, 4, 4, 16> transforms = {};
+    StdTransformsFixed<operand_type, result_type, 4, 4, 16, true> transforms_quantized = {};
 
     kern_type kernel=a64_gemm_s8_4x4;
 
-    gemm_s8_4x4(const CPUInfo *) { }
+    cls_a64_gemm_s8_4x4(const CPUInfo *) { }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
index 0e294bfe8d..eee817e8e7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
@@ -32,11 +32,11 @@
 namespace arm_gemm {
 
 // Load the actual kernel
-void a64_gemm_s8_12x8(const int8_t *, const int8_t *, int32_t *, int, int, int);
-void a64_gemm_s8_12x8_a55r1(const int8_t *, const int8_t *, int32_t *, int, int, int);
-void a64_gemm_s8_12x8_x1(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void a64_gemm_s8_8x12(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void a64_gemm_s8_8x12_a55r1(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void a64_gemm_s8_8x12_x1(const int8_t *, const int8_t *, int32_t *, int, int, int);
 
-class gemm_s8_12x8 {
+class cls_a64_gemm_s8_8x12 {
 public:
     typedef int8_t operand_type;
     typedef int32_t result_type;
@@ -58,16 +58,17 @@ public:
 
     // Use the standard fixed size transforms.
     StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms = {};
+    StdTransformsFixed<operand_type, result_type, 8, 12, 4, true> transforms_quantized = {};
 
-    kern_type kernel = a64_gemm_s8_12x8;
+    kern_type kernel = a64_gemm_s8_8x12;
 
-    gemm_s8_12x8(const CPUInfo *ci) {
+    cls_a64_gemm_s8_8x12(const CPUInfo *ci) {
         auto mod = ci->get_cpu_model();
 
         if (mod == CPUModel::A55r1) {
-            kernel = a64_gemm_s8_12x8_a55r1;
+            kernel = a64_gemm_s8_8x12_a55r1;
         } else if (mod == CPUModel::X1) {
-            kernel = a64_gemm_s8_12x8_x1;
+            kernel = a64_gemm_s8_8x12_x1;
         }
     }
 };
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp
index ddd8124ec9..bb5226e093 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp
@@ -29,7 +29,7 @@
 
 namespace arm_gemm {
 
-void a64_gemm_s8_12x8_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, const int ablocks, const int bblocks, const int K) {
+void a64_gemm_s8_8x12_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, const int ablocks, const int bblocks, const int K) {
     const int8_t *a_ptr = Apanel;
     int32_t *c_ptr = Cpanel;
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/generic.cpp
index a7abaed9e0..7bf36a5900 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/generic.cpp
@@ -29,7 +29,7 @@
 
 namespace arm_gemm {
 
-void a64_gemm_s8_12x8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
+void a64_gemm_s8_8x12(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
     const int8_t *a_ptr = Apanel;
     int32_t *c_ptr = Cpanel;
     // We divide K by 4 because the sdot instruction processes 4 elements at a time.
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/x1.cpp
index 446fcf8707..afd2427b85 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/x1.cpp
@@ -29,7 +29,7 @@
 
 namespace arm_gemm {
 
-void a64_gemm_s8_12x8_x1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
+void a64_gemm_s8_8x12_x1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
     const int8_t *a_ptr = Apanel;
     int32_t *c_ptr = Cpanel;
     // We divide K by 4 because the sdot instruction processes 4 elements at a time.
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12.hpp
index b86204043c..e49ebbd84e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12.hpp
@@ -30,17 +30,9 @@
 namespace arm_gemm {
 
 // Actual kernel implementations
-void a64_gemm_u16_asimd_12x8(const uint16_t *, const uint16_t *, uint32_t *, int, int, int);
+void a64_gemm_u16_asimd_8x12(const uint16_t *, const uint16_t *, uint32_t *, int, int, int);
 
-// 12x8 SGEMM "strategy" class.
-//
-// This describes the characteristics of a family of kernels, in terms of
-// the required interleave properties and the output block size.
-//
-// All kernels in the family must share these characteristics.  The actual
-// kernel to be used can be chosen at runtime, based on the CPU_type
-// structure.
-class gemm_u16_12x8 {
+class cls_a64_gemm_u16_8x12 {
 public:
     typedef uint16_t operand_type;
     typedef uint32_t result_type;
@@ -62,10 +54,11 @@ public:
 
     // Use the standard fixed size transforms.
     StdTransformsFixed<operand_type, result_type, 8, 12> transforms = {};
+    StdTransformsFixed<operand_type, result_type, 8, 12, 1, true> transforms_quantized = {};
 
-    kern_type kernel = a64_gemm_u16_asimd_12x8;
+    kern_type kernel = a64_gemm_u16_asimd_8x12;
 
-    gemm_u16_12x8(const CPUInfo *) { }
+    cls_a64_gemm_u16_8x12(const CPUInfo *) { }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12/generic.cpp
index 66f0b7c0ac..98da7830f0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12/generic.cpp
@@ -29,7 +29,7 @@
 
 namespace arm_gemm {
 
-void a64_gemm_u16_asimd_12x8(const uint16_t *Apanel, const uint16_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K)
+void a64_gemm_u16_asimd_8x12(const uint16_t *Apanel, const uint16_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K)
 {
   const uint16_t *a_ptr = Apanel;
   uint32_t *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
index 134007b74c..854b6751c1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
@@ -32,7 +32,7 @@ namespace arm_gemm {
 // Kernel definition
 void a64_gemm_u8_4x4(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K);
 
-class gemm_u8_4x4 {
+class cls_a64_gemm_u8_4x4 {
 public:
     typedef uint8_t operand_type;
     typedef uint32_t result_type;
@@ -64,10 +64,11 @@ public:
 
     // Use the standard fixed size transforms.
     StdTransformsFixed<operand_type, result_type, 4, 4, 16> transforms = {};
+    StdTransformsFixed<operand_type, result_type, 4, 4, 16, true> transforms_quantized = {};
 
     kern_type kernel = a64_gemm_u8_4x4;
 
-    gemm_u8_4x4(const CPUInfo *) { }
+    cls_a64_gemm_u8_4x4(const CPUInfo *) { }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
index c0990ecd57..256ba2e08c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
@@ -30,11 +30,11 @@
 namespace arm_gemm {
 
 // Load the actual kernel
-void a64_gemm_u8_12x8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
-void a64_gemm_u8_12x8_a55r1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
-void a64_gemm_u8_12x8_x1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void a64_gemm_u8_8x12(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void a64_gemm_u8_8x12_a55r1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void a64_gemm_u8_8x12_x1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
 
-class gemm_u8_12x8 {
+class cls_a64_gemm_u8_8x12 {
 public:
     typedef uint8_t operand_type;
     typedef uint32_t result_type;
@@ -66,16 +66,17 @@ public:
 
     // Use the standard fixed sized transforms.
     StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms = {};
+    StdTransformsFixed<operand_type, result_type, 8, 12, 4, true> transforms_quantized = {};
 
-    kern_type kernel = a64_gemm_u8_12x8;
+    kern_type kernel = a64_gemm_u8_8x12;
 
-    gemm_u8_12x8(const CPUInfo *ci) {
+    cls_a64_gemm_u8_8x12(const CPUInfo *ci) {
         auto mod = ci->get_cpu_model();
 
         if (mod == CPUModel::A55r1) {
-            kernel = a64_gemm_u8_12x8_a55r1;
+            kernel = a64_gemm_u8_8x12_a55r1;
         } else if (mod == CPUModel::X1) {
-            kernel = a64_gemm_u8_12x8_x1;
+            kernel = a64_gemm_u8_8x12_x1;
         }
     }
 };
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp
index c9a8a8229c..63869c9fd4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp
@@ -29,7 +29,7 @@
 
 namespace arm_gemm {
 
-void a64_gemm_u8_12x8_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, const int ablocks, const int bblocks, const int K) {
+void a64_gemm_u8_8x12_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, const int ablocks, const int bblocks, const int K) {
     const uint8_t *a_ptr = Apanel;
     uint32_t *c_ptr = Cpanel;
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp
index 821e742f90..ff60cbc905 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp
@@ -29,7 +29,7 @@
 
 namespace arm_gemm {
 
-void a64_gemm_u8_12x8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+void a64_gemm_u8_8x12(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
     const uint8_t *a_ptr = Apanel;
     uint32_t *c_ptr = Cpanel;
     // We divide K by 4 because the udot instruction processes 4 elements at a time.
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp
index 7fac67354f..1c1196b7a6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp
@@ -29,7 +29,7 @@
 
 namespace arm_gemm {
 
-void a64_gemm_u8_12x8_x1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+void a64_gemm_u8_8x12_x1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
     const uint8_t *a_ptr = Apanel;
     uint32_t *c_ptr = Cpanel;
     // We divide K by 4 because the udot instruction processes 4 elements at a time.
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32.hpp
index b60401b70d..b53172509e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32.hpp
@@ -25,32 +25,26 @@
 
 #ifdef __aarch64__
 
-
+#include "../performance_parameters.hpp"
 #include "../std_transforms_fixed.hpp"
 
 namespace arm_gemm
 {
 
 // Actual kernel implementations
-void a64_hybrid_fp32_mla_4x8(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+void a64_gemv_fp32_mla_32(const float *, const float *, float *, size_t, size_t, const float *, Activation, bool);
 
-class hybrid_fp32_mla_4x8
+class cls_a64_gemv_fp32_mla_32
 {
 public:
     typedef float operand_type;
     typedef float result_type;
 
-    typedef void (*kern_type)(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
-
-    /* Kernel blocking parameters */
-    static constexpr unsigned int out_height()
-    {
-        return 8;
-    }
+    typedef void (*kern_type)(const float *, const float *, float *, size_t, size_t, const float *, Activation, bool);
 
     static unsigned int out_width()
     {
-        return 4;
+        return 32;
     }
 
     static constexpr unsigned int k_unroll()
@@ -73,14 +67,13 @@ public:
         return true;
     }
 
-    StdTransformsFixed<operand_type, result_type, 8, 4, 1> transforms = {};
+    StdTransformsFixed<operand_type, result_type, 1, 32, 1> transforms = {};
 
     // Default to the generic kernel
-    kern_type kernel=a64_hybrid_fp32_mla_4x8;
+    kern_type kernel=a64_gemv_fp32_mla_32;
 
-    hybrid_fp32_mla_4x8(const CPUInfo *)
+    cls_a64_gemv_fp32_mla_32(const CPUInfo *)
     {
-
     }
 };
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp
new file mode 100644
index 0000000000..a2af8d6d14
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp
@@ -0,0 +1,1546 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_gemv_fp32_mla_32 (
+    const float *A_ptr, const float *B_ptr, float *output_ptr,
+    size_t N, size_t K,
+    const float *bias, Activation act, bool
+)
+{
+    struct KernelArgs {
+        float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+        float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+        const float *B_ptr = {};
+        size_t output_offset = {};
+        unsigned int input_initial_col = {};
+    } ka;
+
+    unsigned long flags=0;
+    ka.B_ptr = B_ptr;
+    switch(act.type) {
+        default:
+        case Activation::Type::None:
+            break;
+        case Activation::Type::BoundedReLU:
+            ka.maxval = static_cast<float>(act.param1);
+            /* fall through */
+        case Activation::Type::ReLU:
+            ka.minval = 0;
+            flags |= 0x2;
+            break;
+    }
+    __asm__ __volatile__(
+      "add x22, %x[N], #0x3\n"
+      "mov x21, %x[bias]\n"
+      "lsr x22, x22, #0x2\n"
+      "1:"  // Column loop
+      "cmp x22, #0x8\n"
+      "bge 85f\n"
+      "cmp x22, #0x6\n"
+      "bgt 73f\n"
+      "beq 61f\n"
+      "cmp x22, #0x4\n"
+      "bgt 49f\n"
+      "beq 37f\n"
+      "cmp x22, #0x2\n"
+      "bgt 25f\n"
+      "beq 13f\n"
+      "mov x20, %x[K]\n"
+      "mov x19, %x[A_ptr]\n"
+      "cbz x21, 2f\n"
+      "ldr q24, [x21, #0x0]\n"
+      "add x21, x21, #0x10\n"
+      "b 3f\n"
+      "2:"  // Width 1: no bias
+      "movi v24.16b, #0x0\n"
+      "3:"  // Width 1: setup done
+      "cmp x20, #0x4\n"
+      "blt 6f\n"
+      "cmp x20, #0x8\n"
+      "blt 5f\n"
+      "4:"  // Width 1: Multiply loop: Main loop head
+      "ldr q0, [x19, #0x0]\n"
+      "ldr q1, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v1.4s, v0.s[0]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "ldr q2, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v2.4s, v0.s[1]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "ldr q3, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v3.4s, v0.s[2]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "ldr q4, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v4.4s, v0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "add x19, x19, #0x10\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "sub x20, x20, #0x4\n"
+      "prfm pldl1keep, [x19, #0x80]\n"
+      "cmp x20, #0x8\n"
+      "bge 4b\n"
+      "5:"  // Width 1: Multiply loop: Single iteration only
+      "sub x20, x20, #0x4\n"
+      "ldr q0, [x19, #0x0]\n"
+      "ldr q5, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v5.4s, v0.s[0]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "ldr q6, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v6.4s, v0.s[1]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "ldr q7, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v7.4s, v0.s[2]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "ldr q8, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v8.4s, v0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "add x19, x19, #0x10\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "prfm pldl1keep, [x19, #0x80]\n"
+      "6:"  // Width 1: Multiply loop: Main loop skip
+      "cbz x20, 8f\n"
+      "7:"  // Width 1: Multiply loop: Odd block loop
+      "ldr s0, [x19], #0x4\n"
+      "ldr q9, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v9.4s, v0.s[0]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "sub x20, x20, #0x1\n"
+      "cbnz x20, 7b\n"
+      "8:"  // Width 1: Multiply loop: No odd multiplies
+      "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+      "tbz %x[flags], #1, 9f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v17.4s }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v16.4s }, [x19]\n"
+      "fmin v24.4s, v24.4s, v16.4s\n"
+      "fmax v24.4s, v24.4s, v17.4s\n"
+      "9:"  // Width 1: No activation
+      "cmp %x[N], #0x4\n"
+      "blt 10f\n"
+      "str q24, [%x[output_ptr], #0x0]\n"
+      "add %x[output_ptr], %x[output_ptr], #0x10\n"
+      "b 12f\n"
+      "10:"  // Width 1: Partial writeback
+      "tbz %x[N], #1, 11f\n"
+      "str d24, [%x[output_ptr]], #0x8\n"
+      "tbz %x[N], #0, 12f\n"
+      "st1 { v24.s }[2], [%x[output_ptr]]\n"
+      "b 12f\n"
+      "11:"  // Width 1: Partial direct writeback: partial_1_0
+      "str s24, [%x[output_ptr], #0x0]\n"
+      "12:"  // Width 1: Writeback done
+      "b 97f\n"
+      "13:"  // Width 2
+      "mov x20, %x[K]\n"
+      "mov x19, %x[A_ptr]\n"
+      "cbz x21, 14f\n"
+      "ldr q24, [x21, #0x0]\n"
+      "ldr q25, [x21, #0x10]\n"
+      "add x21, x21, #0x20\n"
+      "b 15f\n"
+      "14:"  // Width 2: no bias
+      "movi v24.16b, #0x0\n"
+      "movi v25.16b, #0x0\n"
+      "15:"  // Width 2: setup done
+      "cmp x20, #0x4\n"
+      "blt 18f\n"
+      "cmp x20, #0x8\n"
+      "blt 17f\n"
+      "16:"  // Width 2: Multiply loop: Main loop head
+      "ldr q0, [x19, #0x0]\n"
+      "ldr q1, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v1.4s, v0.s[0]\n"
+      "ldr q2, [%x[B_ptr], #0x10]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "fmla v25.4s, v2.4s, v0.s[0]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "ldr q3, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v3.4s, v0.s[1]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q4, [%x[B_ptr], #0x10]\n"
+      "fmla v25.4s, v4.4s, v0.s[1]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "ldr q5, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v5.4s, v0.s[2]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q6, [%x[B_ptr], #0x10]\n"
+      "fmla v25.4s, v6.4s, v0.s[2]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "ldr q7, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v7.4s, v0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q8, [%x[B_ptr], #0x10]\n"
+      "fmla v25.4s, v8.4s, v0.s[3]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "add x19, x19, #0x10\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "sub x20, x20, #0x4\n"
+      "prfm pldl1keep, [x19, #0x80]\n"
+      "cmp x20, #0x8\n"
+      "bge 16b\n"
+      "17:"  // Width 2: Multiply loop: Single iteration only
+      "sub x20, x20, #0x4\n"
+      "ldr q0, [x19, #0x0]\n"
+      "ldr q9, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v9.4s, v0.s[0]\n"
+      "ldr q10, [%x[B_ptr], #0x10]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "fmla v25.4s, v10.4s, v0.s[0]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "ldr q11, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v11.4s, v0.s[1]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q12, [%x[B_ptr], #0x10]\n"
+      "fmla v25.4s, v12.4s, v0.s[1]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "ldr q13, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v13.4s, v0.s[2]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q14, [%x[B_ptr], #0x10]\n"
+      "fmla v25.4s, v14.4s, v0.s[2]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "ldr q15, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v15.4s, v0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q16, [%x[B_ptr], #0x10]\n"
+      "fmla v25.4s, v16.4s, v0.s[3]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "add x19, x19, #0x10\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "prfm pldl1keep, [x19, #0x80]\n"
+      "18:"  // Width 2: Multiply loop: Main loop skip
+      "cbz x20, 20f\n"
+      "19:"  // Width 2: Multiply loop: Odd block loop
+      "ldr s0, [x19], #0x4\n"
+      "ldr q17, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v17.4s, v0.s[0]\n"
+      "ldr q18, [%x[B_ptr], #0x10]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "fmla v25.4s, v18.4s, v0.s[0]\n"
+      "sub x20, x20, #0x1\n"
+      "cbnz x20, 19b\n"
+      "20:"  // Width 2: Multiply loop: No odd multiplies
+      "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+      "tbz %x[flags], #1, 21f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v17.4s }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v16.4s }, [x19]\n"
+      "fmin v24.4s, v24.4s, v16.4s\n"
+      "fmin v25.4s, v25.4s, v16.4s\n"
+      "fmax v24.4s, v24.4s, v17.4s\n"
+      "fmax v25.4s, v25.4s, v17.4s\n"
+      "21:"  // Width 2: No activation
+      "str q24, [%x[output_ptr], #0x0]\n"
+      "cmp %x[N], #0x8\n"
+      "add %x[output_ptr], %x[output_ptr], #0x10\n"
+      "blt 22f\n"
+      "str q25, [%x[output_ptr], #0x0]\n"
+      "add %x[output_ptr], %x[output_ptr], #0x10\n"
+      "b 24f\n"
+      "22:"  // Width 2: Partial writeback
+      "tbz %x[N], #1, 23f\n"
+      "str d25, [%x[output_ptr]], #0x8\n"
+      "tbz %x[N], #0, 24f\n"
+      "st1 { v25.s }[2], [%x[output_ptr]]\n"
+      "b 24f\n"
+      "23:"  // Width 2: Partial direct writeback: partial_1_4
+      "tbz %x[N], #0, 24f\n"
+      "str s25, [%x[output_ptr], #0x0]\n"
+      "24:"  // Width 2: Writeback done
+      "b 97f\n"
+      "25:"  // Width 3
+      "mov x20, %x[K]\n"
+      "mov x19, %x[A_ptr]\n"
+      "cbz x21, 26f\n"
+      "ldr q24, [x21, #0x0]\n"
+      "ldr q25, [x21, #0x10]\n"
+      "ldr q26, [x21, #0x20]\n"
+      "add x21, x21, #0x30\n"
+      "b 27f\n"
+      "26:"  // Width 3: no bias
+      "movi v24.16b, #0x0\n"
+      "movi v25.16b, #0x0\n"
+      "movi v26.16b, #0x0\n"
+      "27:"  // Width 3: setup done
+      "cmp x20, #0x4\n"
+      "blt 30f\n"
+      "cmp x20, #0x8\n"
+      "blt 29f\n"
+      "28:"  // Width 3: Multiply loop: Main loop head
+      "ldr q0, [x19, #0x0]\n"
+      "ldr q1, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v1.4s, v0.s[0]\n"
+      "ldr q2, [%x[B_ptr], #0x10]\n"
+      "ldr q3, [%x[B_ptr], #0x20]\n"
+      "fmla v25.4s, v2.4s, v0.s[0]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v26.4s, v3.4s, v0.s[0]\n"
+      "ldr q4, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v4.4s, v0.s[1]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q5, [%x[B_ptr], #0x10]\n"
+      "fmla v25.4s, v5.4s, v0.s[1]\n"
+      "ldr q6, [%x[B_ptr], #0x20]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "fmla v26.4s, v6.4s, v0.s[1]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "ldr q7, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v7.4s, v0.s[2]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q8, [%x[B_ptr], #0x10]\n"
+      "fmla v25.4s, v8.4s, v0.s[2]\n"
+      "ldr q9, [%x[B_ptr], #0x20]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "fmla v26.4s, v9.4s, v0.s[2]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "ldr q10, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v10.4s, v0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q11, [%x[B_ptr], #0x10]\n"
+      "fmla v25.4s, v11.4s, v0.s[3]\n"
+      "ldr q12, [%x[B_ptr], #0x20]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "fmla v26.4s, v12.4s, v0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "add x19, x19, #0x10\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "sub x20, x20, #0x4\n"
+      "prfm pldl1keep, [x19, #0x80]\n"
+      "cmp x20, #0x8\n"
+      "bge 28b\n"
+      "29:"  // Width 3: Multiply loop: Single iteration only
+      "sub x20, x20, #0x4\n"
+      "ldr q0, [x19, #0x0]\n"
+      "ldr q13, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v13.4s, v0.s[0]\n"
+      "ldr q14, [%x[B_ptr], #0x10]\n"
+      "ldr q15, [%x[B_ptr], #0x20]\n"
+      "fmla v25.4s, v14.4s, v0.s[0]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v26.4s, v15.4s, v0.s[0]\n"
+      "ldr q16, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v16.4s, v0.s[1]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q17, [%x[B_ptr], #0x10]\n"
+      "fmla v25.4s, v17.4s, v0.s[1]\n"
+      "ldr q18, [%x[B_ptr], #0x20]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "fmla v26.4s, v18.4s, v0.s[1]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "ldr q19, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v19.4s, v0.s[2]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q20, [%x[B_ptr], #0x10]\n"
+      "fmla v25.4s, v20.4s, v0.s[2]\n"
+      "ldr q21, [%x[B_ptr], #0x20]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "fmla v26.4s, v21.4s, v0.s[2]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "ldr q22, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v22.4s, v0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q23, [%x[B_ptr], #0x10]\n"
+      "fmla v25.4s, v23.4s, v0.s[3]\n"
+      "ldr q1, [%x[B_ptr], #0x20]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "fmla v26.4s, v1.4s, v0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "add x19, x19, #0x10\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "prfm pldl1keep, [x19, #0x80]\n"
+      "30:"  // Width 3: Multiply loop: Main loop skip
+      "cbz x20, 32f\n"
+      "31:"  // Width 3: Multiply loop: Odd block loop
+      "ldr s0, [x19], #0x4\n"
+      "ldr q2, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v2.4s, v0.s[0]\n"
+      "ldr q3, [%x[B_ptr], #0x10]\n"
+      "ldr q4, [%x[B_ptr], #0x20]\n"
+      "fmla v25.4s, v3.4s, v0.s[0]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "fmla v26.4s, v4.4s, v0.s[0]\n"
+      "sub x20, x20, #0x1\n"
+      "cbnz x20, 31b\n"
+      "32:"  // Width 3: Multiply loop: No odd multiplies
+      "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+      "tbz %x[flags], #1, 33f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v17.4s }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v16.4s }, [x19]\n"
+      "fmin v24.4s, v24.4s, v16.4s\n"
+      "fmin v25.4s, v25.4s, v16.4s\n"
+      "fmin v26.4s, v26.4s, v16.4s\n"
+      "fmax v24.4s, v24.4s, v17.4s\n"
+      "fmax v25.4s, v25.4s, v17.4s\n"
+      "fmax v26.4s, v26.4s, v17.4s\n"
+      "33:"  // Width 3: No activation
+      "str q24, [%x[output_ptr], #0x0]\n"
+      "str q25, [%x[output_ptr], #0x10]\n"
+      "cmp %x[N], #0xc\n"
+      "add %x[output_ptr], %x[output_ptr], #0x20\n"
+      "blt 34f\n"
+      "str q26, [%x[output_ptr], #0x0]\n"
+      "add %x[output_ptr], %x[output_ptr], #0x10\n"
+      "b 36f\n"
+      "34:"  // Width 3: Partial writeback
+      "tbz %x[N], #1, 35f\n"
+      "str d26, [%x[output_ptr]], #0x8\n"
+      "tbz %x[N], #0, 36f\n"
+      "st1 { v26.s }[2], [%x[output_ptr]]\n"
+      "b 36f\n"
+      "35:"  // Width 3: Partial direct writeback: partial_1_8
+      "tbz %x[N], #0, 36f\n"
+      "str s26, [%x[output_ptr], #0x0]\n"
+      "36:"  // Width 3: Writeback done
+      "b 97f\n"
+      "37:"  // Width 4
+      "mov x20, %x[K]\n"
+      "mov x19, %x[A_ptr]\n"
+      "cbz x21, 38f\n"
+      "ldr q24, [x21, #0x0]\n"
+      "ldr q25, [x21, #0x10]\n"
+      "ldr q26, [x21, #0x20]\n"
+      "ldr q27, [x21, #0x30]\n"
+      "add x21, x21, #0x40\n"
+      "b 39f\n"
+      "38:"  // Width 4: no bias
+      "movi v24.16b, #0x0\n"
+      "movi v25.16b, #0x0\n"
+      "movi v26.16b, #0x0\n"
+      "movi v27.16b, #0x0\n"
+      "39:"  // Width 4: setup done
+      "cmp x20, #0x4\n"
+      "blt 42f\n"
+      "cmp x20, #0x8\n"
+      "blt 41f\n"
+      "40:"  // Width 4: Multiply loop: Main loop head
+      "ldr q0, [x19, #0x0]\n"
+      "ldr q1, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v1.4s, v0.s[0]\n"
+      "ldr q2, [%x[B_ptr], #0x10]\n"
+      "ldr q3, [%x[B_ptr], #0x20]\n"
+      "fmla v25.4s, v2.4s, v0.s[0]\n"
+      "ldr q4, [%x[B_ptr], #0x30]\n"
+      "fmla v26.4s, v3.4s, v0.s[0]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v27.4s, v4.4s, v0.s[0]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q5, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v5.4s, v0.s[1]\n"
+      "ldr q6, [%x[B_ptr], #0x10]\n"
+      "ldr q7, [%x[B_ptr], #0x20]\n"
+      "fmla v25.4s, v6.4s, v0.s[1]\n"
+      "ldr q8, [%x[B_ptr], #0x30]\n"
+      "fmla v26.4s, v7.4s, v0.s[1]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v27.4s, v8.4s, v0.s[1]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q9, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v9.4s, v0.s[2]\n"
+      "ldr q10, [%x[B_ptr], #0x10]\n"
+      "ldr q11, [%x[B_ptr], #0x20]\n"
+      "fmla v25.4s, v10.4s, v0.s[2]\n"
+      "ldr q12, [%x[B_ptr], #0x30]\n"
+      "fmla v26.4s, v11.4s, v0.s[2]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v27.4s, v12.4s, v0.s[2]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q13, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v13.4s, v0.s[3]\n"
+      "ldr q14, [%x[B_ptr], #0x10]\n"
+      "ldr q15, [%x[B_ptr], #0x20]\n"
+      "fmla v25.4s, v14.4s, v0.s[3]\n"
+      "ldr q16, [%x[B_ptr], #0x30]\n"
+      "fmla v26.4s, v15.4s, v0.s[3]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v27.4s, v16.4s, v0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "add x19, x19, #0x10\n"
+      "prfm pldl1keep, [x19, #0x80]\n"
+      "sub x20, x20, #0x4\n"
+      "cmp x20, #0x8\n"
+      "bge 40b\n"
+      "41:"  // Width 4: Multiply loop: Single iteration only
+      "sub x20, x20, #0x4\n"
+      "ldr q0, [x19, #0x0]\n"
+      "ldr q17, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v17.4s, v0.s[0]\n"
+      "ldr q18, [%x[B_ptr], #0x10]\n"
+      "ldr q19, [%x[B_ptr], #0x20]\n"
+      "fmla v25.4s, v18.4s, v0.s[0]\n"
+      "ldr q20, [%x[B_ptr], #0x30]\n"
+      "fmla v26.4s, v19.4s, v0.s[0]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v27.4s, v20.4s, v0.s[0]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q21, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v21.4s, v0.s[1]\n"
+      "ldr q22, [%x[B_ptr], #0x10]\n"
+      "ldr q23, [%x[B_ptr], #0x20]\n"
+      "fmla v25.4s, v22.4s, v0.s[1]\n"
+      "ldr q1, [%x[B_ptr], #0x30]\n"
+      "fmla v26.4s, v23.4s, v0.s[1]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v27.4s, v1.4s, v0.s[1]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q2, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v2.4s, v0.s[2]\n"
+      "ldr q3, [%x[B_ptr], #0x10]\n"
+      "ldr q4, [%x[B_ptr], #0x20]\n"
+      "fmla v25.4s, v3.4s, v0.s[2]\n"
+      "ldr q5, [%x[B_ptr], #0x30]\n"
+      "fmla v26.4s, v4.4s, v0.s[2]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v27.4s, v5.4s, v0.s[2]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q6, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v6.4s, v0.s[3]\n"
+      "ldr q7, [%x[B_ptr], #0x10]\n"
+      "ldr q8, [%x[B_ptr], #0x20]\n"
+      "fmla v25.4s, v7.4s, v0.s[3]\n"
+      "ldr q9, [%x[B_ptr], #0x30]\n"
+      "fmla v26.4s, v8.4s, v0.s[3]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v27.4s, v9.4s, v0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "add x19, x19, #0x10\n"
+      "prfm pldl1keep, [x19, #0x80]\n"
+      "42:"  // Width 4: Multiply loop: Main loop skip
+      "cbz x20, 44f\n"
+      "43:"  // Width 4: Multiply loop: Odd block loop
+      "ldr s0, [x19], #0x4\n"
+      "ldr q10, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v10.4s, v0.s[0]\n"
+      "ldr q11, [%x[B_ptr], #0x10]\n"
+      "ldr q12, [%x[B_ptr], #0x20]\n"
+      "fmla v25.4s, v11.4s, v0.s[0]\n"
+      "ldr q13, [%x[B_ptr], #0x30]\n"
+      "fmla v26.4s, v12.4s, v0.s[0]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "sub x20, x20, #0x1\n"
+      "fmla v27.4s, v13.4s, v0.s[0]\n"
+      "cbnz x20, 43b\n"
+      "44:"  // Width 4: Multiply loop: No odd multiplies
+      "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+      "tbz %x[flags], #1, 45f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v17.4s }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v16.4s }, [x19]\n"
+      "fmin v24.4s, v24.4s, v16.4s\n"
+      "fmin v25.4s, v25.4s, v16.4s\n"
+      "fmin v26.4s, v26.4s, v16.4s\n"
+      "fmin v27.4s, v27.4s, v16.4s\n"
+      "fmax v24.4s, v24.4s, v17.4s\n"
+      "fmax v25.4s, v25.4s, v17.4s\n"
+      "fmax v26.4s, v26.4s, v17.4s\n"
+      "fmax v27.4s, v27.4s, v17.4s\n"
+      "45:"  // Width 4: No activation
+      "str q24, [%x[output_ptr], #0x0]\n"
+      "str q25, [%x[output_ptr], #0x10]\n"
+      "str q26, [%x[output_ptr], #0x20]\n"
+      "cmp %x[N], #0x10\n"
+      "add %x[output_ptr], %x[output_ptr], #0x30\n"
+      "blt 46f\n"
+      "str q27, [%x[output_ptr], #0x0]\n"
+      "add %x[output_ptr], %x[output_ptr], #0x10\n"
+      "b 48f\n"
+      "46:"  // Width 4: Partial writeback
+      "tbz %x[N], #1, 47f\n"
+      "str d27, [%x[output_ptr]], #0x8\n"
+      "tbz %x[N], #0, 48f\n"
+      "st1 { v27.s }[2], [%x[output_ptr]]\n"
+      "b 48f\n"
+      "47:"  // Width 4: Partial direct writeback: partial_1_12
+      "tbz %x[N], #0, 48f\n"
+      "str s27, [%x[output_ptr], #0x0]\n"
+      "48:"  // Width 4: Writeback done
+      "b 97f\n"
+      "49:"  // Width 5
+      "mov x20, %x[K]\n"
+      "mov x19, %x[A_ptr]\n"
+      "cbz x21, 50f\n"
+      "ldr q24, [x21, #0x0]\n"
+      "ldr q25, [x21, #0x10]\n"
+      "ldr q26, [x21, #0x20]\n"
+      "ldr q27, [x21, #0x30]\n"
+      "ldr q28, [x21, #0x40]\n"
+      "add x21, x21, #0x50\n"
+      "b 51f\n"
+      "50:"  // Width 5: no bias
+      "movi v24.16b, #0x0\n"
+      "movi v25.16b, #0x0\n"
+      "movi v26.16b, #0x0\n"
+      "movi v27.16b, #0x0\n"
+      "movi v28.16b, #0x0\n"
+      "51:"  // Width 5: setup done
+      "cmp x20, #0x4\n"
+      "blt 54f\n"
+      "cmp x20, #0x8\n"
+      "blt 53f\n"
+      "52:"  // Width 5: Multiply loop: Main loop head
+      "ldr q0, [x19, #0x0]\n"
+      "ldr q1, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v1.4s, v0.s[0]\n"
+      "ldr q2, [%x[B_ptr], #0x10]\n"
+      "ldr q3, [%x[B_ptr], #0x20]\n"
+      "fmla v25.4s, v2.4s, v0.s[0]\n"
+      "ldr q4, [%x[B_ptr], #0x30]\n"
+      "fmla v26.4s, v3.4s, v0.s[0]\n"
+      "ldr q5, [%x[B_ptr], #0x40]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "fmla v27.4s, v4.4s, v0.s[0]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "ldr q6, [%x[B_ptr], #0x0]\n"
+      "fmla v28.4s, v5.4s, v0.s[0]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q7, [%x[B_ptr], #0x10]\n"
+      "fmla v24.4s, v6.4s, v0.s[1]\n"
+      "ldr q8, [%x[B_ptr], #0x20]\n"
+      "ldr q9, [%x[B_ptr], #0x30]\n"
+      "fmla v25.4s, v7.4s, v0.s[1]\n"
+      "ldr q10, [%x[B_ptr], #0x40]\n"
+      "fmla v26.4s, v8.4s, v0.s[1]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v27.4s, v9.4s, v0.s[1]\n"
+      "ldr q11, [%x[B_ptr], #0x0]\n"
+      "fmla v28.4s, v10.4s, v0.s[1]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q12, [%x[B_ptr], #0x10]\n"
+      "fmla v24.4s, v11.4s, v0.s[2]\n"
+      "ldr q13, [%x[B_ptr], #0x20]\n"
+      "ldr q14, [%x[B_ptr], #0x30]\n"
+      "fmla v25.4s, v12.4s, v0.s[2]\n"
+      "ldr q15, [%x[B_ptr], #0x40]\n"
+      "fmla v26.4s, v13.4s, v0.s[2]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v27.4s, v14.4s, v0.s[2]\n"
+      "ldr q16, [%x[B_ptr], #0x0]\n"
+      "fmla v28.4s, v15.4s, v0.s[2]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q17, [%x[B_ptr], #0x10]\n"
+      "fmla v24.4s, v16.4s, v0.s[3]\n"
+      "ldr q18, [%x[B_ptr], #0x20]\n"
+      "ldr q19, [%x[B_ptr], #0x30]\n"
+      "fmla v25.4s, v17.4s, v0.s[3]\n"
+      "ldr q20, [%x[B_ptr], #0x40]\n"
+      "fmla v26.4s, v18.4s, v0.s[3]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v27.4s, v19.4s, v0.s[3]\n"
+      "add x19, x19, #0x10\n"
+      "fmla v28.4s, v20.4s, v0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "sub x20, x20, #0x4\n"
+      "prfm pldl1keep, [x19, #0x80]\n"
+      "cmp x20, #0x8\n"
+      "bge 52b\n"
+      "53:"  // Width 5: Multiply loop: Single iteration only
+      "sub x20, x20, #0x4\n"
+      "ldr q0, [x19, #0x0]\n"
+      "ldr q21, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v21.4s, v0.s[0]\n"
+      "ldr q22, [%x[B_ptr], #0x10]\n"
+      "ldr q23, [%x[B_ptr], #0x20]\n"
+      "fmla v25.4s, v22.4s, v0.s[0]\n"
+      "ldr q1, [%x[B_ptr], #0x30]\n"
+      "fmla v26.4s, v23.4s, v0.s[0]\n"
+      "ldr q2, [%x[B_ptr], #0x40]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "fmla v27.4s, v1.4s, v0.s[0]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "ldr q3, [%x[B_ptr], #0x0]\n"
+      "fmla v28.4s, v2.4s, v0.s[0]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q4, [%x[B_ptr], #0x10]\n"
+      "fmla v24.4s, v3.4s, v0.s[1]\n"
+      "ldr q5, [%x[B_ptr], #0x20]\n"
+      "ldr q6, [%x[B_ptr], #0x30]\n"
+      "fmla v25.4s, v4.4s, v0.s[1]\n"
+      "ldr q7, [%x[B_ptr], #0x40]\n"
+      "fmla v26.4s, v5.4s, v0.s[1]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v27.4s, v6.4s, v0.s[1]\n"
+      "ldr q8, [%x[B_ptr], #0x0]\n"
+      "fmla v28.4s, v7.4s, v0.s[1]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q9, [%x[B_ptr], #0x10]\n"
+      "fmla v24.4s, v8.4s, v0.s[2]\n"
+      "ldr q10, [%x[B_ptr], #0x20]\n"
+      "ldr q11, [%x[B_ptr], #0x30]\n"
+      "fmla v25.4s, v9.4s, v0.s[2]\n"
+      "ldr q12, [%x[B_ptr], #0x40]\n"
+      "fmla v26.4s, v10.4s, v0.s[2]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v27.4s, v11.4s, v0.s[2]\n"
+      "ldr q13, [%x[B_ptr], #0x0]\n"
+      "fmla v28.4s, v12.4s, v0.s[2]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q14, [%x[B_ptr], #0x10]\n"
+      "fmla v24.4s, v13.4s, v0.s[3]\n"
+      "ldr q15, [%x[B_ptr], #0x20]\n"
+      "ldr q16, [%x[B_ptr], #0x30]\n"
+      "fmla v25.4s, v14.4s, v0.s[3]\n"
+      "ldr q17, [%x[B_ptr], #0x40]\n"
+      "fmla v26.4s, v15.4s, v0.s[3]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v27.4s, v16.4s, v0.s[3]\n"
+      "add x19, x19, #0x10\n"
+      "fmla v28.4s, v17.4s, v0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "prfm pldl1keep, [x19, #0x80]\n"
+      "54:"  // Width 5: Multiply loop: Main loop skip
+      "cbz x20, 56f\n"
+      "55:"  // Width 5: Multiply loop: Odd block loop
+      "ldr s0, [x19], #0x4\n"
+      "ldr q18, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v18.4s, v0.s[0]\n"
+      "ldr q19, [%x[B_ptr], #0x10]\n"
+      "ldr q20, [%x[B_ptr], #0x20]\n"
+      "fmla v25.4s, v19.4s, v0.s[0]\n"
+      "ldr q21, [%x[B_ptr], #0x30]\n"
+      "fmla v26.4s, v20.4s, v0.s[0]\n"
+      "ldr q22, [%x[B_ptr], #0x40]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "fmla v27.4s, v21.4s, v0.s[0]\n"
+      "sub x20, x20, #0x1\n"
+      "fmla v28.4s, v22.4s, v0.s[0]\n"
+      "cbnz x20, 55b\n"
+      "56:"  // Width 5: Multiply loop: No odd multiplies
+      "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+      "tbz %x[flags], #1, 57f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v17.4s }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v16.4s }, [x19]\n"
+      "fmin v24.4s, v24.4s, v16.4s\n"
+      "fmin v25.4s, v25.4s, v16.4s\n"
+      "fmin v26.4s, v26.4s, v16.4s\n"
+      "fmin v27.4s, v27.4s, v16.4s\n"
+      "fmax v24.4s, v24.4s, v17.4s\n"
+      "fmax v25.4s, v25.4s, v17.4s\n"
+      "fmax v26.4s, v26.4s, v17.4s\n"
+      "fmax v27.4s, v27.4s, v17.4s\n"
+      "fmin v28.4s, v28.4s, v16.4s\n"
+      "fmax v28.4s, v28.4s, v17.4s\n"
+      "57:"  // Width 5: No activation
+      "str q24, [%x[output_ptr], #0x0]\n"
+      "str q25, [%x[output_ptr], #0x10]\n"
+      "str q26, [%x[output_ptr], #0x20]\n"
+      "str q27, [%x[output_ptr], #0x30]\n"
+      "cmp %x[N], #0x14\n"
+      "add %x[output_ptr], %x[output_ptr], #0x40\n"
+      "blt 58f\n"
+      "str q28, [%x[output_ptr], #0x0]\n"
+      "add %x[output_ptr], %x[output_ptr], #0x10\n"
+      "b 60f\n"
+      "58:"  // Width 5: Partial writeback
+      "tbz %x[N], #1, 59f\n"
+      "str d28, [%x[output_ptr]], #0x8\n"
+      "tbz %x[N], #0, 60f\n"
+      "st1 { v28.s }[2], [%x[output_ptr]]\n"
+      "b 60f\n"
+      "59:"  // Width 5: Partial direct writeback: partial_1_16
+      "tbz %x[N], #0, 60f\n"
+      "str s28, [%x[output_ptr], #0x0]\n"
+      "60:"  // Width 5: Writeback done
+      "b 97f\n"
+      "61:"  // Width 6
+      "mov x20, %x[K]\n"
+      "mov x19, %x[A_ptr]\n"
+      "cbz x21, 62f\n"
+      "ldr q24, [x21, #0x0]\n"
+      "ldr q25, [x21, #0x10]\n"
+      "ldr q26, [x21, #0x20]\n"
+      "ldr q27, [x21, #0x30]\n"
+      "ldr q28, [x21, #0x40]\n"
+      "ldr q29, [x21, #0x50]\n"
+      "add x21, x21, #0x60\n"
+      "b 63f\n"
+      "62:"  // Width 6: no bias
+      "movi v24.16b, #0x0\n"
+      "movi v25.16b, #0x0\n"
+      "movi v26.16b, #0x0\n"
+      "movi v27.16b, #0x0\n"
+      "movi v28.16b, #0x0\n"
+      "movi v29.16b, #0x0\n"
+      "63:"  // Width 6: setup done
+      "cmp x20, #0x4\n"
+      "blt 66f\n"
+      "cmp x20, #0x8\n"
+      "blt 65f\n"
+      "64:"  // Width 6: Multiply loop: Main loop head
+      "ldr q0, [x19, #0x0]\n"
+      "ldr q1, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v1.4s, v0.s[0]\n"
+      "ldr q2, [%x[B_ptr], #0x10]\n"
+      "ldr q3, [%x[B_ptr], #0x20]\n"
+      "fmla v25.4s, v2.4s, v0.s[0]\n"
+      "ldr q4, [%x[B_ptr], #0x30]\n"
+      "fmla v26.4s, v3.4s, v0.s[0]\n"
+      "ldr q5, [%x[B_ptr], #0x40]\n"
+      "ldr q6, [%x[B_ptr], #0x50]\n"
+      "fmla v27.4s, v4.4s, v0.s[0]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v28.4s, v5.4s, v0.s[0]\n"
+      "ldr q7, [%x[B_ptr], #0x0]\n"
+      "fmla v29.4s, v6.4s, v0.s[0]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q8, [%x[B_ptr], #0x10]\n"
+      "fmla v24.4s, v7.4s, v0.s[1]\n"
+      "ldr q9, [%x[B_ptr], #0x20]\n"
+      "ldr q10, [%x[B_ptr], #0x30]\n"
+      "fmla v25.4s, v8.4s, v0.s[1]\n"
+      "ldr q11, [%x[B_ptr], #0x40]\n"
+      "fmla v26.4s, v9.4s, v0.s[1]\n"
+      "ldr q12, [%x[B_ptr], #0x50]\n"
+      "fmla v27.4s, v10.4s, v0.s[1]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v28.4s, v11.4s, v0.s[1]\n"
+      "ldr q13, [%x[B_ptr], #0x0]\n"
+      "fmla v29.4s, v12.4s, v0.s[1]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q14, [%x[B_ptr], #0x10]\n"
+      "fmla v24.4s, v13.4s, v0.s[2]\n"
+      "ldr q15, [%x[B_ptr], #0x20]\n"
+      "ldr q16, [%x[B_ptr], #0x30]\n"
+      "fmla v25.4s, v14.4s, v0.s[2]\n"
+      "ldr q17, [%x[B_ptr], #0x40]\n"
+      "ldr q18, [%x[B_ptr], #0x50]\n"
+      "fmla v26.4s, v15.4s, v0.s[2]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v27.4s, v16.4s, v0.s[2]\n"
+      "ldr q19, [%x[B_ptr], #0x0]\n"
+      "fmla v28.4s, v17.4s, v0.s[2]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q20, [%x[B_ptr], #0x10]\n"
+      "fmla v29.4s, v18.4s, v0.s[2]\n"
+      "ldr q21, [%x[B_ptr], #0x20]\n"
+      "ldr q22, [%x[B_ptr], #0x30]\n"
+      "fmla v24.4s, v19.4s, v0.s[3]\n"
+      "ldr q23, [%x[B_ptr], #0x40]\n"
+      "ldr q1, [%x[B_ptr], #0x50]\n"
+      "fmla v25.4s, v20.4s, v0.s[3]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v26.4s, v21.4s, v0.s[3]\n"
+      "add x19, x19, #0x10\n"
+      "fmla v27.4s, v22.4s, v0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "sub x20, x20, #0x4\n"
+      "fmla v28.4s, v23.4s, v0.s[3]\n"
+      "prfm pldl1keep, [x19, #0x80]\n"
+      "cmp x20, #0x8\n"
+      "fmla v29.4s, v1.4s, v0.s[3]\n"
+      "bge 64b\n"
+      "65:"  // Width 6: Multiply loop: Single iteration only
+      "sub x20, x20, #0x4\n"
+      "ldr q0, [x19, #0x0]\n"
+      "ldr q2, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v2.4s, v0.s[0]\n"
+      "ldr q3, [%x[B_ptr], #0x10]\n"
+      "ldr q4, [%x[B_ptr], #0x20]\n"
+      "fmla v25.4s, v3.4s, v0.s[0]\n"
+      "ldr q5, [%x[B_ptr], #0x30]\n"
+      "fmla v26.4s, v4.4s, v0.s[0]\n"
+      "ldr q6, [%x[B_ptr], #0x40]\n"
+      "ldr q7, [%x[B_ptr], #0x50]\n"
+      "fmla v27.4s, v5.4s, v0.s[0]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v28.4s, v6.4s, v0.s[0]\n"
+      "ldr q8, [%x[B_ptr], #0x0]\n"
+      "fmla v29.4s, v7.4s, v0.s[0]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q9, [%x[B_ptr], #0x10]\n"
+      "fmla v24.4s, v8.4s, v0.s[1]\n"
+      "ldr q10, [%x[B_ptr], #0x20]\n"
+      "ldr q11, [%x[B_ptr], #0x30]\n"
+      "fmla v25.4s, v9.4s, v0.s[1]\n"
+      "ldr q12, [%x[B_ptr], #0x40]\n"
+      "fmla v26.4s, v10.4s, v0.s[1]\n"
+      "ldr q13, [%x[B_ptr], #0x50]\n"
+      "fmla v27.4s, v11.4s, v0.s[1]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v28.4s, v12.4s, v0.s[1]\n"
+      "ldr q14, [%x[B_ptr], #0x0]\n"
+      "fmla v29.4s, v13.4s, v0.s[1]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q15, [%x[B_ptr], #0x10]\n"
+      "fmla v24.4s, v14.4s, v0.s[2]\n"
+      "ldr q16, [%x[B_ptr], #0x20]\n"
+      "ldr q17, [%x[B_ptr], #0x30]\n"
+      "fmla v25.4s, v15.4s, v0.s[2]\n"
+      "ldr q18, [%x[B_ptr], #0x40]\n"
+      "ldr q19, [%x[B_ptr], #0x50]\n"
+      "fmla v26.4s, v16.4s, v0.s[2]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v27.4s, v17.4s, v0.s[2]\n"
+      "ldr q20, [%x[B_ptr], #0x0]\n"
+      "fmla v28.4s, v18.4s, v0.s[2]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q21, [%x[B_ptr], #0x10]\n"
+      "fmla v29.4s, v19.4s, v0.s[2]\n"
+      "ldr q22, [%x[B_ptr], #0x20]\n"
+      "ldr q23, [%x[B_ptr], #0x30]\n"
+      "fmla v24.4s, v20.4s, v0.s[3]\n"
+      "ldr q1, [%x[B_ptr], #0x40]\n"
+      "ldr q2, [%x[B_ptr], #0x50]\n"
+      "fmla v25.4s, v21.4s, v0.s[3]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v26.4s, v22.4s, v0.s[3]\n"
+      "add x19, x19, #0x10\n"
+      "fmla v27.4s, v23.4s, v0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "prfm pldl1keep, [x19, #0x80]\n"
+      "fmla v28.4s, v1.4s, v0.s[3]\n"
+      "fmla v29.4s, v2.4s, v0.s[3]\n"
+      "66:"  // Width 6: Multiply loop: Main loop skip
+      "cbz x20, 68f\n"
+      "67:"  // Width 6: Multiply loop: Odd block loop
+      "ldr s0, [x19], #0x4\n"
+      "ldr q3, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v3.4s, v0.s[0]\n"
+      "ldr q4, [%x[B_ptr], #0x10]\n"
+      "ldr q5, [%x[B_ptr], #0x20]\n"
+      "fmla v25.4s, v4.4s, v0.s[0]\n"
+      "ldr q6, [%x[B_ptr], #0x30]\n"
+      "fmla v26.4s, v5.4s, v0.s[0]\n"
+      "ldr q7, [%x[B_ptr], #0x40]\n"
+      "ldr q8, [%x[B_ptr], #0x50]\n"
+      "fmla v27.4s, v6.4s, v0.s[0]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "sub x20, x20, #0x1\n"
+      "fmla v28.4s, v7.4s, v0.s[0]\n"
+      "fmla v29.4s, v8.4s, v0.s[0]\n"
+      "cbnz x20, 67b\n"
+      "68:"  // Width 6: Multiply loop: No odd multiplies
+      "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+      "tbz %x[flags], #1, 69f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v17.4s }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v16.4s }, [x19]\n"
+      "fmin v24.4s, v24.4s, v16.4s\n"
+      "fmin v25.4s, v25.4s, v16.4s\n"
+      "fmin v26.4s, v26.4s, v16.4s\n"
+      "fmin v27.4s, v27.4s, v16.4s\n"
+      "fmax v24.4s, v24.4s, v17.4s\n"
+      "fmax v25.4s, v25.4s, v17.4s\n"
+      "fmax v26.4s, v26.4s, v17.4s\n"
+      "fmax v27.4s, v27.4s, v17.4s\n"
+      "fmin v28.4s, v28.4s, v16.4s\n"
+      "fmin v29.4s, v29.4s, v16.4s\n"
+      "fmax v28.4s, v28.4s, v17.4s\n"
+      "fmax v29.4s, v29.4s, v17.4s\n"
+      "69:"  // Width 6: No activation
+      "str q24, [%x[output_ptr], #0x0]\n"
+      "str q25, [%x[output_ptr], #0x10]\n"
+      "str q26, [%x[output_ptr], #0x20]\n"
+      "str q27, [%x[output_ptr], #0x30]\n"
+      "str q28, [%x[output_ptr], #0x40]\n"
+      "cmp %x[N], #0x18\n"
+      "add %x[output_ptr], %x[output_ptr], #0x50\n"
+      "blt 70f\n"
+      "str q29, [%x[output_ptr], #0x0]\n"
+      "add %x[output_ptr], %x[output_ptr], #0x10\n"
+      "b 72f\n"
+      "70:"  // Width 6: Partial writeback
+      "tbz %x[N], #1, 71f\n"
+      "str d29, [%x[output_ptr]], #0x8\n"
+      "tbz %x[N], #0, 72f\n"
+      "st1 { v29.s }[2], [%x[output_ptr]]\n"
+      "b 72f\n"
+      "71:"  // Width 6: Partial direct writeback: partial_1_20
+      "tbz %x[N], #0, 72f\n"
+      "str s29, [%x[output_ptr], #0x0]\n"
+      "72:"  // Width 6: Writeback done
+      "b 97f\n"
+      "73:"  // Width 7
+      "mov x20, %x[K]\n"
+      "mov x19, %x[A_ptr]\n"
+      "cbz x21, 74f\n"
+      "ldr q24, [x21, #0x0]\n"
+      "ldr q25, [x21, #0x10]\n"
+      "ldr q26, [x21, #0x20]\n"
+      "ldr q27, [x21, #0x30]\n"
+      "ldr q28, [x21, #0x40]\n"
+      "ldr q29, [x21, #0x50]\n"
+      "ldr q30, [x21, #0x60]\n"
+      "add x21, x21, #0x70\n"
+      "b 75f\n"
+      "74:"  // Width 7: no bias
+      "movi v24.16b, #0x0\n"
+      "movi v25.16b, #0x0\n"
+      "movi v26.16b, #0x0\n"
+      "movi v27.16b, #0x0\n"
+      "movi v28.16b, #0x0\n"
+      "movi v29.16b, #0x0\n"
+      "movi v30.16b, #0x0\n"
+      "75:"  // Width 7: setup done
+      "cmp x20, #0x4\n"
+      "blt 78f\n"
+      "cmp x20, #0x8\n"
+      "blt 77f\n"
+      "76:"  // Width 7: Multiply loop: Main loop head
+      "ldr q0, [x19, #0x0]\n"
+      "ldr q1, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v1.4s, v0.s[0]\n"
+      "ldr q2, [%x[B_ptr], #0x10]\n"
+      "ldr q3, [%x[B_ptr], #0x20]\n"
+      "fmla v25.4s, v2.4s, v0.s[0]\n"
+      "ldr q4, [%x[B_ptr], #0x30]\n"
+      "fmla v26.4s, v3.4s, v0.s[0]\n"
+      "ldr q5, [%x[B_ptr], #0x40]\n"
+      "ldr q6, [%x[B_ptr], #0x50]\n"
+      "fmla v27.4s, v4.4s, v0.s[0]\n"
+      "ldr q7, [%x[B_ptr], #0x60]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "fmla v28.4s, v5.4s, v0.s[0]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v29.4s, v6.4s, v0.s[0]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q8, [%x[B_ptr], #0x0]\n"
+      "fmla v30.4s, v7.4s, v0.s[0]\n"
+      "ldr q9, [%x[B_ptr], #0x10]\n"
+      "ldr q10, [%x[B_ptr], #0x20]\n"
+      "fmla v24.4s, v8.4s, v0.s[1]\n"
+      "ldr q11, [%x[B_ptr], #0x30]\n"
+      "ldr q12, [%x[B_ptr], #0x40]\n"
+      "fmla v25.4s, v9.4s, v0.s[1]\n"
+      "ldr q13, [%x[B_ptr], #0x50]\n"
+      "fmla v26.4s, v10.4s, v0.s[1]\n"
+      "ldr q14, [%x[B_ptr], #0x60]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "fmla v27.4s, v11.4s, v0.s[1]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v28.4s, v12.4s, v0.s[1]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q15, [%x[B_ptr], #0x0]\n"
+      "fmla v29.4s, v13.4s, v0.s[1]\n"
+      "ldr q16, [%x[B_ptr], #0x10]\n"
+      "ldr q17, [%x[B_ptr], #0x20]\n"
+      "fmla v30.4s, v14.4s, v0.s[1]\n"
+      "ldr q18, [%x[B_ptr], #0x30]\n"
+      "fmla v24.4s, v15.4s, v0.s[2]\n"
+      "ldr q19, [%x[B_ptr], #0x40]\n"
+      "ldr q20, [%x[B_ptr], #0x50]\n"
+      "fmla v25.4s, v16.4s, v0.s[2]\n"
+      "ldr q21, [%x[B_ptr], #0x60]\n"
+      "fmla v26.4s, v17.4s, v0.s[2]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v27.4s, v18.4s, v0.s[2]\n"
+      "ldr q22, [%x[B_ptr], #0x0]\n"
+      "fmla v28.4s, v19.4s, v0.s[2]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q23, [%x[B_ptr], #0x10]\n"
+      "fmla v29.4s, v20.4s, v0.s[2]\n"
+      "ldr q1, [%x[B_ptr], #0x20]\n"
+      "ldr q2, [%x[B_ptr], #0x30]\n"
+      "fmla v30.4s, v21.4s, v0.s[2]\n"
+      "ldr q3, [%x[B_ptr], #0x40]\n"
+      "fmla v24.4s, v22.4s, v0.s[3]\n"
+      "ldr q4, [%x[B_ptr], #0x50]\n"
+      "ldr q5, [%x[B_ptr], #0x60]\n"
+      "fmla v25.4s, v23.4s, v0.s[3]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "fmla v26.4s, v1.4s, v0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v27.4s, v2.4s, v0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "add x19, x19, #0x10\n"
+      "fmla v28.4s, v3.4s, v0.s[3]\n"
+      "prfm pldl1keep, [x19, #0x80]\n"
+      "sub x20, x20, #0x4\n"
+      "fmla v29.4s, v4.4s, v0.s[3]\n"
+      "cmp x20, #0x8\n"
+      "fmla v30.4s, v5.4s, v0.s[3]\n"
+      "bge 76b\n"
+      "77:"  // Width 7: Multiply loop: Single iteration only
+      "sub x20, x20, #0x4\n"
+      "ldr q0, [x19, #0x0]\n"
+      "ldr q6, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v6.4s, v0.s[0]\n"
+      "ldr q7, [%x[B_ptr], #0x10]\n"
+      "ldr q8, [%x[B_ptr], #0x20]\n"
+      "fmla v25.4s, v7.4s, v0.s[0]\n"
+      "ldr q9, [%x[B_ptr], #0x30]\n"
+      "fmla v26.4s, v8.4s, v0.s[0]\n"
+      "ldr q10, [%x[B_ptr], #0x40]\n"
+      "ldr q11, [%x[B_ptr], #0x50]\n"
+      "fmla v27.4s, v9.4s, v0.s[0]\n"
+      "ldr q12, [%x[B_ptr], #0x60]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "fmla v28.4s, v10.4s, v0.s[0]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v29.4s, v11.4s, v0.s[0]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q13, [%x[B_ptr], #0x0]\n"
+      "fmla v30.4s, v12.4s, v0.s[0]\n"
+      "ldr q14, [%x[B_ptr], #0x10]\n"
+      "ldr q15, [%x[B_ptr], #0x20]\n"
+      "fmla v24.4s, v13.4s, v0.s[1]\n"
+      "ldr q16, [%x[B_ptr], #0x30]\n"
+      "ldr q17, [%x[B_ptr], #0x40]\n"
+      "fmla v25.4s, v14.4s, v0.s[1]\n"
+      "ldr q18, [%x[B_ptr], #0x50]\n"
+      "fmla v26.4s, v15.4s, v0.s[1]\n"
+      "ldr q19, [%x[B_ptr], #0x60]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "fmla v27.4s, v16.4s, v0.s[1]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v28.4s, v17.4s, v0.s[1]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q20, [%x[B_ptr], #0x0]\n"
+      "fmla v29.4s, v18.4s, v0.s[1]\n"
+      "ldr q21, [%x[B_ptr], #0x10]\n"
+      "ldr q22, [%x[B_ptr], #0x20]\n"
+      "fmla v30.4s, v19.4s, v0.s[1]\n"
+      "ldr q23, [%x[B_ptr], #0x30]\n"
+      "fmla v24.4s, v20.4s, v0.s[2]\n"
+      "ldr q1, [%x[B_ptr], #0x40]\n"
+      "ldr q2, [%x[B_ptr], #0x50]\n"
+      "fmla v25.4s, v21.4s, v0.s[2]\n"
+      "ldr q3, [%x[B_ptr], #0x60]\n"
+      "fmla v26.4s, v22.4s, v0.s[2]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v27.4s, v23.4s, v0.s[2]\n"
+      "ldr q4, [%x[B_ptr], #0x0]\n"
+      "fmla v28.4s, v1.4s, v0.s[2]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q5, [%x[B_ptr], #0x10]\n"
+      "fmla v29.4s, v2.4s, v0.s[2]\n"
+      "ldr q6, [%x[B_ptr], #0x20]\n"
+      "ldr q7, [%x[B_ptr], #0x30]\n"
+      "fmla v30.4s, v3.4s, v0.s[2]\n"
+      "ldr q8, [%x[B_ptr], #0x40]\n"
+      "fmla v24.4s, v4.4s, v0.s[3]\n"
+      "ldr q9, [%x[B_ptr], #0x50]\n"
+      "ldr q10, [%x[B_ptr], #0x60]\n"
+      "fmla v25.4s, v5.4s, v0.s[3]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "fmla v26.4s, v6.4s, v0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v27.4s, v7.4s, v0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "add x19, x19, #0x10\n"
+      "fmla v28.4s, v8.4s, v0.s[3]\n"
+      "prfm pldl1keep, [x19, #0x80]\n"
+      "fmla v29.4s, v9.4s, v0.s[3]\n"
+      "fmla v30.4s, v10.4s, v0.s[3]\n"
+      "78:"  // Width 7: Multiply loop: Main loop skip
+      "cbz x20, 80f\n"
+      "79:"  // Width 7: Multiply loop: Odd block loop
+      "ldr s0, [x19], #0x4\n"
+      "ldr q11, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v11.4s, v0.s[0]\n"
+      "ldr q12, [%x[B_ptr], #0x10]\n"
+      "ldr q13, [%x[B_ptr], #0x20]\n"
+      "fmla v25.4s, v12.4s, v0.s[0]\n"
+      "ldr q14, [%x[B_ptr], #0x30]\n"
+      "fmla v26.4s, v13.4s, v0.s[0]\n"
+      "ldr q15, [%x[B_ptr], #0x40]\n"
+      "ldr q16, [%x[B_ptr], #0x50]\n"
+      "fmla v27.4s, v14.4s, v0.s[0]\n"
+      "ldr q17, [%x[B_ptr], #0x60]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "fmla v28.4s, v15.4s, v0.s[0]\n"
+      "fmla v29.4s, v16.4s, v0.s[0]\n"
+      "sub x20, x20, #0x1\n"
+      "fmla v30.4s, v17.4s, v0.s[0]\n"
+      "cbnz x20, 79b\n"
+      "80:"  // Width 7: Multiply loop: No odd multiplies
+      "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+      "tbz %x[flags], #1, 81f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v17.4s }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v16.4s }, [x19]\n"
+      "fmin v24.4s, v24.4s, v16.4s\n"
+      "fmin v25.4s, v25.4s, v16.4s\n"
+      "fmin v26.4s, v26.4s, v16.4s\n"
+      "fmin v27.4s, v27.4s, v16.4s\n"
+      "fmax v24.4s, v24.4s, v17.4s\n"
+      "fmax v25.4s, v25.4s, v17.4s\n"
+      "fmax v26.4s, v26.4s, v17.4s\n"
+      "fmax v27.4s, v27.4s, v17.4s\n"
+      "fmin v28.4s, v28.4s, v16.4s\n"
+      "fmin v29.4s, v29.4s, v16.4s\n"
+      "fmin v30.4s, v30.4s, v16.4s\n"
+      "fmax v28.4s, v28.4s, v17.4s\n"
+      "fmax v29.4s, v29.4s, v17.4s\n"
+      "fmax v30.4s, v30.4s, v17.4s\n"
+      "81:"  // Width 7: No activation
+      "str q24, [%x[output_ptr], #0x0]\n"
+      "str q25, [%x[output_ptr], #0x10]\n"
+      "str q26, [%x[output_ptr], #0x20]\n"
+      "str q27, [%x[output_ptr], #0x30]\n"
+      "str q28, [%x[output_ptr], #0x40]\n"
+      "str q29, [%x[output_ptr], #0x50]\n"
+      "cmp %x[N], #0x1c\n"
+      "add %x[output_ptr], %x[output_ptr], #0x60\n"
+      "blt 82f\n"
+      "str q30, [%x[output_ptr], #0x0]\n"
+      "add %x[output_ptr], %x[output_ptr], #0x10\n"
+      "b 84f\n"
+      "82:"  // Width 7: Partial writeback
+      "tbz %x[N], #1, 83f\n"
+      "str d30, [%x[output_ptr]], #0x8\n"
+      "tbz %x[N], #0, 84f\n"
+      "st1 { v30.s }[2], [%x[output_ptr]]\n"
+      "b 84f\n"
+      "83:"  // Width 7: Partial direct writeback: partial_1_24
+      "tbz %x[N], #0, 84f\n"
+      "str s30, [%x[output_ptr], #0x0]\n"
+      "84:"  // Width 7: Writeback done
+      "b 97f\n"
+      "85:"  // Width 8
+      "mov x20, %x[K]\n"
+      "mov x19, %x[A_ptr]\n"
+      "cbz x21, 86f\n"
+      "ldr q24, [x21, #0x0]\n"
+      "ldr q25, [x21, #0x10]\n"
+      "ldr q26, [x21, #0x20]\n"
+      "ldr q27, [x21, #0x30]\n"
+      "ldr q28, [x21, #0x40]\n"
+      "ldr q29, [x21, #0x50]\n"
+      "ldr q30, [x21, #0x60]\n"
+      "ldr q31, [x21, #0x70]\n"
+      "add x21, x21, #0x80\n"
+      "b 87f\n"
+      "86:"  // Width 8: no bias
+      "movi v24.16b, #0x0\n"
+      "movi v25.16b, #0x0\n"
+      "movi v26.16b, #0x0\n"
+      "movi v27.16b, #0x0\n"
+      "movi v28.16b, #0x0\n"
+      "movi v29.16b, #0x0\n"
+      "movi v30.16b, #0x0\n"
+      "movi v31.16b, #0x0\n"
+      "87:"  // Width 8: setup done
+      "cmp x20, #0x4\n"
+      "blt 90f\n"
+      "cmp x20, #0x8\n"
+      "blt 89f\n"
+      "88:"  // Width 8: Multiply loop: Main loop head
+      "ldr q0, [x19, #0x0]\n"
+      "ldr q1, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v1.4s, v0.s[0]\n"
+      "ldr q2, [%x[B_ptr], #0x10]\n"
+      "ldr q3, [%x[B_ptr], #0x20]\n"
+      "fmla v25.4s, v2.4s, v0.s[0]\n"
+      "ldr q4, [%x[B_ptr], #0x30]\n"
+      "fmla v26.4s, v3.4s, v0.s[0]\n"
+      "ldr q5, [%x[B_ptr], #0x40]\n"
+      "ldr q6, [%x[B_ptr], #0x50]\n"
+      "fmla v27.4s, v4.4s, v0.s[0]\n"
+      "ldr q7, [%x[B_ptr], #0x60]\n"
+      "ldr q8, [%x[B_ptr], #0x70]\n"
+      "fmla v28.4s, v5.4s, v0.s[0]\n"
+      "fmla v29.4s, v6.4s, v0.s[0]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v30.4s, v7.4s, v0.s[0]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q9, [%x[B_ptr], #0x0]\n"
+      "fmla v31.4s, v8.4s, v0.s[0]\n"
+      "ldr q10, [%x[B_ptr], #0x10]\n"
+      "ldr q11, [%x[B_ptr], #0x20]\n"
+      "fmla v24.4s, v9.4s, v0.s[1]\n"
+      "ldr q12, [%x[B_ptr], #0x30]\n"
+      "ldr q13, [%x[B_ptr], #0x40]\n"
+      "fmla v25.4s, v10.4s, v0.s[1]\n"
+      "fmla v26.4s, v11.4s, v0.s[1]\n"
+      "ldr q14, [%x[B_ptr], #0x50]\n"
+      "ldr q15, [%x[B_ptr], #0x60]\n"
+      "fmla v27.4s, v12.4s, v0.s[1]\n"
+      "ldr q16, [%x[B_ptr], #0x70]\n"
+      "fmla v28.4s, v13.4s, v0.s[1]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v29.4s, v14.4s, v0.s[1]\n"
+      "ldr q17, [%x[B_ptr], #0x0]\n"
+      "fmla v30.4s, v15.4s, v0.s[1]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q18, [%x[B_ptr], #0x10]\n"
+      "fmla v31.4s, v16.4s, v0.s[1]\n"
+      "ldr q19, [%x[B_ptr], #0x20]\n"
+      "ldr q20, [%x[B_ptr], #0x30]\n"
+      "fmla v24.4s, v17.4s, v0.s[2]\n"
+      "ldr q21, [%x[B_ptr], #0x40]\n"
+      "ldr q22, [%x[B_ptr], #0x50]\n"
+      "fmla v25.4s, v18.4s, v0.s[2]\n"
+      "ldr q23, [%x[B_ptr], #0x60]\n"
+      "fmla v26.4s, v19.4s, v0.s[2]\n"
+      "ldr q1, [%x[B_ptr], #0x70]\n"
+      "fmla v27.4s, v20.4s, v0.s[2]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v28.4s, v21.4s, v0.s[2]\n"
+      "ldr q2, [%x[B_ptr], #0x0]\n"
+      "fmla v29.4s, v22.4s, v0.s[2]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q3, [%x[B_ptr], #0x10]\n"
+      "fmla v30.4s, v23.4s, v0.s[2]\n"
+      "ldr q4, [%x[B_ptr], #0x20]\n"
+      "ldr q5, [%x[B_ptr], #0x30]\n"
+      "fmla v31.4s, v1.4s, v0.s[2]\n"
+      "ldr q6, [%x[B_ptr], #0x40]\n"
+      "fmla v24.4s, v2.4s, v0.s[3]\n"
+      "ldr q7, [%x[B_ptr], #0x50]\n"
+      "ldr q8, [%x[B_ptr], #0x60]\n"
+      "fmla v25.4s, v3.4s, v0.s[3]\n"
+      "ldr q9, [%x[B_ptr], #0x70]\n"
+      "fmla v26.4s, v4.4s, v0.s[3]\n"
+      "fmla v27.4s, v5.4s, v0.s[3]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v28.4s, v6.4s, v0.s[3]\n"
+      "add x19, x19, #0x10\n"
+      "fmla v29.4s, v7.4s, v0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "sub x20, x20, #0x4\n"
+      "fmla v30.4s, v8.4s, v0.s[3]\n"
+      "prfm pldl1keep, [x19, #0x80]\n"
+      "cmp x20, #0x8\n"
+      "fmla v31.4s, v9.4s, v0.s[3]\n"
+      "bge 88b\n"
+      "89:"  // Width 8: Multiply loop: Single iteration only
+      "sub x20, x20, #0x4\n"
+      "ldr q0, [x19, #0x0]\n"
+      "ldr q10, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v10.4s, v0.s[0]\n"
+      "ldr q11, [%x[B_ptr], #0x10]\n"
+      "ldr q12, [%x[B_ptr], #0x20]\n"
+      "fmla v25.4s, v11.4s, v0.s[0]\n"
+      "ldr q13, [%x[B_ptr], #0x30]\n"
+      "fmla v26.4s, v12.4s, v0.s[0]\n"
+      "ldr q14, [%x[B_ptr], #0x40]\n"
+      "ldr q15, [%x[B_ptr], #0x50]\n"
+      "fmla v27.4s, v13.4s, v0.s[0]\n"
+      "ldr q16, [%x[B_ptr], #0x60]\n"
+      "ldr q17, [%x[B_ptr], #0x70]\n"
+      "fmla v28.4s, v14.4s, v0.s[0]\n"
+      "fmla v29.4s, v15.4s, v0.s[0]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v30.4s, v16.4s, v0.s[0]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q18, [%x[B_ptr], #0x0]\n"
+      "fmla v31.4s, v17.4s, v0.s[0]\n"
+      "ldr q19, [%x[B_ptr], #0x10]\n"
+      "ldr q20, [%x[B_ptr], #0x20]\n"
+      "fmla v24.4s, v18.4s, v0.s[1]\n"
+      "ldr q21, [%x[B_ptr], #0x30]\n"
+      "ldr q22, [%x[B_ptr], #0x40]\n"
+      "fmla v25.4s, v19.4s, v0.s[1]\n"
+      "fmla v26.4s, v20.4s, v0.s[1]\n"
+      "ldr q23, [%x[B_ptr], #0x50]\n"
+      "ldr q1, [%x[B_ptr], #0x60]\n"
+      "fmla v27.4s, v21.4s, v0.s[1]\n"
+      "ldr q2, [%x[B_ptr], #0x70]\n"
+      "fmla v28.4s, v22.4s, v0.s[1]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v29.4s, v23.4s, v0.s[1]\n"
+      "ldr q3, [%x[B_ptr], #0x0]\n"
+      "fmla v30.4s, v1.4s, v0.s[1]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q4, [%x[B_ptr], #0x10]\n"
+      "fmla v31.4s, v2.4s, v0.s[1]\n"
+      "ldr q5, [%x[B_ptr], #0x20]\n"
+      "ldr q6, [%x[B_ptr], #0x30]\n"
+      "fmla v24.4s, v3.4s, v0.s[2]\n"
+      "ldr q7, [%x[B_ptr], #0x40]\n"
+      "ldr q8, [%x[B_ptr], #0x50]\n"
+      "fmla v25.4s, v4.4s, v0.s[2]\n"
+      "ldr q9, [%x[B_ptr], #0x60]\n"
+      "fmla v26.4s, v5.4s, v0.s[2]\n"
+      "ldr q10, [%x[B_ptr], #0x70]\n"
+      "fmla v27.4s, v6.4s, v0.s[2]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v28.4s, v7.4s, v0.s[2]\n"
+      "ldr q11, [%x[B_ptr], #0x0]\n"
+      "fmla v29.4s, v8.4s, v0.s[2]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ldr q12, [%x[B_ptr], #0x10]\n"
+      "fmla v30.4s, v9.4s, v0.s[2]\n"
+      "ldr q13, [%x[B_ptr], #0x20]\n"
+      "ldr q14, [%x[B_ptr], #0x30]\n"
+      "fmla v31.4s, v10.4s, v0.s[2]\n"
+      "ldr q15, [%x[B_ptr], #0x40]\n"
+      "fmla v24.4s, v11.4s, v0.s[3]\n"
+      "ldr q16, [%x[B_ptr], #0x50]\n"
+      "ldr q17, [%x[B_ptr], #0x60]\n"
+      "fmla v25.4s, v12.4s, v0.s[3]\n"
+      "ldr q18, [%x[B_ptr], #0x70]\n"
+      "fmla v26.4s, v13.4s, v0.s[3]\n"
+      "fmla v27.4s, v14.4s, v0.s[3]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla v28.4s, v15.4s, v0.s[3]\n"
+      "add x19, x19, #0x10\n"
+      "fmla v29.4s, v16.4s, v0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "fmla v30.4s, v17.4s, v0.s[3]\n"
+      "prfm pldl1keep, [x19, #0x80]\n"
+      "fmla v31.4s, v18.4s, v0.s[3]\n"
+      "90:"  // Width 8: Multiply loop: Main loop skip
+      "cbz x20, 92f\n"
+      "91:"  // Width 8: Multiply loop: Odd block loop
+      "ldr s0, [x19], #0x4\n"
+      "ldr q19, [%x[B_ptr], #0x0]\n"
+      "fmla v24.4s, v19.4s, v0.s[0]\n"
+      "ldr q20, [%x[B_ptr], #0x10]\n"
+      "ldr q21, [%x[B_ptr], #0x20]\n"
+      "fmla v25.4s, v20.4s, v0.s[0]\n"
+      "ldr q22, [%x[B_ptr], #0x30]\n"
+      "fmla v26.4s, v21.4s, v0.s[0]\n"
+      "ldr q23, [%x[B_ptr], #0x40]\n"
+      "ldr q1, [%x[B_ptr], #0x50]\n"
+      "fmla v27.4s, v22.4s, v0.s[0]\n"
+      "ldr q2, [%x[B_ptr], #0x60]\n"
+      "ldr q3, [%x[B_ptr], #0x70]\n"
+      "fmla v28.4s, v23.4s, v0.s[0]\n"
+      "fmla v29.4s, v1.4s, v0.s[0]\n"
+      "add %x[B_ptr], %x[B_ptr], #0x80\n"
+      "sub x20, x20, #0x1\n"
+      "fmla v30.4s, v2.4s, v0.s[0]\n"
+      "fmla v31.4s, v3.4s, v0.s[0]\n"
+      "cbnz x20, 91b\n"
+      "92:"  // Width 8: Multiply loop: No odd multiplies
+      "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+      "tbz %x[flags], #1, 93f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v17.4s }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v16.4s }, [x19]\n"
+      "fmin v24.4s, v24.4s, v16.4s\n"
+      "fmin v25.4s, v25.4s, v16.4s\n"
+      "fmin v26.4s, v26.4s, v16.4s\n"
+      "fmin v27.4s, v27.4s, v16.4s\n"
+      "fmax v24.4s, v24.4s, v17.4s\n"
+      "fmax v25.4s, v25.4s, v17.4s\n"
+      "fmax v26.4s, v26.4s, v17.4s\n"
+      "fmax v27.4s, v27.4s, v17.4s\n"
+      "fmin v28.4s, v28.4s, v16.4s\n"
+      "fmin v29.4s, v29.4s, v16.4s\n"
+      "fmin v30.4s, v30.4s, v16.4s\n"
+      "fmax v28.4s, v28.4s, v17.4s\n"
+      "fmax v29.4s, v29.4s, v17.4s\n"
+      "fmax v30.4s, v30.4s, v17.4s\n"
+      "fmin v31.4s, v31.4s, v16.4s\n"
+      "fmax v31.4s, v31.4s, v17.4s\n"
+      "93:"  // Width 8: No activation
+      "str q24, [%x[output_ptr], #0x0]\n"
+      "str q25, [%x[output_ptr], #0x10]\n"
+      "str q26, [%x[output_ptr], #0x20]\n"
+      "str q27, [%x[output_ptr], #0x30]\n"
+      "str q28, [%x[output_ptr], #0x40]\n"
+      "str q29, [%x[output_ptr], #0x50]\n"
+      "str q30, [%x[output_ptr], #0x60]\n"
+      "cmp %x[N], #0x20\n"
+      "add %x[output_ptr], %x[output_ptr], #0x70\n"
+      "blt 94f\n"
+      "str q31, [%x[output_ptr], #0x0]\n"
+      "add %x[output_ptr], %x[output_ptr], #0x10\n"
+      "b 96f\n"
+      "94:"  // Width 8: Partial writeback
+      "tbz %x[N], #1, 95f\n"
+      "str d31, [%x[output_ptr]], #0x8\n"
+      "tbz %x[N], #0, 96f\n"
+      "st1 { v31.s }[2], [%x[output_ptr]]\n"
+      "b 96f\n"
+      "95:"  // Width 8: Partial direct writeback: partial_1_28
+      "tbz %x[N], #0, 96f\n"
+      "str s31, [%x[output_ptr], #0x0]\n"
+      "96:"  // Width 8: Writeback done
+      "subs x22, x22, #0x8\n"
+      "sub %x[N], %x[N], #0x20\n"
+      "bgt 1b\n"
+      "97:"  // Exit
+
+      : [B_ptr] "+r" (B_ptr), [N] "+r" (N), [output_ptr] "+r" (output_ptr)
+      : [A_ptr] "r" (A_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval))
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
+    );
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp
index 79cae6002a..73fb5b7122 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp
@@ -30,15 +30,15 @@
 namespace arm_gemm {
 
 // Actual kernel implementations
-void a64_hgemm_asimd_24x8(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
-void a64_hgemm_asimd_24x8_a55r1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
-void a64_hgemm_asimd_24x8_x1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+void a64_hgemm_asimd_8x24(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+void a64_hgemm_asimd_8x24_a55r1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+void a64_hgemm_asimd_8x24_x1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
 
-// 24x8 HGEMM "strategy" class.  Describes the kernel properties.
+// 8x24 HGEMM "strategy" class.  Describes the kernel properties.
 //
 // The generic "gemm_opt" function will instantiate one of these (allowing
 // the constructor to pick a kernel implementation).
-class hgemm_24x8 {
+class cls_a64_hgemm_8x24 {
 public:
     typedef __fp16 operand_type;
     typedef __fp16 result_type;
@@ -62,15 +62,15 @@ public:
     StdTransformsFixed<operand_type, result_type, 8, 24> transforms = {};
 
     // Default to the generic kernel
-    kern_type kernel = a64_hgemm_asimd_24x8;
+    kern_type kernel = a64_hgemm_asimd_8x24;
 
-    hgemm_24x8(const CPUInfo *ci) {
+    cls_a64_hgemm_8x24(const CPUInfo *ci) {
         auto model = ci->get_cpu_model();
 
         if (model == CPUModel::A55r1) {
-            kernel = a64_hgemm_asimd_24x8_a55r1;
+            kernel = a64_hgemm_asimd_8x24_a55r1;
         } else if (model == CPUModel::X1) {
-            kernel = a64_hgemm_asimd_24x8_x1;
+            kernel = a64_hgemm_asimd_8x24_x1;
         }
     }
 };
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp
index 829ae30001..29cdd33893 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp
@@ -41,7 +41,7 @@
 
 namespace arm_gemm {
 
-void a64_hgemm_asimd_24x8_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
+void a64_hgemm_asimd_8x24_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
     const __fp16 *a_ptr = Apanel;
     __fp16 *c_ptr = Cpanel;
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp
index 657fade944..c9c48dd1c0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp
@@ -34,14 +34,14 @@
 // Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
 // Assume that "Bpanel" points to a chunk of B blocks (each size 24xK) in read-order.
 // Assume that "Cpanel" points to a chunk of C output blocks (each size
-// 24x8), the chunks being arranged in a row major fashion.
+// 8x24), the chunks being arranged in a row major fashion.
 //
 // Note that the intent of this is that either ablocks or bblocks will be 1
 // - this construction allows the output loop to proceed in either order.
 
 namespace arm_gemm {
 
-void a64_hgemm_asimd_24x8(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
+void a64_hgemm_asimd_8x24(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
     const __fp16 *a_ptr = Apanel;
     __fp16 *c_ptr = Cpanel;
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp
index 3bb8334126..a6d2405e7e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp
@@ -34,14 +34,14 @@
 // Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
 // Assume that "Bpanel" points to a chunk of B blocks (each size 24xK) in read-order.
 // Assume that "Cpanel" points to a chunk of C output blocks (each size
-// 24x8), the chunks being arranged in a row major fashion.
+// 8x24), the chunks being arranged in a row major fashion.
 //
 // Note that the intent of this is that either ablocks or bblocks will be 1
 // - this construction allows the output loop to proceed in either order.
 
 namespace arm_gemm {
 
-void a64_hgemm_asimd_24x8_x1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
+void a64_hgemm_asimd_8x24_x1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
     const __fp16 *a_ptr = Apanel;
     __fp16 *c_ptr = Cpanel;
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp
new file mode 100644
index 0000000000..a76c9949de
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+#include "../bfloat.hpp"
+
+#define ARGLIST  /* kernel parameter types; names below follow the definition in generic.cpp */ \
+   unsigned int, const unsigned int *, /* num_strings, string_lengths */ \
+   IndirectInputArg<bfloat16>, /* A_arg */ \
+   size_t, size_t, /* M, N */ \
+   const bfloat16 *, /* B_ptr */ \
+   IndirectOutputArg<float>, /* output_arg */ \
+   const float *, Activation, bool /* bias, act, accumulate */
+
+namespace arm_gemm
+{
+
+// Kernel entry point; defined in a64_hybrid_bf16fp32_dot_6x16/generic.cpp
+void a64_hybrid_bf16fp32_dot_6x16( ARGLIST );
+
+class cls_a64_hybrid_bf16fp32_dot_6x16 // Describes the kernel: operand/result types, blocking parameters, entry point
+{
+public:
+    typedef bfloat16 operand_type; // element type of the A and B operands
+    typedef float result_type; // element type of the output
+
+    typedef void (*kern_type)( ARGLIST ); // same parameter list as a64_hybrid_bf16fp32_dot_6x16()
+
+    /* Kernel blocking parameters (all compile-time constants) */
+    static constexpr unsigned int out_height()
+    {
+        return 6;
+    }
+
+    static constexpr unsigned int out_width()
+    {
+        return 16;
+    }
+
+    static constexpr unsigned int k_unroll()
+    {
+        return 2;
+    }
+
+    static constexpr bool supports_accumulate() // the kernel takes an `accumulate` flag
+    {
+        return true;
+    }
+
+    StdTransformsFixed<operand_type, result_type, 6, 16, 2> transforms = {}; // 6, 16, 2 mirror out_height(), out_width(), k_unroll()
+
+    // Default to the generic kernel
+    kern_type kernel=a64_hybrid_bf16fp32_dot_6x16;
+
+    cls_a64_hybrid_bf16fp32_dot_6x16(const CPUInfo *) // CPUInfo is unused: no CPU-specific variant is declared in this header
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp
new file mode 100644
index 0000000000..be680ed645
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp
@@ -0,0 +1,3668 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_hybrid_bf16fp32_dot_6x16 (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<bfloat16> A_arg,
+    size_t M, size_t N, const bfloat16 *B_ptr, IndirectOutputArg<float> output_arg,
+    const float *bias, Activation act, bool accumulate
+)
+{
+    struct KernelArgs {
+        float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+        float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+        unsigned int num_strings = {};
+        const unsigned int *string_lengths = {};
+        size_t N = {};
+        const bfloat16 *B_ptr = {};
+        size_t output_offset = {};
+        size_t input_initial_col = {};
+        size_t input_offset = {};
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    if (accumulate) {
+        flags |= 0x1;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    switch(act.type) {
+        default:
+        case Activation::Type::None:
+            break;
+        case Activation::Type::BoundedReLU:
+            ka.maxval = static_cast<float>(act.param1);
+            /* fall through */
+        case Activation::Type::ReLU:
+            ka.minval = 0;
+            flags |= 0x2;
+            break;
+    }
+    __asm__ __volatile__(
+
+      "1:"  // Row loop
+      "cmp %x[M], #0x6\n"
+      "bge 186f\n"
+      "cmp %x[M], #0x4\n"
+      "bgt 149f\n"
+      "beq 112f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 75f\n"
+      "beq 38f\n"
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 2f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "b 3f\n"
+      "2:"  // Height 1: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "3:"  // Height 1: Column loop
+      "cbz x14, 4f\n"
+      "ldr q8, [x14, #0x0]\n"
+      "ldr q9, [x14, #0x10]\n"
+      "ldr q10, [x14, #0x20]\n"
+      "ldr q11, [x14, #0x30]\n"
+      "add x14, x14, #0x40\n"
+      "b 15f\n"
+      "4:"  // Height 1: no bias
+      "tbz %x[flags], #0, 14f\n"
+      "cmp x16, #0x10\n"
+      "bge 13f\n"
+      "tbz x16, #3, 8f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v9.4s }, [x13], #0x10\n"
+      "tbz x16, #2, 6f\n"
+      "ld1 { v10.4s }, [x13], #0x10\n"
+      "tbz x16, #1, 5f\n"
+      "mov x19, #0x38\n"
+      "ldr d11, [x13], #0x8\n"
+      "tbz x16, #0, 12f\n"
+      "ld1 { v11.s }[2], [x13]\n"
+      "b 12f\n"
+      "5:"  // Height 1: Partial accumulate: partial_1_12
+      "mov x19, #0x30\n"
+      "tbz x16, #0, 12f\n"
+      "ldr s11, [x13, #0x0]\n"
+      "b 12f\n"
+      "6:"  // Height 1: Partial accumulate: partial_2_8
+      "tbz x16, #1, 7f\n"
+      "ldr d10, [x13], #0x8\n"
+      "mov x19, #0x28\n"
+      "tbz x16, #0, 12f\n"
+      "ld1 { v10.s }[2], [x13]\n"
+      "b 12f\n"
+      "7:"  // Height 1: Partial accumulate: partial_1_8
+      "mov x19, #0x20\n"
+      "tbz x16, #0, 12f\n"
+      "ldr s10, [x13, #0x0]\n"
+      "b 12f\n"
+      "8:"  // Height 1: Partial accumulate: partial_4_0
+      "tbz x16, #2, 10f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "tbz x16, #1, 9f\n"
+      "mov x19, #0x18\n"
+      "ldr d9, [x13], #0x8\n"
+      "tbz x16, #0, 12f\n"
+      "ld1 { v9.s }[2], [x13]\n"
+      "b 12f\n"
+      "9:"  // Height 1: Partial accumulate: partial_1_4
+      "mov x19, #0x10\n"
+      "tbz x16, #0, 12f\n"
+      "ldr s9, [x13, #0x0]\n"
+      "b 12f\n"
+      "10:"  // Height 1: Partial accumulate: partial_2_0
+      "tbz x16, #1, 11f\n"
+      "ldr d8, [x13], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x16, #0, 12f\n"
+      "ld1 { v8.s }[2], [x13]\n"
+      "b 12f\n"
+      "11:"  // Height 1: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s8, [x13, #0x0]\n"
+      "12:"  // Height 1: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "b 15f\n"
+      "13:"  // Height 1: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "b 15f\n"
+      "14:"  // Height 1: no accumulate
+      "movi v8.16b, #0x0\n"
+      "movi v9.16b, #0x0\n"
+      "movi v10.16b, #0x0\n"
+      "movi v11.16b, #0x0\n"
+      "15:"  // Height 1: setup done
+      "mov x12, #0x0\n"
+      "16:"  // Height 1: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 17f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "cbnz x12, 18f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #1\n"
+      "b 18f\n"
+      "17:"  // Height 1: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "18:"  // Height 1: input setup done
+      "cmp x11, #0x8\n"
+      "blt 21f\n"
+      "cmp x11, #0x10\n"
+      "blt 20f\n"
+      "19:"  // Height 1: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "ldr q6, [x15, #0x20]\n"
+      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "ldr q7, [x15, #0x50]\n"
+      ".inst 0x4f60f0c8  // bfdot v8.4s, v6.8h, v0.h[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      "sub x11, x11, #0x8\n"
+      ".inst 0x4f60f0e9  // bfdot v9.4s, v7.8h, v0.h[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      "cmp x11, #0x10\n"
+      ".inst 0x4f60f0ca  // bfdot v10.4s, v6.8h, v0.h[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      ".inst 0x4f60f0eb  // bfdot v11.4s, v7.8h, v0.h[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      ".inst 0x4f40f8c8  // bfdot v8.4s, v6.8h, v0.h[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      ".inst 0x4f40f8e9  // bfdot v9.4s, v7.8h, v0.h[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      ".inst 0x4f40f8ca  // bfdot v10.4s, v6.8h, v0.h[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      ".inst 0x4f40f8eb  // bfdot v11.4s, v7.8h, v0.h[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      ".inst 0x4f60f8c8  // bfdot v8.4s, v6.8h, v0.h[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      ".inst 0x4f60f8e9  // bfdot v9.4s, v7.8h, v0.h[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "add x15, x15, #0x100\n"
+      ".inst 0x4f60f8ca  // bfdot v10.4s, v6.8h, v0.h[3]\n"
+      ".inst 0x4f60f8eb  // bfdot v11.4s, v7.8h, v0.h[3]\n"
+      "bge 19b\n"
+      "20:"  // Height 1: Multiply loop: Single iteration only
+      "sub x11, x11, #0x8\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "ldr q6, [x15, #0x20]\n"
+      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "ldr q7, [x15, #0x50]\n"
+      ".inst 0x4f60f0c8  // bfdot v8.4s, v6.8h, v0.h[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      ".inst 0x4f60f0e9  // bfdot v9.4s, v7.8h, v0.h[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      ".inst 0x4f60f0ca  // bfdot v10.4s, v6.8h, v0.h[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      ".inst 0x4f60f0eb  // bfdot v11.4s, v7.8h, v0.h[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      ".inst 0x4f40f8c8  // bfdot v8.4s, v6.8h, v0.h[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      ".inst 0x4f40f8e9  // bfdot v9.4s, v7.8h, v0.h[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      ".inst 0x4f40f8ca  // bfdot v10.4s, v6.8h, v0.h[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      ".inst 0x4f40f8eb  // bfdot v11.4s, v7.8h, v0.h[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      ".inst 0x4f60f8c8  // bfdot v8.4s, v6.8h, v0.h[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      ".inst 0x4f60f8e9  // bfdot v9.4s, v7.8h, v0.h[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "add x15, x15, #0x100\n"
+      ".inst 0x4f60f8ca  // bfdot v10.4s, v6.8h, v0.h[3]\n"
+      ".inst 0x4f60f8eb  // bfdot v11.4s, v7.8h, v0.h[3]\n"
+      "21:"  // Height 1: Multiply loop: Main loop skip
+      "cbz x11, 26f\n"
+      "cmp x11, #0x2\n"
+      "blt 23f\n"
+      "22:"  // Height 1: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr q6, [x15, #0x0]\n"
+      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "ldr q6, [x15, #0x20]\n"
+      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
+      "sub x11, x11, #0x2\n"
+      "add x15, x15, #0x40\n"
+      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
+      "cmp x11, #0x2\n"
+      "bge 22b\n"
+      "cbz x11, 26f\n"
+      "23:"  // Height 1: Multiply loop: Skip odd blocks
+      "tbz x11, #1, 24f\n"
+      "ldr s0, [x10], #0x4\n"
+      "tbz x11, #0, 25f\n"
+      "ld1 { v0.h }[2], [x10]\n"
+      "b 25f\n"
+      "24:"  // Height 1: Multiply loop: Ragged operand read: partial_1_0
+      "ldr h0, [x10, #0x0]\n"
+      "25:"  // Height 1: Multiply loop: Ragged operand read: Done
+      "ldr q6, [x15, #0x0]\n"
+      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "ldr q6, [x15, #0x20]\n"
+      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
+      "add x15, x15, #0x40\n"
+      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
+      "26:"  // Height 1: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 16b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "tbz %x[flags], #1, 27f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v1.4s }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v0.4s }, [x19]\n"
+      "fmin v8.4s, v8.4s, v0.4s\n"
+      "fmin v9.4s, v9.4s, v0.4s\n"
+      "fmin v10.4s, v10.4s, v0.4s\n"
+      "fmin v11.4s, v11.4s, v0.4s\n"
+      "fmax v8.4s, v8.4s, v1.4s\n"
+      "fmax v9.4s, v9.4s, v1.4s\n"
+      "fmax v10.4s, v10.4s, v1.4s\n"
+      "fmax v11.4s, v11.4s, v1.4s\n"
+      "27:"  // Height 1: No activation
+      "cmp x16, #0x10\n"
+      "bge 36f\n"
+      "tbz x16, #3, 31f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v9.4s }, [x13], #0x10\n"
+      "tbz x16, #2, 29f\n"
+      "st1 { v10.4s }, [x13], #0x10\n"
+      "tbz x16, #1, 28f\n"
+      "str d11, [x13], #0x8\n"
+      "tbz x16, #0, 35f\n"
+      "st1 { v11.s }[2], [x13]\n"
+      "b 35f\n"
+      "28:"  // Height 1: Partial direct writeback: partial_1_12
+      "tbz x16, #0, 35f\n"
+      "str s11, [x13, #0x0]\n"
+      "b 35f\n"
+      "29:"  // Height 1: Partial direct writeback: partial_2_8
+      "tbz x16, #1, 30f\n"
+      "str d10, [x13], #0x8\n"
+      "tbz x16, #0, 35f\n"
+      "st1 { v10.s }[2], [x13]\n"
+      "b 35f\n"
+      "30:"  // Height 1: Partial direct writeback: partial_1_8
+      "tbz x16, #0, 35f\n"
+      "str s10, [x13, #0x0]\n"
+      "b 35f\n"
+      "31:"  // Height 1: Partial direct writeback: partial_4_0
+      "tbz x16, #2, 33f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "tbz x16, #1, 32f\n"
+      "str d9, [x13], #0x8\n"
+      "tbz x16, #0, 35f\n"
+      "st1 { v9.s }[2], [x13]\n"
+      "b 35f\n"
+      "32:"  // Height 1: Partial direct writeback: partial_1_4
+      "tbz x16, #0, 35f\n"
+      "str s9, [x13, #0x0]\n"
+      "b 35f\n"
+      "33:"  // Height 1: Partial direct writeback: partial_2_0
+      "tbz x16, #1, 34f\n"
+      "str d8, [x13], #0x8\n"
+      "tbz x16, #0, 35f\n"
+      "st1 { v8.s }[2], [x13]\n"
+      "b 35f\n"
+      "34:"  // Height 1: Partial direct writeback: partial_1_0
+      "str s8, [x13, #0x0]\n"
+      "35:"  // Height 1: Partial direct writeback: Done
+      "b 37f\n"
+      "36:"  // Height 1: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "37:"  // Height 1: Writeback done
+      "subs x16, x16, #0x10\n"
+      "bgt 3b\n"
+      "b 224f\n"
+      "38:"  // Height 2
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 39f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "b 40f\n"
+      "39:"  // Height 2: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "40:"  // Height 2: Column loop
+      "cbz x14, 41f\n"
+      "ldr q8, [x14, #0x0]\n"
+      "mov v12.16b, v8.16b\n"
+      "ldr q9, [x14, #0x10]\n"
+      "ldr q10, [x14, #0x20]\n"
+      "mov v13.16b, v9.16b\n"
+      "ldr q11, [x14, #0x30]\n"
+      "mov v14.16b, v10.16b\n"
+      "add x14, x14, #0x40\n"
+      "mov v15.16b, v11.16b\n"
+      "b 52f\n"
+      "41:"  // Height 2: no bias
+      "tbz %x[flags], #0, 51f\n"
+      "cmp x16, #0x10\n"
+      "bge 50f\n"
+      "tbz x16, #3, 45f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v9.4s }, [x13], #0x10\n"
+      "ld1 { v13.4s }, [x9], #0x10\n"
+      "tbz x16, #2, 43f\n"
+      "ld1 { v10.4s }, [x13], #0x10\n"
+      "ld1 { v14.4s }, [x9], #0x10\n"
+      "tbz x16, #1, 42f\n"
+      "mov x19, #0x38\n"
+      "ldr d11, [x13], #0x8\n"
+      "ldr d15, [x9], #0x8\n"
+      "tbz x16, #0, 49f\n"
+      "ld1 { v11.s }[2], [x13]\n"
+      "ld1 { v15.s }[2], [x9]\n"
+      "b 49f\n"
+      "42:"  // Height 2: Partial accumulate: partial_1_12
+      "mov x19, #0x30\n"
+      "tbz x16, #0, 49f\n"
+      "ldr s11, [x13, #0x0]\n"
+      "ldr s15, [x9, #0x0]\n"
+      "b 49f\n"
+      "43:"  // Height 2: Partial accumulate: partial_2_8
+      "tbz x16, #1, 44f\n"
+      "ldr d10, [x13], #0x8\n"
+      "ldr d14, [x9], #0x8\n"
+      "mov x19, #0x28\n"
+      "tbz x16, #0, 49f\n"
+      "ld1 { v10.s }[2], [x13]\n"
+      "ld1 { v14.s }[2], [x9]\n"
+      "b 49f\n"
+      "44:"  // Height 2: Partial accumulate: partial_1_8
+      "mov x19, #0x20\n"
+      "tbz x16, #0, 49f\n"
+      "ldr s10, [x13, #0x0]\n"
+      "ldr s14, [x9, #0x0]\n"
+      "b 49f\n"
+      "45:"  // Height 2: Partial accumulate: partial_4_0
+      "tbz x16, #2, 47f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "tbz x16, #1, 46f\n"
+      "mov x19, #0x18\n"
+      "ldr d9, [x13], #0x8\n"
+      "ldr d13, [x9], #0x8\n"
+      "tbz x16, #0, 49f\n"
+      "ld1 { v9.s }[2], [x13]\n"
+      "ld1 { v13.s }[2], [x9]\n"
+      "b 49f\n"
+      "46:"  // Height 2: Partial accumulate: partial_1_4
+      "mov x19, #0x10\n"
+      "tbz x16, #0, 49f\n"
+      "ldr s9, [x13, #0x0]\n"
+      "ldr s13, [x9, #0x0]\n"
+      "b 49f\n"
+      "47:"  // Height 2: Partial accumulate: partial_2_0
+      "tbz x16, #1, 48f\n"
+      "ldr d8, [x13], #0x8\n"
+      "ldr d12, [x9], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x16, #0, 49f\n"
+      "ld1 { v8.s }[2], [x13]\n"
+      "ld1 { v12.s }[2], [x9]\n"
+      "b 49f\n"
+      "48:"  // Height 2: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s8, [x13, #0x0]\n"
+      "ldr s12, [x9, #0x0]\n"
+      "49:"  // Height 2: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "sub x9, x9, x19\n"
+      "b 52f\n"
+      "50:"  // Height 2: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "ldr q12, [x9, #0x0]\n"
+      "ldr q13, [x9, #0x10]\n"
+      "ldr q14, [x9, #0x20]\n"
+      "ldr q15, [x9, #0x30]\n"
+      "b 52f\n"
+      "51:"  // Height 2: no accumulate
+      "movi v8.16b, #0x0\n"
+      "movi v9.16b, #0x0\n"
+      "movi v10.16b, #0x0\n"
+      "movi v11.16b, #0x0\n"
+      "movi v12.16b, #0x0\n"
+      "movi v13.16b, #0x0\n"
+      "movi v14.16b, #0x0\n"
+      "movi v15.16b, #0x0\n"
+      "52:"  // Height 2: setup done
+      "mov x12, #0x0\n"
+      "53:"  // Height 2: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 54f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "cbnz x12, 55f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #1\n"
+      "add x28, x28, x19, LSL #1\n"
+      "b 55f\n"
+      "54:"  // Height 2: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #1\n"
+      "55:"  // Height 2: input setup done
+      "cmp x11, #0x8\n"
+      "blt 58f\n"
+      "cmp x11, #0x10\n"
+      "blt 57f\n"
+      "56:"  // Height 2: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr q7, [x15, #0x30]\n"
+      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
+      "sub x11, x11, #0x8\n"
+      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      "cmp x11, #0x10\n"
+      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      ".inst 0x4f60f0c8  // bfdot v8.4s, v6.8h, v0.h[1]\n"
+      ".inst 0x4f61f0cc  // bfdot v12.4s, v6.8h, v1.h[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      ".inst 0x4f60f0e9  // bfdot v9.4s, v7.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ed  // bfdot v13.4s, v7.8h, v1.h[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      ".inst 0x4f60f0ca  // bfdot v10.4s, v6.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ce  // bfdot v14.4s, v6.8h, v1.h[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      ".inst 0x4f60f0eb  // bfdot v11.4s, v7.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ef  // bfdot v15.4s, v7.8h, v1.h[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      ".inst 0x4f40f8c8  // bfdot v8.4s, v6.8h, v0.h[2]\n"
+      ".inst 0x4f41f8cc  // bfdot v12.4s, v6.8h, v1.h[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      ".inst 0x4f40f8e9  // bfdot v9.4s, v7.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ed  // bfdot v13.4s, v7.8h, v1.h[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      ".inst 0x4f40f8ca  // bfdot v10.4s, v6.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ce  // bfdot v14.4s, v6.8h, v1.h[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      ".inst 0x4f40f8eb  // bfdot v11.4s, v7.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ef  // bfdot v15.4s, v7.8h, v1.h[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      ".inst 0x4f60f8c8  // bfdot v8.4s, v6.8h, v0.h[3]\n"
+      ".inst 0x4f61f8cc  // bfdot v12.4s, v6.8h, v1.h[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      ".inst 0x4f60f8e9  // bfdot v9.4s, v7.8h, v0.h[3]\n"
+      ".inst 0x4f61f8ed  // bfdot v13.4s, v7.8h, v1.h[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "add x15, x15, #0x100\n"
+      ".inst 0x4f60f8ca  // bfdot v10.4s, v6.8h, v0.h[3]\n"
+      ".inst 0x4f61f8ce  // bfdot v14.4s, v6.8h, v1.h[3]\n"
+      ".inst 0x4f60f8eb  // bfdot v11.4s, v7.8h, v0.h[3]\n"
+      ".inst 0x4f61f8ef  // bfdot v15.4s, v7.8h, v1.h[3]\n"
+      "bge 56b\n"
+      "57:"  // Height 2: Multiply loop: Single iteration only
+      "sub x11, x11, #0x8\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr q7, [x15, #0x30]\n"
+      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      ".inst 0x4f60f0c8  // bfdot v8.4s, v6.8h, v0.h[1]\n"
+      ".inst 0x4f61f0cc  // bfdot v12.4s, v6.8h, v1.h[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      ".inst 0x4f60f0e9  // bfdot v9.4s, v7.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ed  // bfdot v13.4s, v7.8h, v1.h[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      ".inst 0x4f60f0ca  // bfdot v10.4s, v6.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ce  // bfdot v14.4s, v6.8h, v1.h[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      ".inst 0x4f60f0eb  // bfdot v11.4s, v7.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ef  // bfdot v15.4s, v7.8h, v1.h[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      ".inst 0x4f40f8c8  // bfdot v8.4s, v6.8h, v0.h[2]\n"
+      ".inst 0x4f41f8cc  // bfdot v12.4s, v6.8h, v1.h[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      ".inst 0x4f40f8e9  // bfdot v9.4s, v7.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ed  // bfdot v13.4s, v7.8h, v1.h[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      ".inst 0x4f40f8ca  // bfdot v10.4s, v6.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ce  // bfdot v14.4s, v6.8h, v1.h[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      ".inst 0x4f40f8eb  // bfdot v11.4s, v7.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ef  // bfdot v15.4s, v7.8h, v1.h[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      ".inst 0x4f60f8c8  // bfdot v8.4s, v6.8h, v0.h[3]\n"
+      ".inst 0x4f61f8cc  // bfdot v12.4s, v6.8h, v1.h[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      ".inst 0x4f60f8e9  // bfdot v9.4s, v7.8h, v0.h[3]\n"
+      ".inst 0x4f61f8ed  // bfdot v13.4s, v7.8h, v1.h[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "add x15, x15, #0x100\n"
+      ".inst 0x4f60f8ca  // bfdot v10.4s, v6.8h, v0.h[3]\n"
+      ".inst 0x4f61f8ce  // bfdot v14.4s, v6.8h, v1.h[3]\n"
+      ".inst 0x4f60f8eb  // bfdot v11.4s, v7.8h, v0.h[3]\n"
+      ".inst 0x4f61f8ef  // bfdot v15.4s, v7.8h, v1.h[3]\n"
+      "58:"  // Height 2: Multiply loop: Main loop skip
+      "cbz x11, 63f\n"
+      "cmp x11, #0x2\n"
+      "blt 60f\n"
+      "59:"  // Height 2: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr q6, [x15, #0x0]\n"
+      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "sub x11, x11, #0x2\n"
+      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
+      "cmp x11, #0x2\n"
+      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
+      "add x15, x15, #0x40\n"
+      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
+      "bge 59b\n"
+      "cbz x11, 63f\n"
+      "60:"  // Height 2: Multiply loop: Skip odd blocks
+      "tbz x11, #1, 61f\n"
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "tbz x11, #0, 62f\n"
+      "ld1 { v0.h }[2], [x10]\n"
+      "ld1 { v1.h }[2], [x28]\n"
+      "b 62f\n"
+      "61:"  // Height 2: Multiply loop: Ragged operand read: partial_1_0
+      "ldr h0, [x10, #0x0]\n"
+      "ldr h1, [x28, #0x0]\n"
+      "62:"  // Height 2: Multiply loop: Ragged operand read: Done
+      "ldr q6, [x15, #0x0]\n"
+      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
+      "add x15, x15, #0x40\n"
+      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
+      "63:"  // Height 2: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 53b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "tbz %x[flags], #1, 64f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v1.4s }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v0.4s }, [x19]\n"
+      "fmin v8.4s, v8.4s, v0.4s\n"
+      "fmin v9.4s, v9.4s, v0.4s\n"
+      "fmin v10.4s, v10.4s, v0.4s\n"
+      "fmin v11.4s, v11.4s, v0.4s\n"
+      "fmax v8.4s, v8.4s, v1.4s\n"
+      "fmax v9.4s, v9.4s, v1.4s\n"
+      "fmax v10.4s, v10.4s, v1.4s\n"
+      "fmax v11.4s, v11.4s, v1.4s\n"
+      "fmin v12.4s, v12.4s, v0.4s\n"
+      "fmin v13.4s, v13.4s, v0.4s\n"
+      "fmin v14.4s, v14.4s, v0.4s\n"
+      "fmax v12.4s, v12.4s, v1.4s\n"
+      "fmax v13.4s, v13.4s, v1.4s\n"
+      "fmax v14.4s, v14.4s, v1.4s\n"
+      "fmin v15.4s, v15.4s, v0.4s\n"
+      "fmax v15.4s, v15.4s, v1.4s\n"
+      "64:"  // Height 2: No activation
+      "cmp x16, #0x10\n"
+      "bge 73f\n"
+      "tbz x16, #3, 68f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v9.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v13.4s }, [x9], #0x10\n"
+      "tbz x16, #2, 66f\n"
+      "st1 { v10.4s }, [x13], #0x10\n"
+      "st1 { v14.4s }, [x9], #0x10\n"
+      "tbz x16, #1, 65f\n"
+      "str d11, [x13], #0x8\n"
+      "str d15, [x9], #0x8\n"
+      "tbz x16, #0, 72f\n"
+      "st1 { v11.s }[2], [x13]\n"
+      "st1 { v15.s }[2], [x9]\n"
+      "b 72f\n"
+      "65:"  // Height 2: Partial direct writeback: partial_1_12
+      "tbz x16, #0, 72f\n"
+      "str s11, [x13, #0x0]\n"
+      "str s15, [x9, #0x0]\n"
+      "b 72f\n"
+      "66:"  // Height 2: Partial direct writeback: partial_2_8
+      "tbz x16, #1, 67f\n"
+      "str d10, [x13], #0x8\n"
+      "str d14, [x9], #0x8\n"
+      "tbz x16, #0, 72f\n"
+      "st1 { v10.s }[2], [x13]\n"
+      "st1 { v14.s }[2], [x9]\n"
+      "b 72f\n"
+      "67:"  // Height 2: Partial direct writeback: partial_1_8
+      "tbz x16, #0, 72f\n"
+      "str s10, [x13, #0x0]\n"
+      "str s14, [x9, #0x0]\n"
+      "b 72f\n"
+      "68:"  // Height 2: Partial direct writeback: partial_4_0
+      "tbz x16, #2, 70f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "tbz x16, #1, 69f\n"
+      "str d9, [x13], #0x8\n"
+      "str d13, [x9], #0x8\n"
+      "tbz x16, #0, 72f\n"
+      "st1 { v9.s }[2], [x13]\n"
+      "st1 { v13.s }[2], [x9]\n"
+      "b 72f\n"
+      "69:"  // Height 2: Partial direct writeback: partial_1_4
+      "tbz x16, #0, 72f\n"
+      "str s9, [x13, #0x0]\n"
+      "str s13, [x9, #0x0]\n"
+      "b 72f\n"
+      "70:"  // Height 2: Partial direct writeback: partial_2_0
+      "tbz x16, #1, 71f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "tbz x16, #0, 72f\n"
+      "st1 { v8.s }[2], [x13]\n"
+      "st1 { v12.s }[2], [x9]\n"
+      "b 72f\n"
+      "71:"  // Height 2: Partial direct writeback: partial_1_0
+      "str s8, [x13, #0x0]\n"
+      "str s12, [x9, #0x0]\n"
+      "72:"  // Height 2: Partial direct writeback: Done
+      "b 74f\n"
+      "73:"  // Height 2: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "str q12, [x9, #0x0]\n"
+      "str q13, [x9, #0x10]\n"
+      "str q14, [x9, #0x20]\n"
+      "str q15, [x9, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "add x9, x9, #0x40\n"
+      "74:"  // Height 2: Writeback done
+      "subs x16, x16, #0x10\n"
+      "bgt 40b\n"
+      "b 224f\n"
+      "75:"  // Height 3
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 76f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "add x27, x27, x19, LSL #2\n"
+      "b 77f\n"
+      "76:"  // Height 3: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "77:"  // Height 3: Column loop
+      "cbz x14, 78f\n"
+      "ldr q8, [x14, #0x0]\n"
+      "mov v12.16b, v8.16b\n"
+      "ldr q9, [x14, #0x10]\n"
+      "mov v16.16b, v8.16b\n"
+      "ldr q10, [x14, #0x20]\n"
+      "ldr q11, [x14, #0x30]\n"
+      "mov v13.16b, v9.16b\n"
+      "add x14, x14, #0x40\n"
+      "mov v17.16b, v9.16b\n"
+      "mov v14.16b, v10.16b\n"
+      "mov v15.16b, v11.16b\n"
+      "mov v18.16b, v10.16b\n"
+      "mov v19.16b, v11.16b\n"
+      "b 89f\n"
+      "78:"  // Height 3: no bias
+      "tbz %x[flags], #0, 88f\n"
+      "cmp x16, #0x10\n"
+      "bge 87f\n"
+      "tbz x16, #3, 82f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "ld1 { v9.4s }, [x13], #0x10\n"
+      "ld1 { v13.4s }, [x9], #0x10\n"
+      "ld1 { v17.4s }, [x27], #0x10\n"
+      "tbz x16, #2, 80f\n"
+      "ld1 { v10.4s }, [x13], #0x10\n"
+      "ld1 { v14.4s }, [x9], #0x10\n"
+      "ld1 { v18.4s }, [x27], #0x10\n"
+      "tbz x16, #1, 79f\n"
+      "mov x19, #0x38\n"
+      "ldr d11, [x13], #0x8\n"
+      "ldr d15, [x9], #0x8\n"
+      "ldr d19, [x27], #0x8\n"
+      "tbz x16, #0, 86f\n"
+      "ld1 { v11.s }[2], [x13]\n"
+      "ld1 { v15.s }[2], [x9]\n"
+      "ld1 { v19.s }[2], [x27]\n"
+      "b 86f\n"
+      "79:"  // Height 3: Partial accumulate: partial_1_12
+      "mov x19, #0x30\n"
+      "tbz x16, #0, 86f\n"
+      "ldr s11, [x13, #0x0]\n"
+      "ldr s15, [x9, #0x0]\n"
+      "ldr s19, [x27, #0x0]\n"
+      "b 86f\n"
+      "80:"  // Height 3: Partial accumulate: partial_2_8
+      "tbz x16, #1, 81f\n"
+      "ldr d10, [x13], #0x8\n"
+      "ldr d14, [x9], #0x8\n"
+      "ldr d18, [x27], #0x8\n"
+      "mov x19, #0x28\n"
+      "tbz x16, #0, 86f\n"
+      "ld1 { v10.s }[2], [x13]\n"
+      "ld1 { v14.s }[2], [x9]\n"
+      "ld1 { v18.s }[2], [x27]\n"
+      "b 86f\n"
+      "81:"  // Height 3: Partial accumulate: partial_1_8
+      "mov x19, #0x20\n"
+      "tbz x16, #0, 86f\n"
+      "ldr s10, [x13, #0x0]\n"
+      "ldr s14, [x9, #0x0]\n"
+      "ldr s18, [x27, #0x0]\n"
+      "b 86f\n"
+      "82:"  // Height 3: Partial accumulate: partial_4_0
+      "tbz x16, #2, 84f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "tbz x16, #1, 83f\n"
+      "mov x19, #0x18\n"
+      "ldr d9, [x13], #0x8\n"
+      "ldr d13, [x9], #0x8\n"
+      "ldr d17, [x27], #0x8\n"
+      "tbz x16, #0, 86f\n"
+      "ld1 { v9.s }[2], [x13]\n"
+      "ld1 { v13.s }[2], [x9]\n"
+      "ld1 { v17.s }[2], [x27]\n"
+      "b 86f\n"
+      "83:"  // Height 3: Partial accumulate: partial_1_4
+      "mov x19, #0x10\n"
+      "tbz x16, #0, 86f\n"
+      "ldr s9, [x13, #0x0]\n"
+      "ldr s13, [x9, #0x0]\n"
+      "ldr s17, [x27, #0x0]\n"
+      "b 86f\n"
+      "84:"  // Height 3: Partial accumulate: partial_2_0
+      "tbz x16, #1, 85f\n"
+      "ldr d8, [x13], #0x8\n"
+      "ldr d12, [x9], #0x8\n"
+      "ldr d16, [x27], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x16, #0, 86f\n"
+      "ld1 { v8.s }[2], [x13]\n"
+      "ld1 { v12.s }[2], [x9]\n"
+      "ld1 { v16.s }[2], [x27]\n"
+      "b 86f\n"
+      "85:"  // Height 3: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s8, [x13, #0x0]\n"
+      "ldr s12, [x9, #0x0]\n"
+      "ldr s16, [x27, #0x0]\n"
+      "86:"  // Height 3: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "sub x9, x9, x19\n"
+      "sub x27, x27, x19\n"
+      "b 89f\n"
+      "87:"  // Height 3: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "ldr q12, [x9, #0x0]\n"
+      "ldr q13, [x9, #0x10]\n"
+      "ldr q14, [x9, #0x20]\n"
+      "ldr q15, [x9, #0x30]\n"
+      "ldr q16, [x27, #0x0]\n"
+      "ldr q17, [x27, #0x10]\n"
+      "ldr q18, [x27, #0x20]\n"
+      "ldr q19, [x27, #0x30]\n"
+      "b 89f\n"
+      "88:"  // Height 3: no accumulate
+      "movi v8.16b, #0x0\n"
+      "movi v9.16b, #0x0\n"
+      "movi v10.16b, #0x0\n"
+      "movi v11.16b, #0x0\n"
+      "movi v12.16b, #0x0\n"
+      "movi v13.16b, #0x0\n"
+      "movi v14.16b, #0x0\n"
+      "movi v15.16b, #0x0\n"
+      "movi v16.16b, #0x0\n"
+      "movi v17.16b, #0x0\n"
+      "movi v18.16b, #0x0\n"
+      "movi v19.16b, #0x0\n"
+      "89:"  // Height 3: setup done
+      "mov x12, #0x0\n"
+      "90:"  // Height 3: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 91f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "cbnz x12, 92f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #1\n"
+      "add x28, x28, x19, LSL #1\n"
+      "add x26, x26, x19, LSL #1\n"
+      "b 92f\n"
+      "91:"  // Height 3: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #1\n"
+      "add x26, x28, x19, LSL #1\n"
+      "92:"  // Height 3: input setup done
+      "cmp x11, #0x8\n"
+      "blt 95f\n"
+      "cmp x11, #0x10\n"
+      "blt 94f\n"
+      "93:"  // Height 3: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4f42f0d0  // bfdot v16.4s, v6.8h, v2.h[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
+      "add x28, x28, #0x10\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x4f42f0f1  // bfdot v17.4s, v7.8h, v2.h[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr q7, [x15, #0x30]\n"
+      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
+      "sub x11, x11, #0x8\n"
+      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
+      "cmp x11, #0x10\n"
+      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      ".inst 0x4f60f0c8  // bfdot v8.4s, v6.8h, v0.h[1]\n"
+      ".inst 0x4f61f0cc  // bfdot v12.4s, v6.8h, v1.h[1]\n"
+      ".inst 0x4f62f0d0  // bfdot v16.4s, v6.8h, v2.h[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      ".inst 0x4f60f0e9  // bfdot v9.4s, v7.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ed  // bfdot v13.4s, v7.8h, v1.h[1]\n"
+      ".inst 0x4f62f0f1  // bfdot v17.4s, v7.8h, v2.h[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      ".inst 0x4f60f0ca  // bfdot v10.4s, v6.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ce  // bfdot v14.4s, v6.8h, v1.h[1]\n"
+      ".inst 0x4f62f0d2  // bfdot v18.4s, v6.8h, v2.h[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      ".inst 0x4f60f0eb  // bfdot v11.4s, v7.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ef  // bfdot v15.4s, v7.8h, v1.h[1]\n"
+      ".inst 0x4f62f0f3  // bfdot v19.4s, v7.8h, v2.h[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      ".inst 0x4f40f8c8  // bfdot v8.4s, v6.8h, v0.h[2]\n"
+      ".inst 0x4f41f8cc  // bfdot v12.4s, v6.8h, v1.h[2]\n"
+      ".inst 0x4f42f8d0  // bfdot v16.4s, v6.8h, v2.h[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      ".inst 0x4f40f8e9  // bfdot v9.4s, v7.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ed  // bfdot v13.4s, v7.8h, v1.h[2]\n"
+      ".inst 0x4f42f8f1  // bfdot v17.4s, v7.8h, v2.h[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      ".inst 0x4f40f8ca  // bfdot v10.4s, v6.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ce  // bfdot v14.4s, v6.8h, v1.h[2]\n"
+      ".inst 0x4f42f8d2  // bfdot v18.4s, v6.8h, v2.h[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      ".inst 0x4f40f8eb  // bfdot v11.4s, v7.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ef  // bfdot v15.4s, v7.8h, v1.h[2]\n"
+      ".inst 0x4f42f8f3  // bfdot v19.4s, v7.8h, v2.h[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      ".inst 0x4f60f8c8  // bfdot v8.4s, v6.8h, v0.h[3]\n"
+      ".inst 0x4f61f8cc  // bfdot v12.4s, v6.8h, v1.h[3]\n"
+      ".inst 0x4f62f8d0  // bfdot v16.4s, v6.8h, v2.h[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      ".inst 0x4f60f8e9  // bfdot v9.4s, v7.8h, v0.h[3]\n"
+      ".inst 0x4f61f8ed  // bfdot v13.4s, v7.8h, v1.h[3]\n"
+      ".inst 0x4f62f8f1  // bfdot v17.4s, v7.8h, v2.h[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "add x15, x15, #0x100\n"
+      ".inst 0x4f60f8ca  // bfdot v10.4s, v6.8h, v0.h[3]\n"
+      ".inst 0x4f61f8ce  // bfdot v14.4s, v6.8h, v1.h[3]\n"
+      ".inst 0x4f62f8d2  // bfdot v18.4s, v6.8h, v2.h[3]\n"
+      ".inst 0x4f60f8eb  // bfdot v11.4s, v7.8h, v0.h[3]\n"
+      ".inst 0x4f61f8ef  // bfdot v15.4s, v7.8h, v1.h[3]\n"
+      ".inst 0x4f62f8f3  // bfdot v19.4s, v7.8h, v2.h[3]\n"
+      "bge 93b\n"
+      "94:"  // Height 3: Multiply loop: Single iteration only
+      "sub x11, x11, #0x8\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4f42f0d0  // bfdot v16.4s, v6.8h, v2.h[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
+      "add x28, x28, #0x10\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x4f42f0f1  // bfdot v17.4s, v7.8h, v2.h[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr q7, [x15, #0x30]\n"
+      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      ".inst 0x4f60f0c8  // bfdot v8.4s, v6.8h, v0.h[1]\n"
+      ".inst 0x4f61f0cc  // bfdot v12.4s, v6.8h, v1.h[1]\n"
+      ".inst 0x4f62f0d0  // bfdot v16.4s, v6.8h, v2.h[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      ".inst 0x4f60f0e9  // bfdot v9.4s, v7.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ed  // bfdot v13.4s, v7.8h, v1.h[1]\n"
+      ".inst 0x4f62f0f1  // bfdot v17.4s, v7.8h, v2.h[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      ".inst 0x4f60f0ca  // bfdot v10.4s, v6.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ce  // bfdot v14.4s, v6.8h, v1.h[1]\n"
+      ".inst 0x4f62f0d2  // bfdot v18.4s, v6.8h, v2.h[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      ".inst 0x4f60f0eb  // bfdot v11.4s, v7.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ef  // bfdot v15.4s, v7.8h, v1.h[1]\n"
+      ".inst 0x4f62f0f3  // bfdot v19.4s, v7.8h, v2.h[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      ".inst 0x4f40f8c8  // bfdot v8.4s, v6.8h, v0.h[2]\n"
+      ".inst 0x4f41f8cc  // bfdot v12.4s, v6.8h, v1.h[2]\n"
+      ".inst 0x4f42f8d0  // bfdot v16.4s, v6.8h, v2.h[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      ".inst 0x4f40f8e9  // bfdot v9.4s, v7.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ed  // bfdot v13.4s, v7.8h, v1.h[2]\n"
+      ".inst 0x4f42f8f1  // bfdot v17.4s, v7.8h, v2.h[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      ".inst 0x4f40f8ca  // bfdot v10.4s, v6.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ce  // bfdot v14.4s, v6.8h, v1.h[2]\n"
+      ".inst 0x4f42f8d2  // bfdot v18.4s, v6.8h, v2.h[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      ".inst 0x4f40f8eb  // bfdot v11.4s, v7.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ef  // bfdot v15.4s, v7.8h, v1.h[2]\n"
+      ".inst 0x4f42f8f3  // bfdot v19.4s, v7.8h, v2.h[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      ".inst 0x4f60f8c8  // bfdot v8.4s, v6.8h, v0.h[3]\n"
+      ".inst 0x4f61f8cc  // bfdot v12.4s, v6.8h, v1.h[3]\n"
+      ".inst 0x4f62f8d0  // bfdot v16.4s, v6.8h, v2.h[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      ".inst 0x4f60f8e9  // bfdot v9.4s, v7.8h, v0.h[3]\n"
+      ".inst 0x4f61f8ed  // bfdot v13.4s, v7.8h, v1.h[3]\n"
+      ".inst 0x4f62f8f1  // bfdot v17.4s, v7.8h, v2.h[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "add x15, x15, #0x100\n"
+      ".inst 0x4f60f8ca  // bfdot v10.4s, v6.8h, v0.h[3]\n"
+      ".inst 0x4f61f8ce  // bfdot v14.4s, v6.8h, v1.h[3]\n"
+      ".inst 0x4f62f8d2  // bfdot v18.4s, v6.8h, v2.h[3]\n"
+      ".inst 0x4f60f8eb  // bfdot v11.4s, v7.8h, v0.h[3]\n"
+      ".inst 0x4f61f8ef  // bfdot v15.4s, v7.8h, v1.h[3]\n"
+      ".inst 0x4f62f8f3  // bfdot v19.4s, v7.8h, v2.h[3]\n"
+      "95:"  // Height 3: Multiply loop: Main loop skip
+      "cbz x11, 100f\n"
+      "cmp x11, #0x2\n"
+      "blt 97f\n"
+      "96:"  // Height 3: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
+      "ldr q6, [x15, #0x0]\n"
+      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
+      "sub x11, x11, #0x2\n"
+      ".inst 0x4f42f0d0  // bfdot v16.4s, v6.8h, v2.h[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "cmp x11, #0x2\n"
+      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f42f0f1  // bfdot v17.4s, v7.8h, v2.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
+      "add x15, x15, #0x40\n"
+      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
+      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
+      "bge 96b\n"
+      "cbz x11, 100f\n"
+      "97:"  // Height 3: Multiply loop: Skip odd blocks
+      "tbz x11, #1, 98f\n"
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
+      "tbz x11, #0, 99f\n"
+      "ld1 { v0.h }[2], [x10]\n"
+      "ld1 { v1.h }[2], [x28]\n"
+      "ld1 { v2.h }[2], [x26]\n"
+      "b 99f\n"
+      "98:"  // Height 3: Multiply loop: Ragged operand read: partial_1_0
+      "ldr h0, [x10, #0x0]\n"
+      "ldr h1, [x28, #0x0]\n"
+      "ldr h2, [x26, #0x0]\n"
+      "99:"  // Height 3: Multiply loop: Ragged operand read: Done
+      "ldr q6, [x15, #0x0]\n"
+      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f42f0d0  // bfdot v16.4s, v6.8h, v2.h[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f42f0f1  // bfdot v17.4s, v7.8h, v2.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
+      "add x15, x15, #0x40\n"
+      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
+      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
+      "100:"  // Height 3: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 90b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "tbz %x[flags], #1, 101f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v1.4s }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v0.4s }, [x19]\n"
+      "fmin v8.4s, v8.4s, v0.4s\n"
+      "fmin v9.4s, v9.4s, v0.4s\n"
+      "fmin v10.4s, v10.4s, v0.4s\n"
+      "fmin v11.4s, v11.4s, v0.4s\n"
+      "fmax v8.4s, v8.4s, v1.4s\n"
+      "fmax v9.4s, v9.4s, v1.4s\n"
+      "fmax v10.4s, v10.4s, v1.4s\n"
+      "fmax v11.4s, v11.4s, v1.4s\n"
+      "fmin v12.4s, v12.4s, v0.4s\n"
+      "fmin v13.4s, v13.4s, v0.4s\n"
+      "fmin v14.4s, v14.4s, v0.4s\n"
+      "fmax v12.4s, v12.4s, v1.4s\n"
+      "fmax v13.4s, v13.4s, v1.4s\n"
+      "fmax v14.4s, v14.4s, v1.4s\n"
+      "fmin v15.4s, v15.4s, v0.4s\n"
+      "fmin v16.4s, v16.4s, v0.4s\n"
+      "fmin v17.4s, v17.4s, v0.4s\n"
+      "fmax v15.4s, v15.4s, v1.4s\n"
+      "fmax v16.4s, v16.4s, v1.4s\n"
+      "fmax v17.4s, v17.4s, v1.4s\n"
+      "fmin v18.4s, v18.4s, v0.4s\n"
+      "fmin v19.4s, v19.4s, v0.4s\n"
+      "fmax v18.4s, v18.4s, v1.4s\n"
+      "fmax v19.4s, v19.4s, v1.4s\n"
+      "101:"  // Height 3: No activation
+      "cmp x16, #0x10\n"
+      "bge 110f\n"
+      "tbz x16, #3, 105f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v9.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v13.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "st1 { v17.4s }, [x27], #0x10\n"
+      "tbz x16, #2, 103f\n"
+      "st1 { v10.4s }, [x13], #0x10\n"
+      "st1 { v14.4s }, [x9], #0x10\n"
+      "st1 { v18.4s }, [x27], #0x10\n"
+      "tbz x16, #1, 102f\n"
+      "str d11, [x13], #0x8\n"
+      "str d15, [x9], #0x8\n"
+      "str d19, [x27], #0x8\n"
+      "tbz x16, #0, 109f\n"
+      "st1 { v11.s }[2], [x13]\n"
+      "st1 { v15.s }[2], [x9]\n"
+      "st1 { v19.s }[2], [x27]\n"
+      "b 109f\n"
+      "102:"  // Height 3: Partial direct writeback: partial_1_12
+      "tbz x16, #0, 109f\n"
+      "str s11, [x13, #0x0]\n"
+      "str s15, [x9, #0x0]\n"
+      "str s19, [x27, #0x0]\n"
+      "b 109f\n"
+      "103:"  // Height 3: Partial direct writeback: partial_2_8
+      "tbz x16, #1, 104f\n"
+      "str d10, [x13], #0x8\n"
+      "str d14, [x9], #0x8\n"
+      "str d18, [x27], #0x8\n"
+      "tbz x16, #0, 109f\n"
+      "st1 { v10.s }[2], [x13]\n"
+      "st1 { v14.s }[2], [x9]\n"
+      "st1 { v18.s }[2], [x27]\n"
+      "b 109f\n"
+      "104:"  // Height 3: Partial direct writeback: partial_1_8
+      "tbz x16, #0, 109f\n"
+      "str s10, [x13, #0x0]\n"
+      "str s14, [x9, #0x0]\n"
+      "str s18, [x27, #0x0]\n"
+      "b 109f\n"
+      "105:"  // Height 3: Partial direct writeback: partial_4_0
+      "tbz x16, #2, 107f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "tbz x16, #1, 106f\n"
+      "str d9, [x13], #0x8\n"
+      "str d13, [x9], #0x8\n"
+      "str d17, [x27], #0x8\n"
+      "tbz x16, #0, 109f\n"
+      "st1 { v9.s }[2], [x13]\n"
+      "st1 { v13.s }[2], [x9]\n"
+      "st1 { v17.s }[2], [x27]\n"
+      "b 109f\n"
+      "106:"  // Height 3: Partial direct writeback: partial_1_4
+      "tbz x16, #0, 109f\n"
+      "str s9, [x13, #0x0]\n"
+      "str s13, [x9, #0x0]\n"
+      "str s17, [x27, #0x0]\n"
+      "b 109f\n"
+      "107:"  // Height 3: Partial direct writeback: partial_2_0
+      "tbz x16, #1, 108f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "str d16, [x27], #0x8\n"
+      "tbz x16, #0, 109f\n"
+      "st1 { v8.s }[2], [x13]\n"
+      "st1 { v12.s }[2], [x9]\n"
+      "st1 { v16.s }[2], [x27]\n"
+      "b 109f\n"
+      "108:"  // Height 3: Partial direct writeback: partial_1_0
+      "str s8, [x13, #0x0]\n"
+      "str s12, [x9, #0x0]\n"
+      "str s16, [x27, #0x0]\n"
+      "109:"  // Height 3: Partial direct writeback: Done
+      "b 111f\n"
+      "110:"  // Height 3: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "str q12, [x9, #0x0]\n"
+      "str q13, [x9, #0x10]\n"
+      "str q14, [x9, #0x20]\n"
+      "str q15, [x9, #0x30]\n"
+      "str q16, [x27, #0x0]\n"
+      "str q17, [x27, #0x10]\n"
+      "str q18, [x27, #0x20]\n"
+      "str q19, [x27, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "add x9, x9, #0x40\n"
+      "add x27, x27, #0x40\n"
+      "111:"  // Height 3: Writeback done
+      "subs x16, x16, #0x10\n"
+      "bgt 77b\n"
+      "b 224f\n"
+      "112:"  // Height 4
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 113f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "add x27, x27, x19, LSL #2\n"
+      "add x25, x25, x19, LSL #2\n"
+      "b 114f\n"
+      "113:"  // Height 4: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "114:"  // Height 4: Column loop
+      "cbz x14, 115f\n"
+      "ldr q8, [x14, #0x0]\n"
+      "mov v12.16b, v8.16b\n"
+      "ldr q9, [x14, #0x10]\n"
+      "mov v16.16b, v8.16b\n"
+      "ldr q10, [x14, #0x20]\n"
+      "mov v20.16b, v8.16b\n"
+      "ldr q11, [x14, #0x30]\n"
+      "add x14, x14, #0x40\n"
+      "mov v13.16b, v9.16b\n"
+      "mov v17.16b, v9.16b\n"
+      "mov v14.16b, v10.16b\n"
+      "mov v15.16b, v11.16b\n"
+      "mov v18.16b, v10.16b\n"
+      "mov v19.16b, v11.16b\n"
+      "mov v21.16b, v9.16b\n"
+      "mov v22.16b, v10.16b\n"
+      "mov v23.16b, v11.16b\n"
+      "b 126f\n"
+      "115:"  // Height 4: no bias
+      "tbz %x[flags], #0, 125f\n"
+      "cmp x16, #0x10\n"
+      "bge 124f\n"
+      "tbz x16, #3, 119f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "ld1 { v20.4s }, [x25], #0x10\n"
+      "ld1 { v9.4s }, [x13], #0x10\n"
+      "ld1 { v13.4s }, [x9], #0x10\n"
+      "ld1 { v17.4s }, [x27], #0x10\n"
+      "ld1 { v21.4s }, [x25], #0x10\n"
+      "tbz x16, #2, 117f\n"
+      "ld1 { v10.4s }, [x13], #0x10\n"
+      "ld1 { v14.4s }, [x9], #0x10\n"
+      "ld1 { v18.4s }, [x27], #0x10\n"
+      "ld1 { v22.4s }, [x25], #0x10\n"
+      "tbz x16, #1, 116f\n"
+      "mov x19, #0x38\n"
+      "ldr d11, [x13], #0x8\n"
+      "ldr d15, [x9], #0x8\n"
+      "ldr d19, [x27], #0x8\n"
+      "ldr d23, [x25], #0x8\n"
+      "tbz x16, #0, 123f\n"
+      "ld1 { v11.s }[2], [x13]\n"
+      "ld1 { v15.s }[2], [x9]\n"
+      "ld1 { v19.s }[2], [x27]\n"
+      "ld1 { v23.s }[2], [x25]\n"
+      "b 123f\n"
+      "116:"  // Height 4: Partial accumulate: partial_1_12
+      "mov x19, #0x30\n"
+      "tbz x16, #0, 123f\n"
+      "ldr s11, [x13, #0x0]\n"
+      "ldr s15, [x9, #0x0]\n"
+      "ldr s19, [x27, #0x0]\n"
+      "ldr s23, [x25, #0x0]\n"
+      "b 123f\n"
+      "117:"  // Height 4: Partial accumulate: partial_2_8
+      "tbz x16, #1, 118f\n"
+      "ldr d10, [x13], #0x8\n"
+      "ldr d14, [x9], #0x8\n"
+      "ldr d18, [x27], #0x8\n"
+      "ldr d22, [x25], #0x8\n"
+      "mov x19, #0x28\n"
+      "tbz x16, #0, 123f\n"
+      "ld1 { v10.s }[2], [x13]\n"
+      "ld1 { v14.s }[2], [x9]\n"
+      "ld1 { v18.s }[2], [x27]\n"
+      "ld1 { v22.s }[2], [x25]\n"
+      "b 123f\n"
+      "118:"  // Height 4: Partial accumulate: partial_1_8
+      "mov x19, #0x20\n"
+      "tbz x16, #0, 123f\n"
+      "ldr s10, [x13, #0x0]\n"
+      "ldr s14, [x9, #0x0]\n"
+      "ldr s18, [x27, #0x0]\n"
+      "ldr s22, [x25, #0x0]\n"
+      "b 123f\n"
+      "119:"  // Height 4: Partial accumulate: partial_4_0
+      "tbz x16, #2, 121f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "ld1 { v20.4s }, [x25], #0x10\n"
+      "tbz x16, #1, 120f\n"
+      "mov x19, #0x18\n"
+      "ldr d9, [x13], #0x8\n"
+      "ldr d13, [x9], #0x8\n"
+      "ldr d17, [x27], #0x8\n"
+      "ldr d21, [x25], #0x8\n"
+      "tbz x16, #0, 123f\n"
+      "ld1 { v9.s }[2], [x13]\n"
+      "ld1 { v13.s }[2], [x9]\n"
+      "ld1 { v17.s }[2], [x27]\n"
+      "ld1 { v21.s }[2], [x25]\n"
+      "b 123f\n"
+      "120:"  // Height 4: Partial accumulate: partial_1_4
+      "mov x19, #0x10\n"
+      "tbz x16, #0, 123f\n"
+      "ldr s9, [x13, #0x0]\n"
+      "ldr s13, [x9, #0x0]\n"
+      "ldr s17, [x27, #0x0]\n"
+      "ldr s21, [x25, #0x0]\n"
+      "b 123f\n"
+      "121:"  // Height 4: Partial accumulate: partial_2_0
+      "tbz x16, #1, 122f\n"
+      "ldr d8, [x13], #0x8\n"
+      "ldr d12, [x9], #0x8\n"
+      "ldr d16, [x27], #0x8\n"
+      "ldr d20, [x25], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x16, #0, 123f\n"
+      "ld1 { v8.s }[2], [x13]\n"
+      "ld1 { v12.s }[2], [x9]\n"
+      "ld1 { v16.s }[2], [x27]\n"
+      "ld1 { v20.s }[2], [x25]\n"
+      "b 123f\n"
+      "122:"  // Height 4: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s8, [x13, #0x0]\n"
+      "ldr s12, [x9, #0x0]\n"
+      "ldr s16, [x27, #0x0]\n"
+      "ldr s20, [x25, #0x0]\n"
+      "123:"  // Height 4: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "sub x9, x9, x19\n"
+      "sub x27, x27, x19\n"
+      "sub x25, x25, x19\n"
+      "b 126f\n"
+      "124:"  // Height 4: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "ldr q12, [x9, #0x0]\n"
+      "ldr q13, [x9, #0x10]\n"
+      "ldr q14, [x9, #0x20]\n"
+      "ldr q15, [x9, #0x30]\n"
+      "ldr q16, [x27, #0x0]\n"
+      "ldr q17, [x27, #0x10]\n"
+      "ldr q18, [x27, #0x20]\n"
+      "ldr q19, [x27, #0x30]\n"
+      "ldr q20, [x25, #0x0]\n"
+      "ldr q21, [x25, #0x10]\n"
+      "ldr q22, [x25, #0x20]\n"
+      "ldr q23, [x25, #0x30]\n"
+      "b 126f\n"
+      "125:"  // Height 4: no accumulate
+      "movi v8.16b, #0x0\n"
+      "movi v9.16b, #0x0\n"
+      "movi v10.16b, #0x0\n"
+      "movi v11.16b, #0x0\n"
+      "movi v12.16b, #0x0\n"
+      "movi v13.16b, #0x0\n"
+      "movi v14.16b, #0x0\n"
+      "movi v15.16b, #0x0\n"
+      "movi v16.16b, #0x0\n"
+      "movi v17.16b, #0x0\n"
+      "movi v18.16b, #0x0\n"
+      "movi v19.16b, #0x0\n"
+      "movi v20.16b, #0x0\n"
+      "movi v21.16b, #0x0\n"
+      "movi v22.16b, #0x0\n"
+      "movi v23.16b, #0x0\n"
+      "126:"  // Height 4: setup done
+      "mov x12, #0x0\n"
+      "127:"  // Height 4: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 128f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "cbnz x12, 129f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #1\n"
+      "add x28, x28, x19, LSL #1\n"
+      "add x26, x26, x19, LSL #1\n"
+      "add x24, x24, x19, LSL #1\n"
+      "b 129f\n"
+      "128:"  // Height 4: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #1\n"
+      "add x26, x28, x19, LSL #1\n"
+      "add x24, x26, x19, LSL #1\n"
+      "129:"  // Height 4: input setup done
+      "cmp x11, #0x8\n"
+      "blt 132f\n"
+      "cmp x11, #0x10\n"
+      "blt 131f\n"
+      "130:"  // Height 4: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4f42f0d0  // bfdot v16.4s, v6.8h, v2.h[0]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x4f43f0d4  // bfdot v20.4s, v6.8h, v3.h[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr q6, [x15, #0x20]\n"
+      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
+      "add x26, x26, #0x10\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x4f42f0f1  // bfdot v17.4s, v7.8h, v2.h[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "sub x11, x11, #0x8\n"
+      ".inst 0x4f43f0f5  // bfdot v21.4s, v7.8h, v3.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
+      "cmp x11, #0x10\n"
+      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
+      ".inst 0x4f43f0d6  // bfdot v22.4s, v6.8h, v3.h[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
+      ".inst 0x4f43f0f7  // bfdot v23.4s, v7.8h, v3.h[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      ".inst 0x4f60f0c8  // bfdot v8.4s, v6.8h, v0.h[1]\n"
+      ".inst 0x4f61f0cc  // bfdot v12.4s, v6.8h, v1.h[1]\n"
+      ".inst 0x4f62f0d0  // bfdot v16.4s, v6.8h, v2.h[1]\n"
+      ".inst 0x4f63f0d4  // bfdot v20.4s, v6.8h, v3.h[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      ".inst 0x4f60f0e9  // bfdot v9.4s, v7.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ed  // bfdot v13.4s, v7.8h, v1.h[1]\n"
+      ".inst 0x4f62f0f1  // bfdot v17.4s, v7.8h, v2.h[1]\n"
+      ".inst 0x4f63f0f5  // bfdot v21.4s, v7.8h, v3.h[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      ".inst 0x4f60f0ca  // bfdot v10.4s, v6.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ce  // bfdot v14.4s, v6.8h, v1.h[1]\n"
+      ".inst 0x4f62f0d2  // bfdot v18.4s, v6.8h, v2.h[1]\n"
+      ".inst 0x4f63f0d6  // bfdot v22.4s, v6.8h, v3.h[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      ".inst 0x4f60f0eb  // bfdot v11.4s, v7.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ef  // bfdot v15.4s, v7.8h, v1.h[1]\n"
+      ".inst 0x4f62f0f3  // bfdot v19.4s, v7.8h, v2.h[1]\n"
+      ".inst 0x4f63f0f7  // bfdot v23.4s, v7.8h, v3.h[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      ".inst 0x4f40f8c8  // bfdot v8.4s, v6.8h, v0.h[2]\n"
+      ".inst 0x4f41f8cc  // bfdot v12.4s, v6.8h, v1.h[2]\n"
+      ".inst 0x4f42f8d0  // bfdot v16.4s, v6.8h, v2.h[2]\n"
+      ".inst 0x4f43f8d4  // bfdot v20.4s, v6.8h, v3.h[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      ".inst 0x4f40f8e9  // bfdot v9.4s, v7.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ed  // bfdot v13.4s, v7.8h, v1.h[2]\n"
+      ".inst 0x4f42f8f1  // bfdot v17.4s, v7.8h, v2.h[2]\n"
+      ".inst 0x4f43f8f5  // bfdot v21.4s, v7.8h, v3.h[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      ".inst 0x4f40f8ca  // bfdot v10.4s, v6.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ce  // bfdot v14.4s, v6.8h, v1.h[2]\n"
+      ".inst 0x4f42f8d2  // bfdot v18.4s, v6.8h, v2.h[2]\n"
+      ".inst 0x4f43f8d6  // bfdot v22.4s, v6.8h, v3.h[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      ".inst 0x4f40f8eb  // bfdot v11.4s, v7.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ef  // bfdot v15.4s, v7.8h, v1.h[2]\n"
+      ".inst 0x4f42f8f3  // bfdot v19.4s, v7.8h, v2.h[2]\n"
+      ".inst 0x4f43f8f7  // bfdot v23.4s, v7.8h, v3.h[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      ".inst 0x4f60f8c8  // bfdot v8.4s, v6.8h, v0.h[3]\n"
+      ".inst 0x4f61f8cc  // bfdot v12.4s, v6.8h, v1.h[3]\n"
+      ".inst 0x4f62f8d0  // bfdot v16.4s, v6.8h, v2.h[3]\n"
+      ".inst 0x4f63f8d4  // bfdot v20.4s, v6.8h, v3.h[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      ".inst 0x4f60f8e9  // bfdot v9.4s, v7.8h, v0.h[3]\n"
+      ".inst 0x4f61f8ed  // bfdot v13.4s, v7.8h, v1.h[3]\n"
+      ".inst 0x4f62f8f1  // bfdot v17.4s, v7.8h, v2.h[3]\n"
+      ".inst 0x4f63f8f5  // bfdot v21.4s, v7.8h, v3.h[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "add x15, x15, #0x100\n"
+      ".inst 0x4f60f8ca  // bfdot v10.4s, v6.8h, v0.h[3]\n"
+      ".inst 0x4f61f8ce  // bfdot v14.4s, v6.8h, v1.h[3]\n"
+      ".inst 0x4f62f8d2  // bfdot v18.4s, v6.8h, v2.h[3]\n"
+      ".inst 0x4f63f8d6  // bfdot v22.4s, v6.8h, v3.h[3]\n"
+      ".inst 0x4f60f8eb  // bfdot v11.4s, v7.8h, v0.h[3]\n"
+      ".inst 0x4f61f8ef  // bfdot v15.4s, v7.8h, v1.h[3]\n"
+      ".inst 0x4f62f8f3  // bfdot v19.4s, v7.8h, v2.h[3]\n"
+      ".inst 0x4f63f8f7  // bfdot v23.4s, v7.8h, v3.h[3]\n"
+      "bge 130b\n"
+      "131:"  // Height 4: Multiply loop: Single iteration only
+      "sub x11, x11, #0x8\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4f42f0d0  // bfdot v16.4s, v6.8h, v2.h[0]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x4f43f0d4  // bfdot v20.4s, v6.8h, v3.h[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr q6, [x15, #0x20]\n"
+      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
+      "add x26, x26, #0x10\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x4f42f0f1  // bfdot v17.4s, v7.8h, v2.h[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      ".inst 0x4f43f0f5  // bfdot v21.4s, v7.8h, v3.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
+      ".inst 0x4f43f0d6  // bfdot v22.4s, v6.8h, v3.h[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
+      ".inst 0x4f43f0f7  // bfdot v23.4s, v7.8h, v3.h[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      ".inst 0x4f60f0c8  // bfdot v8.4s, v6.8h, v0.h[1]\n"
+      ".inst 0x4f61f0cc  // bfdot v12.4s, v6.8h, v1.h[1]\n"
+      ".inst 0x4f62f0d0  // bfdot v16.4s, v6.8h, v2.h[1]\n"
+      ".inst 0x4f63f0d4  // bfdot v20.4s, v6.8h, v3.h[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      ".inst 0x4f60f0e9  // bfdot v9.4s, v7.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ed  // bfdot v13.4s, v7.8h, v1.h[1]\n"
+      ".inst 0x4f62f0f1  // bfdot v17.4s, v7.8h, v2.h[1]\n"
+      ".inst 0x4f63f0f5  // bfdot v21.4s, v7.8h, v3.h[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      ".inst 0x4f60f0ca  // bfdot v10.4s, v6.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ce  // bfdot v14.4s, v6.8h, v1.h[1]\n"
+      ".inst 0x4f62f0d2  // bfdot v18.4s, v6.8h, v2.h[1]\n"
+      ".inst 0x4f63f0d6  // bfdot v22.4s, v6.8h, v3.h[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      ".inst 0x4f60f0eb  // bfdot v11.4s, v7.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ef  // bfdot v15.4s, v7.8h, v1.h[1]\n"
+      ".inst 0x4f62f0f3  // bfdot v19.4s, v7.8h, v2.h[1]\n"
+      ".inst 0x4f63f0f7  // bfdot v23.4s, v7.8h, v3.h[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      ".inst 0x4f40f8c8  // bfdot v8.4s, v6.8h, v0.h[2]\n"
+      ".inst 0x4f41f8cc  // bfdot v12.4s, v6.8h, v1.h[2]\n"
+      ".inst 0x4f42f8d0  // bfdot v16.4s, v6.8h, v2.h[2]\n"
+      ".inst 0x4f43f8d4  // bfdot v20.4s, v6.8h, v3.h[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      ".inst 0x4f40f8e9  // bfdot v9.4s, v7.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ed  // bfdot v13.4s, v7.8h, v1.h[2]\n"
+      ".inst 0x4f42f8f1  // bfdot v17.4s, v7.8h, v2.h[2]\n"
+      ".inst 0x4f43f8f5  // bfdot v21.4s, v7.8h, v3.h[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      ".inst 0x4f40f8ca  // bfdot v10.4s, v6.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ce  // bfdot v14.4s, v6.8h, v1.h[2]\n"
+      ".inst 0x4f42f8d2  // bfdot v18.4s, v6.8h, v2.h[2]\n"
+      ".inst 0x4f43f8d6  // bfdot v22.4s, v6.8h, v3.h[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      ".inst 0x4f40f8eb  // bfdot v11.4s, v7.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ef  // bfdot v15.4s, v7.8h, v1.h[2]\n"
+      ".inst 0x4f42f8f3  // bfdot v19.4s, v7.8h, v2.h[2]\n"
+      ".inst 0x4f43f8f7  // bfdot v23.4s, v7.8h, v3.h[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      ".inst 0x4f60f8c8  // bfdot v8.4s, v6.8h, v0.h[3]\n"
+      ".inst 0x4f61f8cc  // bfdot v12.4s, v6.8h, v1.h[3]\n"
+      ".inst 0x4f62f8d0  // bfdot v16.4s, v6.8h, v2.h[3]\n"
+      ".inst 0x4f63f8d4  // bfdot v20.4s, v6.8h, v3.h[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      ".inst 0x4f60f8e9  // bfdot v9.4s, v7.8h, v0.h[3]\n"
+      ".inst 0x4f61f8ed  // bfdot v13.4s, v7.8h, v1.h[3]\n"
+      ".inst 0x4f62f8f1  // bfdot v17.4s, v7.8h, v2.h[3]\n"
+      ".inst 0x4f63f8f5  // bfdot v21.4s, v7.8h, v3.h[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "add x15, x15, #0x100\n"
+      ".inst 0x4f60f8ca  // bfdot v10.4s, v6.8h, v0.h[3]\n"
+      ".inst 0x4f61f8ce  // bfdot v14.4s, v6.8h, v1.h[3]\n"
+      ".inst 0x4f62f8d2  // bfdot v18.4s, v6.8h, v2.h[3]\n"
+      ".inst 0x4f63f8d6  // bfdot v22.4s, v6.8h, v3.h[3]\n"
+      ".inst 0x4f60f8eb  // bfdot v11.4s, v7.8h, v0.h[3]\n"
+      ".inst 0x4f61f8ef  // bfdot v15.4s, v7.8h, v1.h[3]\n"
+      ".inst 0x4f62f8f3  // bfdot v19.4s, v7.8h, v2.h[3]\n"
+      ".inst 0x4f63f8f7  // bfdot v23.4s, v7.8h, v3.h[3]\n"
+      "132:"  // Height 4: Multiply loop: Main loop skip
+      "cbz x11, 137f\n"
+      "cmp x11, #0x2\n"
+      "blt 134f\n"
+      "133:"  // Height 4: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
+      "ldr s3, [x24], #0x4\n"
+      "ldr q6, [x15, #0x0]\n"
+      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
+      "sub x11, x11, #0x2\n"
+      ".inst 0x4f42f0d0  // bfdot v16.4s, v6.8h, v2.h[0]\n"
+      "cmp x11, #0x2\n"
+      ".inst 0x4f43f0d4  // bfdot v20.4s, v6.8h, v3.h[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f42f0f1  // bfdot v17.4s, v7.8h, v2.h[0]\n"
+      ".inst 0x4f43f0f5  // bfdot v21.4s, v7.8h, v3.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
+      "add x15, x15, #0x40\n"
+      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
+      ".inst 0x4f43f0d6  // bfdot v22.4s, v6.8h, v3.h[0]\n"
+      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
+      ".inst 0x4f43f0f7  // bfdot v23.4s, v7.8h, v3.h[0]\n"
+      "bge 133b\n"
+      "cbz x11, 137f\n"
+      "134:"  // Height 4: Multiply loop: Skip odd blocks
+      "tbz x11, #1, 135f\n"
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
+      "ldr s3, [x24], #0x4\n"
+      "tbz x11, #0, 136f\n"
+      "ld1 { v0.h }[2], [x10]\n"
+      "ld1 { v1.h }[2], [x28]\n"
+      "ld1 { v2.h }[2], [x26]\n"
+      "ld1 { v3.h }[2], [x24]\n"
+      "b 136f\n"
+      "135:"  // Height 4: Multiply loop: Ragged operand read: partial_1_0
+      "ldr h0, [x10, #0x0]\n"
+      "ldr h1, [x28, #0x0]\n"
+      "ldr h2, [x26, #0x0]\n"
+      "ldr h3, [x24, #0x0]\n"
+      "136:"  // Height 4: Multiply loop: Ragged operand read: Done
+      "ldr q6, [x15, #0x0]\n"
+      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f42f0d0  // bfdot v16.4s, v6.8h, v2.h[0]\n"
+      ".inst 0x4f43f0d4  // bfdot v20.4s, v6.8h, v3.h[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f42f0f1  // bfdot v17.4s, v7.8h, v2.h[0]\n"
+      ".inst 0x4f43f0f5  // bfdot v21.4s, v7.8h, v3.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
+      "add x15, x15, #0x40\n"
+      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
+      ".inst 0x4f43f0d6  // bfdot v22.4s, v6.8h, v3.h[0]\n"
+      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
+      ".inst 0x4f43f0f7  // bfdot v23.4s, v7.8h, v3.h[0]\n"
+      "137:"  // Height 4: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 127b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "tbz %x[flags], #1, 138f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v1.4s }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v0.4s }, [x19]\n"
+      "fmin v8.4s, v8.4s, v0.4s\n"
+      "fmin v9.4s, v9.4s, v0.4s\n"
+      "fmin v10.4s, v10.4s, v0.4s\n"
+      "fmin v11.4s, v11.4s, v0.4s\n"
+      "fmax v8.4s, v8.4s, v1.4s\n"
+      "fmax v9.4s, v9.4s, v1.4s\n"
+      "fmax v10.4s, v10.4s, v1.4s\n"
+      "fmax v11.4s, v11.4s, v1.4s\n"
+      "fmin v12.4s, v12.4s, v0.4s\n"
+      "fmin v13.4s, v13.4s, v0.4s\n"
+      "fmin v14.4s, v14.4s, v0.4s\n"
+      "fmax v12.4s, v12.4s, v1.4s\n"
+      "fmax v13.4s, v13.4s, v1.4s\n"
+      "fmax v14.4s, v14.4s, v1.4s\n"
+      "fmin v15.4s, v15.4s, v0.4s\n"
+      "fmin v16.4s, v16.4s, v0.4s\n"
+      "fmin v17.4s, v17.4s, v0.4s\n"
+      "fmax v15.4s, v15.4s, v1.4s\n"
+      "fmax v16.4s, v16.4s, v1.4s\n"
+      "fmax v17.4s, v17.4s, v1.4s\n"
+      "fmin v18.4s, v18.4s, v0.4s\n"
+      "fmin v19.4s, v19.4s, v0.4s\n"
+      "fmin v20.4s, v20.4s, v0.4s\n"
+      "fmax v18.4s, v18.4s, v1.4s\n"
+      "fmax v19.4s, v19.4s, v1.4s\n"
+      "fmax v20.4s, v20.4s, v1.4s\n"
+      "fmin v21.4s, v21.4s, v0.4s\n"
+      "fmin v22.4s, v22.4s, v0.4s\n"
+      "fmin v23.4s, v23.4s, v0.4s\n"
+      "fmax v21.4s, v21.4s, v1.4s\n"
+      "fmax v22.4s, v22.4s, v1.4s\n"
+      "fmax v23.4s, v23.4s, v1.4s\n"
+      "138:"  // Height 4: No activation
+      "cmp x16, #0x10\n"
+      "bge 147f\n"
+      "tbz x16, #3, 142f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v9.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v13.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "st1 { v17.4s }, [x27], #0x10\n"
+      "st1 { v20.4s }, [x25], #0x10\n"
+      "st1 { v21.4s }, [x25], #0x10\n"
+      "tbz x16, #2, 140f\n"
+      "st1 { v10.4s }, [x13], #0x10\n"
+      "st1 { v14.4s }, [x9], #0x10\n"
+      "st1 { v18.4s }, [x27], #0x10\n"
+      "st1 { v22.4s }, [x25], #0x10\n"
+      "tbz x16, #1, 139f\n"
+      "str d11, [x13], #0x8\n"
+      "str d15, [x9], #0x8\n"
+      "str d19, [x27], #0x8\n"
+      "str d23, [x25], #0x8\n"
+      "tbz x16, #0, 146f\n"
+      "st1 { v11.s }[2], [x13]\n"
+      "st1 { v15.s }[2], [x9]\n"
+      "st1 { v19.s }[2], [x27]\n"
+      "st1 { v23.s }[2], [x25]\n"
+      "b 146f\n"
+      "139:"  // Height 4: Partial direct writeback: partial_1_12
+      "tbz x16, #0, 146f\n"
+      "str s11, [x13, #0x0]\n"
+      "str s15, [x9, #0x0]\n"
+      "str s19, [x27, #0x0]\n"
+      "str s23, [x25, #0x0]\n"
+      "b 146f\n"
+      "140:"  // Height 4: Partial direct writeback: partial_2_8
+      "tbz x16, #1, 141f\n"
+      "str d10, [x13], #0x8\n"
+      "str d14, [x9], #0x8\n"
+      "str d18, [x27], #0x8\n"
+      "str d22, [x25], #0x8\n"
+      "tbz x16, #0, 146f\n"
+      "st1 { v10.s }[2], [x13]\n"
+      "st1 { v14.s }[2], [x9]\n"
+      "st1 { v18.s }[2], [x27]\n"
+      "st1 { v22.s }[2], [x25]\n"
+      "b 146f\n"
+      "141:"  // Height 4: Partial direct writeback: partial_1_8
+      "tbz x16, #0, 146f\n"
+      "str s10, [x13, #0x0]\n"
+      "str s14, [x9, #0x0]\n"
+      "str s18, [x27, #0x0]\n"
+      "str s22, [x25, #0x0]\n"
+      "b 146f\n"
+      "142:"  // Height 4: Partial direct writeback: partial_4_0
+      "tbz x16, #2, 144f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "st1 { v20.4s }, [x25], #0x10\n"
+      "tbz x16, #1, 143f\n"
+      "str d9, [x13], #0x8\n"
+      "str d13, [x9], #0x8\n"
+      "str d17, [x27], #0x8\n"
+      "str d21, [x25], #0x8\n"
+      "tbz x16, #0, 146f\n"
+      "st1 { v9.s }[2], [x13]\n"
+      "st1 { v13.s }[2], [x9]\n"
+      "st1 { v17.s }[2], [x27]\n"
+      "st1 { v21.s }[2], [x25]\n"
+      "b 146f\n"
+      "143:"  // Height 4: Partial direct writeback: partial_1_4
+      "tbz x16, #0, 146f\n"
+      "str s9, [x13, #0x0]\n"
+      "str s13, [x9, #0x0]\n"
+      "str s17, [x27, #0x0]\n"
+      "str s21, [x25, #0x0]\n"
+      "b 146f\n"
+      "144:"  // Height 4: Partial direct writeback: partial_2_0
+      "tbz x16, #1, 145f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "str d16, [x27], #0x8\n"
+      "str d20, [x25], #0x8\n"
+      "tbz x16, #0, 146f\n"
+      "st1 { v8.s }[2], [x13]\n"
+      "st1 { v12.s }[2], [x9]\n"
+      "st1 { v16.s }[2], [x27]\n"
+      "st1 { v20.s }[2], [x25]\n"
+      "b 146f\n"
+      "145:"  // Height 4: Partial direct writeback: partial_1_0
+      "str s8, [x13, #0x0]\n"
+      "str s12, [x9, #0x0]\n"
+      "str s16, [x27, #0x0]\n"
+      "str s20, [x25, #0x0]\n"
+      "146:"  // Height 4: Partial direct writeback: Done
+      "b 148f\n"
+      "147:"  // Height 4: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "str q12, [x9, #0x0]\n"
+      "str q13, [x9, #0x10]\n"
+      "str q14, [x9, #0x20]\n"
+      "str q15, [x9, #0x30]\n"
+      "str q16, [x27, #0x0]\n"
+      "str q17, [x27, #0x10]\n"
+      "str q18, [x27, #0x20]\n"
+      "str q19, [x27, #0x30]\n"
+      "str q20, [x25, #0x0]\n"
+      "str q21, [x25, #0x10]\n"
+      "str q22, [x25, #0x20]\n"
+      "str q23, [x25, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "add x9, x9, #0x40\n"
+      "add x27, x27, #0x40\n"
+      "add x25, x25, #0x40\n"
+      "148:"  // Height 4: Writeback done
+      "subs x16, x16, #0x10\n"
+      "bgt 114b\n"
+      "b 224f\n"
+      "149:"  // Height 5
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 150f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "ldr x23, [%x[output_ptr], #0x20]\n"
+      "add x27, x27, x19, LSL #2\n"
+      "add x25, x25, x19, LSL #2\n"
+      "add x23, x23, x19, LSL #2\n"
+      "b 151f\n"
+      "150:"  // Height 5: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "add x23, x25, x19, LSL #2\n"
+      "151:"  // Height 5: Column loop
+      "cbz x14, 152f\n"
+      "ldr q8, [x14, #0x0]\n"
+      "mov v12.16b, v8.16b\n"
+      "ldr q9, [x14, #0x10]\n"
+      "mov v16.16b, v8.16b\n"
+      "ldr q10, [x14, #0x20]\n"
+      "mov v20.16b, v8.16b\n"
+      "ldr q11, [x14, #0x30]\n"
+      "mov v24.16b, v8.16b\n"
+      "add x14, x14, #0x40\n"
+      "mov v13.16b, v9.16b\n"
+      "mov v17.16b, v9.16b\n"
+      "mov v14.16b, v10.16b\n"
+      "mov v15.16b, v11.16b\n"
+      "mov v18.16b, v10.16b\n"
+      "mov v19.16b, v11.16b\n"
+      "mov v21.16b, v9.16b\n"
+      "mov v22.16b, v10.16b\n"
+      "mov v23.16b, v11.16b\n"
+      "mov v25.16b, v9.16b\n"
+      "mov v26.16b, v10.16b\n"
+      "mov v27.16b, v11.16b\n"
+      "b 163f\n"
+      "152:"  // Height 5: no bias
+      "tbz %x[flags], #0, 162f\n"
+      "cmp x16, #0x10\n"
+      "bge 161f\n"
+      "tbz x16, #3, 156f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "ld1 { v20.4s }, [x25], #0x10\n"
+      "ld1 { v24.4s }, [x23], #0x10\n"
+      "ld1 { v9.4s }, [x13], #0x10\n"
+      "ld1 { v13.4s }, [x9], #0x10\n"
+      "ld1 { v17.4s }, [x27], #0x10\n"
+      "ld1 { v21.4s }, [x25], #0x10\n"
+      "ld1 { v25.4s }, [x23], #0x10\n"
+      "tbz x16, #2, 154f\n"
+      "ld1 { v10.4s }, [x13], #0x10\n"
+      "ld1 { v14.4s }, [x9], #0x10\n"
+      "ld1 { v18.4s }, [x27], #0x10\n"
+      "ld1 { v22.4s }, [x25], #0x10\n"
+      "ld1 { v26.4s }, [x23], #0x10\n"
+      "tbz x16, #1, 153f\n"
+      "mov x19, #0x38\n"
+      "ldr d11, [x13], #0x8\n"
+      "ldr d15, [x9], #0x8\n"
+      "ldr d19, [x27], #0x8\n"
+      "ldr d23, [x25], #0x8\n"
+      "ldr d27, [x23], #0x8\n"
+      "tbz x16, #0, 160f\n"
+      "ld1 { v11.s }[2], [x13]\n"
+      "ld1 { v15.s }[2], [x9]\n"
+      "ld1 { v19.s }[2], [x27]\n"
+      "ld1 { v23.s }[2], [x25]\n"
+      "ld1 { v27.s }[2], [x23]\n"
+      "b 160f\n"
+      "153:"  // Height 5: Partial accumulate: partial_1_12
+      "mov x19, #0x30\n"
+      "tbz x16, #0, 160f\n"
+      "ldr s11, [x13, #0x0]\n"
+      "ldr s15, [x9, #0x0]\n"
+      "ldr s19, [x27, #0x0]\n"
+      "ldr s23, [x25, #0x0]\n"
+      "ldr s27, [x23, #0x0]\n"
+      "b 160f\n"
+      "154:"  // Height 5: Partial accumulate: partial_2_8
+      "tbz x16, #1, 155f\n"
+      "ldr d10, [x13], #0x8\n"
+      "ldr d14, [x9], #0x8\n"
+      "ldr d18, [x27], #0x8\n"
+      "ldr d22, [x25], #0x8\n"
+      "ldr d26, [x23], #0x8\n"
+      "mov x19, #0x28\n"
+      "tbz x16, #0, 160f\n"
+      "ld1 { v10.s }[2], [x13]\n"
+      "ld1 { v14.s }[2], [x9]\n"
+      "ld1 { v18.s }[2], [x27]\n"
+      "ld1 { v22.s }[2], [x25]\n"
+      "ld1 { v26.s }[2], [x23]\n"
+      "b 160f\n"
+      "155:"  // Height 5: Partial accumulate: partial_1_8
+      "mov x19, #0x20\n"
+      "tbz x16, #0, 160f\n"
+      "ldr s10, [x13, #0x0]\n"
+      "ldr s14, [x9, #0x0]\n"
+      "ldr s18, [x27, #0x0]\n"
+      "ldr s22, [x25, #0x0]\n"
+      "ldr s26, [x23, #0x0]\n"
+      "b 160f\n"
+      "156:"  // Height 5: Partial accumulate: partial_4_0
+      "tbz x16, #2, 158f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "ld1 { v20.4s }, [x25], #0x10\n"
+      "ld1 { v24.4s }, [x23], #0x10\n"
+      "tbz x16, #1, 157f\n"
+      "mov x19, #0x18\n"
+      "ldr d9, [x13], #0x8\n"
+      "ldr d13, [x9], #0x8\n"
+      "ldr d17, [x27], #0x8\n"
+      "ldr d21, [x25], #0x8\n"
+      "ldr d25, [x23], #0x8\n"
+      "tbz x16, #0, 160f\n"
+      "ld1 { v9.s }[2], [x13]\n"
+      "ld1 { v13.s }[2], [x9]\n"
+      "ld1 { v17.s }[2], [x27]\n"
+      "ld1 { v21.s }[2], [x25]\n"
+      "ld1 { v25.s }[2], [x23]\n"
+      "b 160f\n"
+      "157:"  // Height 5: Partial accumulate: partial_1_4
+      "mov x19, #0x10\n"
+      "tbz x16, #0, 160f\n"
+      "ldr s9, [x13, #0x0]\n"
+      "ldr s13, [x9, #0x0]\n"
+      "ldr s17, [x27, #0x0]\n"
+      "ldr s21, [x25, #0x0]\n"
+      "ldr s25, [x23, #0x0]\n"
+      "b 160f\n"
+      "158:"  // Height 5: Partial accumulate: partial_2_0
+      "tbz x16, #1, 159f\n"
+      "ldr d8, [x13], #0x8\n"
+      "ldr d12, [x9], #0x8\n"
+      "ldr d16, [x27], #0x8\n"
+      "ldr d20, [x25], #0x8\n"
+      "ldr d24, [x23], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x16, #0, 160f\n"
+      "ld1 { v8.s }[2], [x13]\n"
+      "ld1 { v12.s }[2], [x9]\n"
+      "ld1 { v16.s }[2], [x27]\n"
+      "ld1 { v20.s }[2], [x25]\n"
+      "ld1 { v24.s }[2], [x23]\n"
+      "b 160f\n"
+      "159:"  // Height 5: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s8, [x13, #0x0]\n"
+      "ldr s12, [x9, #0x0]\n"
+      "ldr s16, [x27, #0x0]\n"
+      "ldr s20, [x25, #0x0]\n"
+      "ldr s24, [x23, #0x0]\n"
+      "160:"  // Height 5: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "sub x9, x9, x19\n"
+      "sub x27, x27, x19\n"
+      "sub x25, x25, x19\n"
+      "sub x23, x23, x19\n"
+      "b 163f\n"
+      "161:"  // Height 5: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "ldr q12, [x9, #0x0]\n"
+      "ldr q13, [x9, #0x10]\n"
+      "ldr q14, [x9, #0x20]\n"
+      "ldr q15, [x9, #0x30]\n"
+      "ldr q16, [x27, #0x0]\n"
+      "ldr q17, [x27, #0x10]\n"
+      "ldr q18, [x27, #0x20]\n"
+      "ldr q19, [x27, #0x30]\n"
+      "ldr q20, [x25, #0x0]\n"
+      "ldr q21, [x25, #0x10]\n"
+      "ldr q22, [x25, #0x20]\n"
+      "ldr q23, [x25, #0x30]\n"
+      "ldr q24, [x23, #0x0]\n"
+      "ldr q25, [x23, #0x10]\n"
+      "ldr q26, [x23, #0x20]\n"
+      "ldr q27, [x23, #0x30]\n"
+      "b 163f\n"
+      "162:"  // Height 5: no accumulate
+      "movi v8.16b, #0x0\n"
+      "movi v9.16b, #0x0\n"
+      "movi v10.16b, #0x0\n"
+      "movi v11.16b, #0x0\n"
+      "movi v12.16b, #0x0\n"
+      "movi v13.16b, #0x0\n"
+      "movi v14.16b, #0x0\n"
+      "movi v15.16b, #0x0\n"
+      "movi v16.16b, #0x0\n"
+      "movi v17.16b, #0x0\n"
+      "movi v18.16b, #0x0\n"
+      "movi v19.16b, #0x0\n"
+      "movi v20.16b, #0x0\n"
+      "movi v21.16b, #0x0\n"
+      "movi v22.16b, #0x0\n"
+      "movi v23.16b, #0x0\n"
+      "movi v24.16b, #0x0\n"
+      "movi v25.16b, #0x0\n"
+      "movi v26.16b, #0x0\n"
+      "movi v27.16b, #0x0\n"
+      "163:"  // Height 5: setup done
+      "mov x12, #0x0\n"
+      "164:"  // Height 5: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 165f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "cbnz x12, 166f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #1\n"
+      "add x28, x28, x19, LSL #1\n"
+      "add x26, x26, x19, LSL #1\n"
+      "add x24, x24, x19, LSL #1\n"
+      "add x22, x22, x19, LSL #1\n"
+      "b 166f\n"
+      "165:"  // Height 5: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #1\n"
+      "add x26, x28, x19, LSL #1\n"
+      "add x24, x26, x19, LSL #1\n"
+      "add x22, x24, x19, LSL #1\n"
+      "166:"  // Height 5: input setup done
+      "cmp x11, #0x8\n"
+      "blt 169f\n"
+      "cmp x11, #0x10\n"
+      "blt 168f\n"
+      "167:"  // Height 5: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q4, [x22, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4f42f0d0  // bfdot v16.4s, v6.8h, v2.h[0]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x4f43f0d4  // bfdot v20.4s, v6.8h, v3.h[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x4f44f0d8  // bfdot v24.4s, v6.8h, v4.h[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr q6, [x15, #0x20]\n"
+      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
+      "add x24, x24, #0x10\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
+      "add x22, x22, #0x10\n"
+      ".inst 0x4f42f0f1  // bfdot v17.4s, v7.8h, v2.h[0]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "sub x11, x11, #0x8\n"
+      ".inst 0x4f43f0f5  // bfdot v21.4s, v7.8h, v3.h[0]\n"
+      "cmp x11, #0x10\n"
+      ".inst 0x4f44f0f9  // bfdot v25.4s, v7.8h, v4.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
+      ".inst 0x4f43f0d6  // bfdot v22.4s, v6.8h, v3.h[0]\n"
+      ".inst 0x4f44f0da  // bfdot v26.4s, v6.8h, v4.h[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
+      ".inst 0x4f43f0f7  // bfdot v23.4s, v7.8h, v3.h[0]\n"
+      ".inst 0x4f44f0fb  // bfdot v27.4s, v7.8h, v4.h[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      ".inst 0x4f60f0c8  // bfdot v8.4s, v6.8h, v0.h[1]\n"
+      ".inst 0x4f61f0cc  // bfdot v12.4s, v6.8h, v1.h[1]\n"
+      ".inst 0x4f62f0d0  // bfdot v16.4s, v6.8h, v2.h[1]\n"
+      ".inst 0x4f63f0d4  // bfdot v20.4s, v6.8h, v3.h[1]\n"
+      ".inst 0x4f64f0d8  // bfdot v24.4s, v6.8h, v4.h[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      ".inst 0x4f60f0e9  // bfdot v9.4s, v7.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ed  // bfdot v13.4s, v7.8h, v1.h[1]\n"
+      ".inst 0x4f62f0f1  // bfdot v17.4s, v7.8h, v2.h[1]\n"
+      ".inst 0x4f63f0f5  // bfdot v21.4s, v7.8h, v3.h[1]\n"
+      ".inst 0x4f64f0f9  // bfdot v25.4s, v7.8h, v4.h[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      ".inst 0x4f60f0ca  // bfdot v10.4s, v6.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ce  // bfdot v14.4s, v6.8h, v1.h[1]\n"
+      ".inst 0x4f62f0d2  // bfdot v18.4s, v6.8h, v2.h[1]\n"
+      ".inst 0x4f63f0d6  // bfdot v22.4s, v6.8h, v3.h[1]\n"
+      ".inst 0x4f64f0da  // bfdot v26.4s, v6.8h, v4.h[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      ".inst 0x4f60f0eb  // bfdot v11.4s, v7.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ef  // bfdot v15.4s, v7.8h, v1.h[1]\n"
+      ".inst 0x4f62f0f3  // bfdot v19.4s, v7.8h, v2.h[1]\n"
+      ".inst 0x4f63f0f7  // bfdot v23.4s, v7.8h, v3.h[1]\n"
+      ".inst 0x4f64f0fb  // bfdot v27.4s, v7.8h, v4.h[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      ".inst 0x4f40f8c8  // bfdot v8.4s, v6.8h, v0.h[2]\n"
+      ".inst 0x4f41f8cc  // bfdot v12.4s, v6.8h, v1.h[2]\n"
+      ".inst 0x4f42f8d0  // bfdot v16.4s, v6.8h, v2.h[2]\n"
+      ".inst 0x4f43f8d4  // bfdot v20.4s, v6.8h, v3.h[2]\n"
+      ".inst 0x4f44f8d8  // bfdot v24.4s, v6.8h, v4.h[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      ".inst 0x4f40f8e9  // bfdot v9.4s, v7.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ed  // bfdot v13.4s, v7.8h, v1.h[2]\n"
+      ".inst 0x4f42f8f1  // bfdot v17.4s, v7.8h, v2.h[2]\n"
+      ".inst 0x4f43f8f5  // bfdot v21.4s, v7.8h, v3.h[2]\n"
+      ".inst 0x4f44f8f9  // bfdot v25.4s, v7.8h, v4.h[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      ".inst 0x4f40f8ca  // bfdot v10.4s, v6.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ce  // bfdot v14.4s, v6.8h, v1.h[2]\n"
+      ".inst 0x4f42f8d2  // bfdot v18.4s, v6.8h, v2.h[2]\n"
+      ".inst 0x4f43f8d6  // bfdot v22.4s, v6.8h, v3.h[2]\n"
+      ".inst 0x4f44f8da  // bfdot v26.4s, v6.8h, v4.h[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      ".inst 0x4f40f8eb  // bfdot v11.4s, v7.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ef  // bfdot v15.4s, v7.8h, v1.h[2]\n"
+      ".inst 0x4f42f8f3  // bfdot v19.4s, v7.8h, v2.h[2]\n"
+      ".inst 0x4f43f8f7  // bfdot v23.4s, v7.8h, v3.h[2]\n"
+      ".inst 0x4f44f8fb  // bfdot v27.4s, v7.8h, v4.h[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      ".inst 0x4f60f8c8  // bfdot v8.4s, v6.8h, v0.h[3]\n"
+      ".inst 0x4f61f8cc  // bfdot v12.4s, v6.8h, v1.h[3]\n"
+      ".inst 0x4f62f8d0  // bfdot v16.4s, v6.8h, v2.h[3]\n"
+      ".inst 0x4f63f8d4  // bfdot v20.4s, v6.8h, v3.h[3]\n"
+      ".inst 0x4f64f8d8  // bfdot v24.4s, v6.8h, v4.h[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      ".inst 0x4f60f8e9  // bfdot v9.4s, v7.8h, v0.h[3]\n"
+      ".inst 0x4f61f8ed  // bfdot v13.4s, v7.8h, v1.h[3]\n"
+      ".inst 0x4f62f8f1  // bfdot v17.4s, v7.8h, v2.h[3]\n"
+      ".inst 0x4f63f8f5  // bfdot v21.4s, v7.8h, v3.h[3]\n"
+      ".inst 0x4f64f8f9  // bfdot v25.4s, v7.8h, v4.h[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      ".inst 0x4f60f8ca  // bfdot v10.4s, v6.8h, v0.h[3]\n"
+      "add x15, x15, #0x100\n"
+      ".inst 0x4f61f8ce  // bfdot v14.4s, v6.8h, v1.h[3]\n"
+      ".inst 0x4f62f8d2  // bfdot v18.4s, v6.8h, v2.h[3]\n"
+      ".inst 0x4f63f8d6  // bfdot v22.4s, v6.8h, v3.h[3]\n"
+      ".inst 0x4f64f8da  // bfdot v26.4s, v6.8h, v4.h[3]\n"
+      ".inst 0x4f60f8eb  // bfdot v11.4s, v7.8h, v0.h[3]\n"
+      ".inst 0x4f61f8ef  // bfdot v15.4s, v7.8h, v1.h[3]\n"
+      ".inst 0x4f62f8f3  // bfdot v19.4s, v7.8h, v2.h[3]\n"
+      ".inst 0x4f63f8f7  // bfdot v23.4s, v7.8h, v3.h[3]\n"
+      ".inst 0x4f64f8fb  // bfdot v27.4s, v7.8h, v4.h[3]\n"
+      "bge 167b\n"
+      "168:"  // Height 5: Multiply loop: Single iteration only
+      "sub x11, x11, #0x8\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q4, [x22, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4f42f0d0  // bfdot v16.4s, v6.8h, v2.h[0]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x4f43f0d4  // bfdot v20.4s, v6.8h, v3.h[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x4f44f0d8  // bfdot v24.4s, v6.8h, v4.h[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr q6, [x15, #0x20]\n"
+      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
+      "add x24, x24, #0x10\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
+      "add x22, x22, #0x10\n"
+      ".inst 0x4f42f0f1  // bfdot v17.4s, v7.8h, v2.h[0]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      ".inst 0x4f43f0f5  // bfdot v21.4s, v7.8h, v3.h[0]\n"
+      ".inst 0x4f44f0f9  // bfdot v25.4s, v7.8h, v4.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
+      ".inst 0x4f43f0d6  // bfdot v22.4s, v6.8h, v3.h[0]\n"
+      ".inst 0x4f44f0da  // bfdot v26.4s, v6.8h, v4.h[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
+      ".inst 0x4f43f0f7  // bfdot v23.4s, v7.8h, v3.h[0]\n"
+      ".inst 0x4f44f0fb  // bfdot v27.4s, v7.8h, v4.h[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      ".inst 0x4f60f0c8  // bfdot v8.4s, v6.8h, v0.h[1]\n"
+      ".inst 0x4f61f0cc  // bfdot v12.4s, v6.8h, v1.h[1]\n"
+      ".inst 0x4f62f0d0  // bfdot v16.4s, v6.8h, v2.h[1]\n"
+      ".inst 0x4f63f0d4  // bfdot v20.4s, v6.8h, v3.h[1]\n"
+      ".inst 0x4f64f0d8  // bfdot v24.4s, v6.8h, v4.h[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      ".inst 0x4f60f0e9  // bfdot v9.4s, v7.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ed  // bfdot v13.4s, v7.8h, v1.h[1]\n"
+      ".inst 0x4f62f0f1  // bfdot v17.4s, v7.8h, v2.h[1]\n"
+      ".inst 0x4f63f0f5  // bfdot v21.4s, v7.8h, v3.h[1]\n"
+      ".inst 0x4f64f0f9  // bfdot v25.4s, v7.8h, v4.h[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      ".inst 0x4f60f0ca  // bfdot v10.4s, v6.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ce  // bfdot v14.4s, v6.8h, v1.h[1]\n"
+      ".inst 0x4f62f0d2  // bfdot v18.4s, v6.8h, v2.h[1]\n"
+      ".inst 0x4f63f0d6  // bfdot v22.4s, v6.8h, v3.h[1]\n"
+      ".inst 0x4f64f0da  // bfdot v26.4s, v6.8h, v4.h[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      ".inst 0x4f60f0eb  // bfdot v11.4s, v7.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ef  // bfdot v15.4s, v7.8h, v1.h[1]\n"
+      ".inst 0x4f62f0f3  // bfdot v19.4s, v7.8h, v2.h[1]\n"
+      ".inst 0x4f63f0f7  // bfdot v23.4s, v7.8h, v3.h[1]\n"
+      ".inst 0x4f64f0fb  // bfdot v27.4s, v7.8h, v4.h[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      ".inst 0x4f40f8c8  // bfdot v8.4s, v6.8h, v0.h[2]\n"
+      ".inst 0x4f41f8cc  // bfdot v12.4s, v6.8h, v1.h[2]\n"
+      ".inst 0x4f42f8d0  // bfdot v16.4s, v6.8h, v2.h[2]\n"
+      ".inst 0x4f43f8d4  // bfdot v20.4s, v6.8h, v3.h[2]\n"
+      ".inst 0x4f44f8d8  // bfdot v24.4s, v6.8h, v4.h[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      ".inst 0x4f40f8e9  // bfdot v9.4s, v7.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ed  // bfdot v13.4s, v7.8h, v1.h[2]\n"
+      ".inst 0x4f42f8f1  // bfdot v17.4s, v7.8h, v2.h[2]\n"
+      ".inst 0x4f43f8f5  // bfdot v21.4s, v7.8h, v3.h[2]\n"
+      ".inst 0x4f44f8f9  // bfdot v25.4s, v7.8h, v4.h[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      ".inst 0x4f40f8ca  // bfdot v10.4s, v6.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ce  // bfdot v14.4s, v6.8h, v1.h[2]\n"
+      ".inst 0x4f42f8d2  // bfdot v18.4s, v6.8h, v2.h[2]\n"
+      ".inst 0x4f43f8d6  // bfdot v22.4s, v6.8h, v3.h[2]\n"
+      ".inst 0x4f44f8da  // bfdot v26.4s, v6.8h, v4.h[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      ".inst 0x4f40f8eb  // bfdot v11.4s, v7.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ef  // bfdot v15.4s, v7.8h, v1.h[2]\n"
+      ".inst 0x4f42f8f3  // bfdot v19.4s, v7.8h, v2.h[2]\n"
+      ".inst 0x4f43f8f7  // bfdot v23.4s, v7.8h, v3.h[2]\n"
+      ".inst 0x4f44f8fb  // bfdot v27.4s, v7.8h, v4.h[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      ".inst 0x4f60f8c8  // bfdot v8.4s, v6.8h, v0.h[3]\n"
+      ".inst 0x4f61f8cc  // bfdot v12.4s, v6.8h, v1.h[3]\n"
+      ".inst 0x4f62f8d0  // bfdot v16.4s, v6.8h, v2.h[3]\n"
+      ".inst 0x4f63f8d4  // bfdot v20.4s, v6.8h, v3.h[3]\n"
+      ".inst 0x4f64f8d8  // bfdot v24.4s, v6.8h, v4.h[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      ".inst 0x4f60f8e9  // bfdot v9.4s, v7.8h, v0.h[3]\n"
+      ".inst 0x4f61f8ed  // bfdot v13.4s, v7.8h, v1.h[3]\n"
+      ".inst 0x4f62f8f1  // bfdot v17.4s, v7.8h, v2.h[3]\n"
+      ".inst 0x4f63f8f5  // bfdot v21.4s, v7.8h, v3.h[3]\n"
+      ".inst 0x4f64f8f9  // bfdot v25.4s, v7.8h, v4.h[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      ".inst 0x4f60f8ca  // bfdot v10.4s, v6.8h, v0.h[3]\n"
+      "add x15, x15, #0x100\n"
+      ".inst 0x4f61f8ce  // bfdot v14.4s, v6.8h, v1.h[3]\n"
+      ".inst 0x4f62f8d2  // bfdot v18.4s, v6.8h, v2.h[3]\n"
+      ".inst 0x4f63f8d6  // bfdot v22.4s, v6.8h, v3.h[3]\n"
+      ".inst 0x4f64f8da  // bfdot v26.4s, v6.8h, v4.h[3]\n"
+      ".inst 0x4f60f8eb  // bfdot v11.4s, v7.8h, v0.h[3]\n"
+      ".inst 0x4f61f8ef  // bfdot v15.4s, v7.8h, v1.h[3]\n"
+      ".inst 0x4f62f8f3  // bfdot v19.4s, v7.8h, v2.h[3]\n"
+      ".inst 0x4f63f8f7  // bfdot v23.4s, v7.8h, v3.h[3]\n"
+      ".inst 0x4f64f8fb  // bfdot v27.4s, v7.8h, v4.h[3]\n"
+      "169:"  // Height 5: Multiply loop: Main loop skip
+      "cbz x11, 174f\n"
+      "cmp x11, #0x2\n"
+      "blt 171f\n"
+      "170:"  // Height 5: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
+      "ldr s3, [x24], #0x4\n"
+      "ldr s4, [x22], #0x4\n"
+      "ldr q6, [x15, #0x0]\n"
+      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
+      "sub x11, x11, #0x2\n"
+      ".inst 0x4f42f0d0  // bfdot v16.4s, v6.8h, v2.h[0]\n"
+      "cmp x11, #0x2\n"
+      ".inst 0x4f43f0d4  // bfdot v20.4s, v6.8h, v3.h[0]\n"
+      ".inst 0x4f44f0d8  // bfdot v24.4s, v6.8h, v4.h[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f42f0f1  // bfdot v17.4s, v7.8h, v2.h[0]\n"
+      ".inst 0x4f43f0f5  // bfdot v21.4s, v7.8h, v3.h[0]\n"
+      ".inst 0x4f44f0f9  // bfdot v25.4s, v7.8h, v4.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
+      "add x15, x15, #0x40\n"
+      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
+      ".inst 0x4f43f0d6  // bfdot v22.4s, v6.8h, v3.h[0]\n"
+      ".inst 0x4f44f0da  // bfdot v26.4s, v6.8h, v4.h[0]\n"
+      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
+      ".inst 0x4f43f0f7  // bfdot v23.4s, v7.8h, v3.h[0]\n"
+      ".inst 0x4f44f0fb  // bfdot v27.4s, v7.8h, v4.h[0]\n"
+      "bge 170b\n"
+      "cbz x11, 174f\n"
+      "171:"  // Height 5: Multiply loop: Skip odd blocks
+      "tbz x11, #1, 172f\n"
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
+      "ldr s3, [x24], #0x4\n"
+      "ldr s4, [x22], #0x4\n"
+      "tbz x11, #0, 173f\n"
+      "ld1 { v0.h }[2], [x10]\n"
+      "ld1 { v1.h }[2], [x28]\n"
+      "ld1 { v2.h }[2], [x26]\n"
+      "ld1 { v3.h }[2], [x24]\n"
+      "ld1 { v4.h }[2], [x22]\n"
+      "b 173f\n"
+      "172:"  // Height 5: Multiply loop: Ragged operand read: partial_1_0
+      "ldr h0, [x10, #0x0]\n"
+      "ldr h1, [x28, #0x0]\n"
+      "ldr h2, [x26, #0x0]\n"
+      "ldr h3, [x24, #0x0]\n"
+      "ldr h4, [x22, #0x0]\n"
+      "173:"  // Height 5: Multiply loop: Ragged operand read: Done
+      "ldr q6, [x15, #0x0]\n"
+      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f42f0d0  // bfdot v16.4s, v6.8h, v2.h[0]\n"
+      ".inst 0x4f43f0d4  // bfdot v20.4s, v6.8h, v3.h[0]\n"
+      ".inst 0x4f44f0d8  // bfdot v24.4s, v6.8h, v4.h[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f42f0f1  // bfdot v17.4s, v7.8h, v2.h[0]\n"
+      ".inst 0x4f43f0f5  // bfdot v21.4s, v7.8h, v3.h[0]\n"
+      ".inst 0x4f44f0f9  // bfdot v25.4s, v7.8h, v4.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
+      "add x15, x15, #0x40\n"
+      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
+      ".inst 0x4f43f0d6  // bfdot v22.4s, v6.8h, v3.h[0]\n"
+      ".inst 0x4f44f0da  // bfdot v26.4s, v6.8h, v4.h[0]\n"
+      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
+      ".inst 0x4f43f0f7  // bfdot v23.4s, v7.8h, v3.h[0]\n"
+      ".inst 0x4f44f0fb  // bfdot v27.4s, v7.8h, v4.h[0]\n"
+      "174:"  // Height 5: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 164b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "tbz %x[flags], #1, 175f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v1.4s }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v0.4s }, [x19]\n"
+      "fmin v8.4s, v8.4s, v0.4s\n"
+      "fmin v9.4s, v9.4s, v0.4s\n"
+      "fmin v10.4s, v10.4s, v0.4s\n"
+      "fmin v11.4s, v11.4s, v0.4s\n"
+      "fmax v8.4s, v8.4s, v1.4s\n"
+      "fmax v9.4s, v9.4s, v1.4s\n"
+      "fmax v10.4s, v10.4s, v1.4s\n"
+      "fmax v11.4s, v11.4s, v1.4s\n"
+      "fmin v12.4s, v12.4s, v0.4s\n"
+      "fmin v13.4s, v13.4s, v0.4s\n"
+      "fmin v14.4s, v14.4s, v0.4s\n"
+      "fmax v12.4s, v12.4s, v1.4s\n"
+      "fmax v13.4s, v13.4s, v1.4s\n"
+      "fmax v14.4s, v14.4s, v1.4s\n"
+      "fmin v15.4s, v15.4s, v0.4s\n"
+      "fmin v16.4s, v16.4s, v0.4s\n"
+      "fmin v17.4s, v17.4s, v0.4s\n"
+      "fmax v15.4s, v15.4s, v1.4s\n"
+      "fmax v16.4s, v16.4s, v1.4s\n"
+      "fmax v17.4s, v17.4s, v1.4s\n"
+      "fmin v18.4s, v18.4s, v0.4s\n"
+      "fmin v19.4s, v19.4s, v0.4s\n"
+      "fmin v20.4s, v20.4s, v0.4s\n"
+      "fmax v18.4s, v18.4s, v1.4s\n"
+      "fmax v19.4s, v19.4s, v1.4s\n"
+      "fmax v20.4s, v20.4s, v1.4s\n"
+      "fmin v21.4s, v21.4s, v0.4s\n"
+      "fmin v22.4s, v22.4s, v0.4s\n"
+      "fmin v23.4s, v23.4s, v0.4s\n"
+      "fmax v21.4s, v21.4s, v1.4s\n"
+      "fmax v22.4s, v22.4s, v1.4s\n"
+      "fmax v23.4s, v23.4s, v1.4s\n"
+      "fmin v24.4s, v24.4s, v0.4s\n"
+      "fmin v25.4s, v25.4s, v0.4s\n"
+      "fmin v26.4s, v26.4s, v0.4s\n"
+      "fmax v24.4s, v24.4s, v1.4s\n"
+      "fmax v25.4s, v25.4s, v1.4s\n"
+      "fmax v26.4s, v26.4s, v1.4s\n"
+      "fmin v27.4s, v27.4s, v0.4s\n"
+      "fmax v27.4s, v27.4s, v1.4s\n"
+      "175:"  // Height 5: No activation
+      "cmp x16, #0x10\n"
+      "bge 184f\n"
+      "tbz x16, #3, 179f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v9.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v13.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "st1 { v17.4s }, [x27], #0x10\n"
+      "st1 { v20.4s }, [x25], #0x10\n"
+      "st1 { v21.4s }, [x25], #0x10\n"
+      "st1 { v24.4s }, [x23], #0x10\n"
+      "st1 { v25.4s }, [x23], #0x10\n"
+      "tbz x16, #2, 177f\n"
+      "st1 { v10.4s }, [x13], #0x10\n"
+      "st1 { v14.4s }, [x9], #0x10\n"
+      "st1 { v18.4s }, [x27], #0x10\n"
+      "st1 { v22.4s }, [x25], #0x10\n"
+      "st1 { v26.4s }, [x23], #0x10\n"
+      "tbz x16, #1, 176f\n"
+      "str d11, [x13], #0x8\n"
+      "str d15, [x9], #0x8\n"
+      "str d19, [x27], #0x8\n"
+      "str d23, [x25], #0x8\n"
+      "str d27, [x23], #0x8\n"
+      "tbz x16, #0, 183f\n"
+      "st1 { v11.s }[2], [x13]\n"
+      "st1 { v15.s }[2], [x9]\n"
+      "st1 { v19.s }[2], [x27]\n"
+      "st1 { v23.s }[2], [x25]\n"
+      "st1 { v27.s }[2], [x23]\n"
+      "b 183f\n"
+      "176:"  // Height 5: Partial direct writeback: partial_1_12
+      "tbz x16, #0, 183f\n"
+      "str s11, [x13, #0x0]\n"
+      "str s15, [x9, #0x0]\n"
+      "str s19, [x27, #0x0]\n"
+      "str s23, [x25, #0x0]\n"
+      "str s27, [x23, #0x0]\n"
+      "b 183f\n"
+      "177:"  // Height 5: Partial direct writeback: partial_2_8
+      "tbz x16, #1, 178f\n"
+      "str d10, [x13], #0x8\n"
+      "str d14, [x9], #0x8\n"
+      "str d18, [x27], #0x8\n"
+      "str d22, [x25], #0x8\n"
+      "str d26, [x23], #0x8\n"
+      "tbz x16, #0, 183f\n"
+      "st1 { v10.s }[2], [x13]\n"
+      "st1 { v14.s }[2], [x9]\n"
+      "st1 { v18.s }[2], [x27]\n"
+      "st1 { v22.s }[2], [x25]\n"
+      "st1 { v26.s }[2], [x23]\n"
+      "b 183f\n"
+      "178:"  // Height 5: Partial direct writeback: partial_1_8
+      "tbz x16, #0, 183f\n"
+      "str s10, [x13, #0x0]\n"
+      "str s14, [x9, #0x0]\n"
+      "str s18, [x27, #0x0]\n"
+      "str s22, [x25, #0x0]\n"
+      "str s26, [x23, #0x0]\n"
+      "b 183f\n"
+      "179:"  // Height 5: Partial direct writeback: partial_4_0
+      "tbz x16, #2, 181f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "st1 { v20.4s }, [x25], #0x10\n"
+      "st1 { v24.4s }, [x23], #0x10\n"
+      "tbz x16, #1, 180f\n"
+      "str d9, [x13], #0x8\n"
+      "str d13, [x9], #0x8\n"
+      "str d17, [x27], #0x8\n"
+      "str d21, [x25], #0x8\n"
+      "str d25, [x23], #0x8\n"
+      "tbz x16, #0, 183f\n"
+      "st1 { v9.s }[2], [x13]\n"
+      "st1 { v13.s }[2], [x9]\n"
+      "st1 { v17.s }[2], [x27]\n"
+      "st1 { v21.s }[2], [x25]\n"
+      "st1 { v25.s }[2], [x23]\n"
+      "b 183f\n"
+      "180:"  // Height 5: Partial direct writeback: partial_1_4
+      "tbz x16, #0, 183f\n"
+      "str s9, [x13, #0x0]\n"
+      "str s13, [x9, #0x0]\n"
+      "str s17, [x27, #0x0]\n"
+      "str s21, [x25, #0x0]\n"
+      "str s25, [x23, #0x0]\n"
+      "b 183f\n"
+      "181:"  // Height 5: Partial direct writeback: partial_2_0
+      "tbz x16, #1, 182f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "str d16, [x27], #0x8\n"
+      "str d20, [x25], #0x8\n"
+      "str d24, [x23], #0x8\n"
+      "tbz x16, #0, 183f\n"
+      "st1 { v8.s }[2], [x13]\n"
+      "st1 { v12.s }[2], [x9]\n"
+      "st1 { v16.s }[2], [x27]\n"
+      "st1 { v20.s }[2], [x25]\n"
+      "st1 { v24.s }[2], [x23]\n"
+      "b 183f\n"
+      "182:"  // Height 5: Partial direct writeback: partial_1_0
+      "str s8, [x13, #0x0]\n"
+      "str s12, [x9, #0x0]\n"
+      "str s16, [x27, #0x0]\n"
+      "str s20, [x25, #0x0]\n"
+      "str s24, [x23, #0x0]\n"
+      "183:"  // Height 5: Partial direct writeback: Done
+      "b 185f\n"
+      "184:"  // Height 5: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "str q12, [x9, #0x0]\n"
+      "str q13, [x9, #0x10]\n"
+      "str q14, [x9, #0x20]\n"
+      "str q15, [x9, #0x30]\n"
+      "str q16, [x27, #0x0]\n"
+      "str q17, [x27, #0x10]\n"
+      "str q18, [x27, #0x20]\n"
+      "str q19, [x27, #0x30]\n"
+      "str q20, [x25, #0x0]\n"
+      "str q21, [x25, #0x10]\n"
+      "str q22, [x25, #0x20]\n"
+      "str q23, [x25, #0x30]\n"
+      "str q24, [x23, #0x0]\n"
+      "str q25, [x23, #0x10]\n"
+      "str q26, [x23, #0x20]\n"
+      "str q27, [x23, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "add x9, x9, #0x40\n"
+      "add x27, x27, #0x40\n"
+      "add x25, x25, #0x40\n"
+      "add x23, x23, #0x40\n"
+      "185:"  // Height 5: Writeback done
+      "subs x16, x16, #0x10\n"
+      "bgt 151b\n"
+      "b 224f\n"
+      "186:"  // Height 6
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 187f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "ldr x23, [%x[output_ptr], #0x20]\n"
+      "add x27, x27, x19, LSL #2\n"
+      "ldr x21, [%x[output_ptr], #0x28]\n"
+      "add %x[output_ptr], %x[output_ptr], #0x30\n"
+      "add x25, x25, x19, LSL #2\n"
+      "add x23, x23, x19, LSL #2\n"
+      "add x21, x21, x19, LSL #2\n"
+      "b 188f\n"
+      "187:"  // Height 6: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "add x23, x25, x19, LSL #2\n"
+      "add x21, x23, x19, LSL #2\n"
+      "add %x[output_ptr], x21, x19, LSL #2\n"
+      "188:"  // Height 6: Column loop
+      "cbz x14, 189f\n"
+      "ldr q8, [x14, #0x0]\n"
+      "mov v12.16b, v8.16b\n"
+      "ldr q9, [x14, #0x10]\n"
+      "mov v16.16b, v8.16b\n"
+      "ldr q10, [x14, #0x20]\n"
+      "mov v20.16b, v8.16b\n"
+      "ldr q11, [x14, #0x30]\n"
+      "mov v24.16b, v8.16b\n"
+      "add x14, x14, #0x40\n"
+      "mov v28.16b, v8.16b\n"
+      "mov v13.16b, v9.16b\n"
+      "mov v17.16b, v9.16b\n"
+      "mov v14.16b, v10.16b\n"
+      "mov v15.16b, v11.16b\n"
+      "mov v18.16b, v10.16b\n"
+      "mov v19.16b, v11.16b\n"
+      "mov v21.16b, v9.16b\n"
+      "mov v22.16b, v10.16b\n"
+      "mov v23.16b, v11.16b\n"
+      "mov v25.16b, v9.16b\n"
+      "mov v26.16b, v10.16b\n"
+      "mov v27.16b, v11.16b\n"
+      "mov v29.16b, v9.16b\n"
+      "mov v30.16b, v10.16b\n"
+      "mov v31.16b, v11.16b\n"
+      "b 200f\n"
+      "189:"  // Height 6: no bias
+      "tbz %x[flags], #0, 199f\n"
+      "cmp x16, #0x10\n"
+      "bge 198f\n"
+      "tbz x16, #3, 193f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "ld1 { v20.4s }, [x25], #0x10\n"
+      "ld1 { v24.4s }, [x23], #0x10\n"
+      "ld1 { v28.4s }, [x21], #0x10\n"
+      "ld1 { v9.4s }, [x13], #0x10\n"
+      "ld1 { v13.4s }, [x9], #0x10\n"
+      "ld1 { v17.4s }, [x27], #0x10\n"
+      "ld1 { v21.4s }, [x25], #0x10\n"
+      "ld1 { v25.4s }, [x23], #0x10\n"
+      "ld1 { v29.4s }, [x21], #0x10\n"
+      "tbz x16, #2, 191f\n"
+      "ld1 { v10.4s }, [x13], #0x10\n"
+      "ld1 { v14.4s }, [x9], #0x10\n"
+      "ld1 { v18.4s }, [x27], #0x10\n"
+      "ld1 { v22.4s }, [x25], #0x10\n"
+      "ld1 { v26.4s }, [x23], #0x10\n"
+      "ld1 { v30.4s }, [x21], #0x10\n"
+      "tbz x16, #1, 190f\n"
+      "mov x19, #0x38\n"
+      "ldr d11, [x13], #0x8\n"
+      "ldr d15, [x9], #0x8\n"
+      "ldr d19, [x27], #0x8\n"
+      "ldr d23, [x25], #0x8\n"
+      "ldr d27, [x23], #0x8\n"
+      "ldr d31, [x21], #0x8\n"
+      "tbz x16, #0, 197f\n"
+      "ld1 { v11.s }[2], [x13]\n"
+      "ld1 { v15.s }[2], [x9]\n"
+      "ld1 { v19.s }[2], [x27]\n"
+      "ld1 { v23.s }[2], [x25]\n"
+      "ld1 { v27.s }[2], [x23]\n"
+      "ld1 { v31.s }[2], [x21]\n"
+      "b 197f\n"
+      "190:"  // Height 6: Partial accumulate: partial_1_12
+      "mov x19, #0x30\n"
+      "tbz x16, #0, 197f\n"
+      "ldr s11, [x13, #0x0]\n"
+      "ldr s15, [x9, #0x0]\n"
+      "ldr s19, [x27, #0x0]\n"
+      "ldr s23, [x25, #0x0]\n"
+      "ldr s27, [x23, #0x0]\n"
+      "ldr s31, [x21, #0x0]\n"
+      "b 197f\n"
+      "191:"  // Height 6: Partial accumulate: partial_2_8
+      "tbz x16, #1, 192f\n"
+      "ldr d10, [x13], #0x8\n"
+      "ldr d14, [x9], #0x8\n"
+      "ldr d18, [x27], #0x8\n"
+      "ldr d22, [x25], #0x8\n"
+      "ldr d26, [x23], #0x8\n"
+      "ldr d30, [x21], #0x8\n"
+      "mov x19, #0x28\n"
+      "tbz x16, #0, 197f\n"
+      "ld1 { v10.s }[2], [x13]\n"
+      "ld1 { v14.s }[2], [x9]\n"
+      "ld1 { v18.s }[2], [x27]\n"
+      "ld1 { v22.s }[2], [x25]\n"
+      "ld1 { v26.s }[2], [x23]\n"
+      "ld1 { v30.s }[2], [x21]\n"
+      "b 197f\n"
+      "192:"  // Height 6: Partial accumulate: partial_1_8
+      "mov x19, #0x20\n"
+      "tbz x16, #0, 197f\n"
+      "ldr s10, [x13, #0x0]\n"
+      "ldr s14, [x9, #0x0]\n"
+      "ldr s18, [x27, #0x0]\n"
+      "ldr s22, [x25, #0x0]\n"
+      "ldr s26, [x23, #0x0]\n"
+      "ldr s30, [x21, #0x0]\n"
+      "b 197f\n"
+      "193:"  // Height 6: Partial accumulate: partial_4_0
+      "tbz x16, #2, 195f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "ld1 { v20.4s }, [x25], #0x10\n"
+      "ld1 { v24.4s }, [x23], #0x10\n"
+      "ld1 { v28.4s }, [x21], #0x10\n"
+      "tbz x16, #1, 194f\n"
+      "mov x19, #0x18\n"
+      "ldr d9, [x13], #0x8\n"
+      "ldr d13, [x9], #0x8\n"
+      "ldr d17, [x27], #0x8\n"
+      "ldr d21, [x25], #0x8\n"
+      "ldr d25, [x23], #0x8\n"
+      "ldr d29, [x21], #0x8\n"
+      "tbz x16, #0, 197f\n"
+      "ld1 { v9.s }[2], [x13]\n"
+      "ld1 { v13.s }[2], [x9]\n"
+      "ld1 { v17.s }[2], [x27]\n"
+      "ld1 { v21.s }[2], [x25]\n"
+      "ld1 { v25.s }[2], [x23]\n"
+      "ld1 { v29.s }[2], [x21]\n"
+      "b 197f\n"
+      "194:"  // Height 6: Partial accumulate: partial_1_4
+      "mov x19, #0x10\n"
+      "tbz x16, #0, 197f\n"
+      "ldr s9, [x13, #0x0]\n"
+      "ldr s13, [x9, #0x0]\n"
+      "ldr s17, [x27, #0x0]\n"
+      "ldr s21, [x25, #0x0]\n"
+      "ldr s25, [x23, #0x0]\n"
+      "ldr s29, [x21, #0x0]\n"
+      "b 197f\n"
+      "195:"  // Height 6: Partial accumulate: partial_2_0
+      "tbz x16, #1, 196f\n"
+      "ldr d8, [x13], #0x8\n"
+      "ldr d12, [x9], #0x8\n"
+      "ldr d16, [x27], #0x8\n"
+      "ldr d20, [x25], #0x8\n"
+      "ldr d24, [x23], #0x8\n"
+      "ldr d28, [x21], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x16, #0, 197f\n"
+      "ld1 { v8.s }[2], [x13]\n"
+      "ld1 { v12.s }[2], [x9]\n"
+      "ld1 { v16.s }[2], [x27]\n"
+      "ld1 { v20.s }[2], [x25]\n"
+      "ld1 { v24.s }[2], [x23]\n"
+      "ld1 { v28.s }[2], [x21]\n"
+      "b 197f\n"
+      "196:"  // Height 6: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s8, [x13, #0x0]\n"
+      "ldr s12, [x9, #0x0]\n"
+      "ldr s16, [x27, #0x0]\n"
+      "ldr s20, [x25, #0x0]\n"
+      "ldr s24, [x23, #0x0]\n"
+      "ldr s28, [x21, #0x0]\n"
+      "197:"  // Height 6: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "sub x9, x9, x19\n"
+      "sub x27, x27, x19\n"
+      "sub x25, x25, x19\n"
+      "sub x23, x23, x19\n"
+      "sub x21, x21, x19\n"
+      "b 200f\n"
+      "198:"  // Height 6: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "ldr q12, [x9, #0x0]\n"
+      "ldr q13, [x9, #0x10]\n"
+      "ldr q14, [x9, #0x20]\n"
+      "ldr q15, [x9, #0x30]\n"
+      "ldr q16, [x27, #0x0]\n"
+      "ldr q17, [x27, #0x10]\n"
+      "ldr q18, [x27, #0x20]\n"
+      "ldr q19, [x27, #0x30]\n"
+      "ldr q20, [x25, #0x0]\n"
+      "ldr q21, [x25, #0x10]\n"
+      "ldr q22, [x25, #0x20]\n"
+      "ldr q23, [x25, #0x30]\n"
+      "ldr q24, [x23, #0x0]\n"
+      "ldr q25, [x23, #0x10]\n"
+      "ldr q26, [x23, #0x20]\n"
+      "ldr q27, [x23, #0x30]\n"
+      "ldr q28, [x21, #0x0]\n"
+      "ldr q29, [x21, #0x10]\n"
+      "ldr q30, [x21, #0x20]\n"
+      "ldr q31, [x21, #0x30]\n"
+      "b 200f\n"
+      "199:"  // Height 6: no accumulate
+      "movi v8.16b, #0x0\n"
+      "movi v9.16b, #0x0\n"
+      "movi v10.16b, #0x0\n"
+      "movi v11.16b, #0x0\n"
+      "movi v12.16b, #0x0\n"
+      "movi v13.16b, #0x0\n"
+      "movi v14.16b, #0x0\n"
+      "movi v15.16b, #0x0\n"
+      "movi v16.16b, #0x0\n"
+      "movi v17.16b, #0x0\n"
+      "movi v18.16b, #0x0\n"
+      "movi v19.16b, #0x0\n"
+      "movi v20.16b, #0x0\n"
+      "movi v21.16b, #0x0\n"
+      "movi v22.16b, #0x0\n"
+      "movi v23.16b, #0x0\n"
+      "movi v24.16b, #0x0\n"
+      "movi v25.16b, #0x0\n"
+      "movi v26.16b, #0x0\n"
+      "movi v27.16b, #0x0\n"
+      "movi v28.16b, #0x0\n"
+      "movi v29.16b, #0x0\n"
+      "movi v30.16b, #0x0\n"
+      "movi v31.16b, #0x0\n"
+      "200:"  // Height 6: setup done
+      "mov x12, #0x0\n"
+      "201:"  // Height 6: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 202f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x20, [x20, #0x28]\n"
+      "cbnz x12, 203f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #1\n"
+      "add x28, x28, x19, LSL #1\n"
+      "add x26, x26, x19, LSL #1\n"
+      "add x24, x24, x19, LSL #1\n"
+      "add x22, x22, x19, LSL #1\n"
+      "add x20, x20, x19, LSL #1\n"
+      "b 203f\n"
+      "202:"  // Height 6: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #1\n"
+      "add x26, x28, x19, LSL #1\n"
+      "add x24, x26, x19, LSL #1\n"
+      "add x22, x24, x19, LSL #1\n"
+      "add x20, x22, x19, LSL #1\n"
+      "203:"  // Height 6: input setup done
+      "cmp x11, #0x8\n"
+      "blt 206f\n"
+      "cmp x11, #0x10\n"
+      "blt 205f\n"
+      "204:"  // Height 6: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q4, [x22, #0x0]\n"
+      "ldr q5, [x20, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4f42f0d0  // bfdot v16.4s, v6.8h, v2.h[0]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x4f43f0d4  // bfdot v20.4s, v6.8h, v3.h[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x4f44f0d8  // bfdot v24.4s, v6.8h, v4.h[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x4f45f0dc  // bfdot v28.4s, v6.8h, v5.h[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "ldr q6, [x15, #0x20]\n"
+      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
+      "add x22, x22, #0x10\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
+      "add x20, x20, #0x10\n"
+      ".inst 0x4f42f0f1  // bfdot v17.4s, v7.8h, v2.h[0]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "sub x11, x11, #0x8\n"
+      ".inst 0x4f43f0f5  // bfdot v21.4s, v7.8h, v3.h[0]\n"
+      "cmp x11, #0x10\n"
+      ".inst 0x4f44f0f9  // bfdot v25.4s, v7.8h, v4.h[0]\n"
+      ".inst 0x4f45f0fd  // bfdot v29.4s, v7.8h, v5.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
+      ".inst 0x4f43f0d6  // bfdot v22.4s, v6.8h, v3.h[0]\n"
+      ".inst 0x4f44f0da  // bfdot v26.4s, v6.8h, v4.h[0]\n"
+      ".inst 0x4f45f0de  // bfdot v30.4s, v6.8h, v5.h[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
+      ".inst 0x4f43f0f7  // bfdot v23.4s, v7.8h, v3.h[0]\n"
+      ".inst 0x4f44f0fb  // bfdot v27.4s, v7.8h, v4.h[0]\n"
+      ".inst 0x4f45f0ff  // bfdot v31.4s, v7.8h, v5.h[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      ".inst 0x4f60f0c8  // bfdot v8.4s, v6.8h, v0.h[1]\n"
+      ".inst 0x4f61f0cc  // bfdot v12.4s, v6.8h, v1.h[1]\n"
+      ".inst 0x4f62f0d0  // bfdot v16.4s, v6.8h, v2.h[1]\n"
+      ".inst 0x4f63f0d4  // bfdot v20.4s, v6.8h, v3.h[1]\n"
+      ".inst 0x4f64f0d8  // bfdot v24.4s, v6.8h, v4.h[1]\n"
+      ".inst 0x4f65f0dc  // bfdot v28.4s, v6.8h, v5.h[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      ".inst 0x4f60f0e9  // bfdot v9.4s, v7.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ed  // bfdot v13.4s, v7.8h, v1.h[1]\n"
+      ".inst 0x4f62f0f1  // bfdot v17.4s, v7.8h, v2.h[1]\n"
+      ".inst 0x4f63f0f5  // bfdot v21.4s, v7.8h, v3.h[1]\n"
+      ".inst 0x4f64f0f9  // bfdot v25.4s, v7.8h, v4.h[1]\n"
+      ".inst 0x4f65f0fd  // bfdot v29.4s, v7.8h, v5.h[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      ".inst 0x4f60f0ca  // bfdot v10.4s, v6.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ce  // bfdot v14.4s, v6.8h, v1.h[1]\n"
+      ".inst 0x4f62f0d2  // bfdot v18.4s, v6.8h, v2.h[1]\n"
+      ".inst 0x4f63f0d6  // bfdot v22.4s, v6.8h, v3.h[1]\n"
+      ".inst 0x4f64f0da  // bfdot v26.4s, v6.8h, v4.h[1]\n"
+      ".inst 0x4f65f0de  // bfdot v30.4s, v6.8h, v5.h[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      ".inst 0x4f60f0eb  // bfdot v11.4s, v7.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ef  // bfdot v15.4s, v7.8h, v1.h[1]\n"
+      ".inst 0x4f62f0f3  // bfdot v19.4s, v7.8h, v2.h[1]\n"
+      ".inst 0x4f63f0f7  // bfdot v23.4s, v7.8h, v3.h[1]\n"
+      ".inst 0x4f64f0fb  // bfdot v27.4s, v7.8h, v4.h[1]\n"
+      ".inst 0x4f65f0ff  // bfdot v31.4s, v7.8h, v5.h[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      ".inst 0x4f40f8c8  // bfdot v8.4s, v6.8h, v0.h[2]\n"
+      ".inst 0x4f41f8cc  // bfdot v12.4s, v6.8h, v1.h[2]\n"
+      ".inst 0x4f42f8d0  // bfdot v16.4s, v6.8h, v2.h[2]\n"
+      ".inst 0x4f43f8d4  // bfdot v20.4s, v6.8h, v3.h[2]\n"
+      ".inst 0x4f44f8d8  // bfdot v24.4s, v6.8h, v4.h[2]\n"
+      ".inst 0x4f45f8dc  // bfdot v28.4s, v6.8h, v5.h[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      ".inst 0x4f40f8e9  // bfdot v9.4s, v7.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ed  // bfdot v13.4s, v7.8h, v1.h[2]\n"
+      ".inst 0x4f42f8f1  // bfdot v17.4s, v7.8h, v2.h[2]\n"
+      ".inst 0x4f43f8f5  // bfdot v21.4s, v7.8h, v3.h[2]\n"
+      ".inst 0x4f44f8f9  // bfdot v25.4s, v7.8h, v4.h[2]\n"
+      ".inst 0x4f45f8fd  // bfdot v29.4s, v7.8h, v5.h[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      ".inst 0x4f40f8ca  // bfdot v10.4s, v6.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ce  // bfdot v14.4s, v6.8h, v1.h[2]\n"
+      ".inst 0x4f42f8d2  // bfdot v18.4s, v6.8h, v2.h[2]\n"
+      ".inst 0x4f43f8d6  // bfdot v22.4s, v6.8h, v3.h[2]\n"
+      ".inst 0x4f44f8da  // bfdot v26.4s, v6.8h, v4.h[2]\n"
+      ".inst 0x4f45f8de  // bfdot v30.4s, v6.8h, v5.h[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      ".inst 0x4f40f8eb  // bfdot v11.4s, v7.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ef  // bfdot v15.4s, v7.8h, v1.h[2]\n"
+      ".inst 0x4f42f8f3  // bfdot v19.4s, v7.8h, v2.h[2]\n"
+      ".inst 0x4f43f8f7  // bfdot v23.4s, v7.8h, v3.h[2]\n"
+      ".inst 0x4f44f8fb  // bfdot v27.4s, v7.8h, v4.h[2]\n"
+      ".inst 0x4f45f8ff  // bfdot v31.4s, v7.8h, v5.h[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      ".inst 0x4f60f8c8  // bfdot v8.4s, v6.8h, v0.h[3]\n"
+      ".inst 0x4f61f8cc  // bfdot v12.4s, v6.8h, v1.h[3]\n"
+      ".inst 0x4f62f8d0  // bfdot v16.4s, v6.8h, v2.h[3]\n"
+      ".inst 0x4f63f8d4  // bfdot v20.4s, v6.8h, v3.h[3]\n"
+      ".inst 0x4f64f8d8  // bfdot v24.4s, v6.8h, v4.h[3]\n"
+      ".inst 0x4f65f8dc  // bfdot v28.4s, v6.8h, v5.h[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      ".inst 0x4f60f8e9  // bfdot v9.4s, v7.8h, v0.h[3]\n"
+      ".inst 0x4f61f8ed  // bfdot v13.4s, v7.8h, v1.h[3]\n"
+      ".inst 0x4f62f8f1  // bfdot v17.4s, v7.8h, v2.h[3]\n"
+      ".inst 0x4f63f8f5  // bfdot v21.4s, v7.8h, v3.h[3]\n"
+      ".inst 0x4f64f8f9  // bfdot v25.4s, v7.8h, v4.h[3]\n"
+      ".inst 0x4f65f8fd  // bfdot v29.4s, v7.8h, v5.h[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      ".inst 0x4f60f8ca  // bfdot v10.4s, v6.8h, v0.h[3]\n"
+      "add x15, x15, #0x100\n"
+      ".inst 0x4f61f8ce  // bfdot v14.4s, v6.8h, v1.h[3]\n"
+      ".inst 0x4f62f8d2  // bfdot v18.4s, v6.8h, v2.h[3]\n"
+      ".inst 0x4f63f8d6  // bfdot v22.4s, v6.8h, v3.h[3]\n"
+      ".inst 0x4f64f8da  // bfdot v26.4s, v6.8h, v4.h[3]\n"
+      ".inst 0x4f65f8de  // bfdot v30.4s, v6.8h, v5.h[3]\n"
+      ".inst 0x4f60f8eb  // bfdot v11.4s, v7.8h, v0.h[3]\n"
+      ".inst 0x4f61f8ef  // bfdot v15.4s, v7.8h, v1.h[3]\n"
+      ".inst 0x4f62f8f3  // bfdot v19.4s, v7.8h, v2.h[3]\n"
+      ".inst 0x4f63f8f7  // bfdot v23.4s, v7.8h, v3.h[3]\n"
+      ".inst 0x4f64f8fb  // bfdot v27.4s, v7.8h, v4.h[3]\n"
+      ".inst 0x4f65f8ff  // bfdot v31.4s, v7.8h, v5.h[3]\n"
+      "bge 204b\n"
+      "205:"  // Height 6: Multiply loop: Single iteration only
+      "sub x11, x11, #0x8\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q4, [x22, #0x0]\n"
+      "ldr q5, [x20, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4f42f0d0  // bfdot v16.4s, v6.8h, v2.h[0]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x4f43f0d4  // bfdot v20.4s, v6.8h, v3.h[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x4f44f0d8  // bfdot v24.4s, v6.8h, v4.h[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x4f45f0dc  // bfdot v28.4s, v6.8h, v5.h[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "ldr q6, [x15, #0x20]\n"
+      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
+      "add x22, x22, #0x10\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
+      "add x20, x20, #0x10\n"
+      ".inst 0x4f42f0f1  // bfdot v17.4s, v7.8h, v2.h[0]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      ".inst 0x4f43f0f5  // bfdot v21.4s, v7.8h, v3.h[0]\n"
+      ".inst 0x4f44f0f9  // bfdot v25.4s, v7.8h, v4.h[0]\n"
+      ".inst 0x4f45f0fd  // bfdot v29.4s, v7.8h, v5.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
+      ".inst 0x4f43f0d6  // bfdot v22.4s, v6.8h, v3.h[0]\n"
+      ".inst 0x4f44f0da  // bfdot v26.4s, v6.8h, v4.h[0]\n"
+      ".inst 0x4f45f0de  // bfdot v30.4s, v6.8h, v5.h[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
+      ".inst 0x4f43f0f7  // bfdot v23.4s, v7.8h, v3.h[0]\n"
+      ".inst 0x4f44f0fb  // bfdot v27.4s, v7.8h, v4.h[0]\n"
+      ".inst 0x4f45f0ff  // bfdot v31.4s, v7.8h, v5.h[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      ".inst 0x4f60f0c8  // bfdot v8.4s, v6.8h, v0.h[1]\n"
+      ".inst 0x4f61f0cc  // bfdot v12.4s, v6.8h, v1.h[1]\n"
+      ".inst 0x4f62f0d0  // bfdot v16.4s, v6.8h, v2.h[1]\n"
+      ".inst 0x4f63f0d4  // bfdot v20.4s, v6.8h, v3.h[1]\n"
+      ".inst 0x4f64f0d8  // bfdot v24.4s, v6.8h, v4.h[1]\n"
+      ".inst 0x4f65f0dc  // bfdot v28.4s, v6.8h, v5.h[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      ".inst 0x4f60f0e9  // bfdot v9.4s, v7.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ed  // bfdot v13.4s, v7.8h, v1.h[1]\n"
+      ".inst 0x4f62f0f1  // bfdot v17.4s, v7.8h, v2.h[1]\n"
+      ".inst 0x4f63f0f5  // bfdot v21.4s, v7.8h, v3.h[1]\n"
+      ".inst 0x4f64f0f9  // bfdot v25.4s, v7.8h, v4.h[1]\n"
+      ".inst 0x4f65f0fd  // bfdot v29.4s, v7.8h, v5.h[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      ".inst 0x4f60f0ca  // bfdot v10.4s, v6.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ce  // bfdot v14.4s, v6.8h, v1.h[1]\n"
+      ".inst 0x4f62f0d2  // bfdot v18.4s, v6.8h, v2.h[1]\n"
+      ".inst 0x4f63f0d6  // bfdot v22.4s, v6.8h, v3.h[1]\n"
+      ".inst 0x4f64f0da  // bfdot v26.4s, v6.8h, v4.h[1]\n"
+      ".inst 0x4f65f0de  // bfdot v30.4s, v6.8h, v5.h[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      ".inst 0x4f60f0eb  // bfdot v11.4s, v7.8h, v0.h[1]\n"
+      ".inst 0x4f61f0ef  // bfdot v15.4s, v7.8h, v1.h[1]\n"
+      ".inst 0x4f62f0f3  // bfdot v19.4s, v7.8h, v2.h[1]\n"
+      ".inst 0x4f63f0f7  // bfdot v23.4s, v7.8h, v3.h[1]\n"
+      ".inst 0x4f64f0fb  // bfdot v27.4s, v7.8h, v4.h[1]\n"
+      ".inst 0x4f65f0ff  // bfdot v31.4s, v7.8h, v5.h[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      ".inst 0x4f40f8c8  // bfdot v8.4s, v6.8h, v0.h[2]\n"
+      ".inst 0x4f41f8cc  // bfdot v12.4s, v6.8h, v1.h[2]\n"
+      ".inst 0x4f42f8d0  // bfdot v16.4s, v6.8h, v2.h[2]\n"
+      ".inst 0x4f43f8d4  // bfdot v20.4s, v6.8h, v3.h[2]\n"
+      ".inst 0x4f44f8d8  // bfdot v24.4s, v6.8h, v4.h[2]\n"
+      ".inst 0x4f45f8dc  // bfdot v28.4s, v6.8h, v5.h[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      ".inst 0x4f40f8e9  // bfdot v9.4s, v7.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ed  // bfdot v13.4s, v7.8h, v1.h[2]\n"
+      ".inst 0x4f42f8f1  // bfdot v17.4s, v7.8h, v2.h[2]\n"
+      ".inst 0x4f43f8f5  // bfdot v21.4s, v7.8h, v3.h[2]\n"
+      ".inst 0x4f44f8f9  // bfdot v25.4s, v7.8h, v4.h[2]\n"
+      ".inst 0x4f45f8fd  // bfdot v29.4s, v7.8h, v5.h[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      ".inst 0x4f40f8ca  // bfdot v10.4s, v6.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ce  // bfdot v14.4s, v6.8h, v1.h[2]\n"
+      ".inst 0x4f42f8d2  // bfdot v18.4s, v6.8h, v2.h[2]\n"
+      ".inst 0x4f43f8d6  // bfdot v22.4s, v6.8h, v3.h[2]\n"
+      ".inst 0x4f44f8da  // bfdot v26.4s, v6.8h, v4.h[2]\n"
+      ".inst 0x4f45f8de  // bfdot v30.4s, v6.8h, v5.h[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      ".inst 0x4f40f8eb  // bfdot v11.4s, v7.8h, v0.h[2]\n"
+      ".inst 0x4f41f8ef  // bfdot v15.4s, v7.8h, v1.h[2]\n"
+      ".inst 0x4f42f8f3  // bfdot v19.4s, v7.8h, v2.h[2]\n"
+      ".inst 0x4f43f8f7  // bfdot v23.4s, v7.8h, v3.h[2]\n"
+      ".inst 0x4f44f8fb  // bfdot v27.4s, v7.8h, v4.h[2]\n"
+      ".inst 0x4f45f8ff  // bfdot v31.4s, v7.8h, v5.h[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      ".inst 0x4f60f8c8  // bfdot v8.4s, v6.8h, v0.h[3]\n"
+      ".inst 0x4f61f8cc  // bfdot v12.4s, v6.8h, v1.h[3]\n"
+      ".inst 0x4f62f8d0  // bfdot v16.4s, v6.8h, v2.h[3]\n"
+      ".inst 0x4f63f8d4  // bfdot v20.4s, v6.8h, v3.h[3]\n"
+      ".inst 0x4f64f8d8  // bfdot v24.4s, v6.8h, v4.h[3]\n"
+      ".inst 0x4f65f8dc  // bfdot v28.4s, v6.8h, v5.h[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      ".inst 0x4f60f8e9  // bfdot v9.4s, v7.8h, v0.h[3]\n"
+      ".inst 0x4f61f8ed  // bfdot v13.4s, v7.8h, v1.h[3]\n"
+      ".inst 0x4f62f8f1  // bfdot v17.4s, v7.8h, v2.h[3]\n"
+      ".inst 0x4f63f8f5  // bfdot v21.4s, v7.8h, v3.h[3]\n"
+      ".inst 0x4f64f8f9  // bfdot v25.4s, v7.8h, v4.h[3]\n"
+      ".inst 0x4f65f8fd  // bfdot v29.4s, v7.8h, v5.h[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      ".inst 0x4f60f8ca  // bfdot v10.4s, v6.8h, v0.h[3]\n"
+      "add x15, x15, #0x100\n"
+      ".inst 0x4f61f8ce  // bfdot v14.4s, v6.8h, v1.h[3]\n"
+      ".inst 0x4f62f8d2  // bfdot v18.4s, v6.8h, v2.h[3]\n"
+      ".inst 0x4f63f8d6  // bfdot v22.4s, v6.8h, v3.h[3]\n"
+      ".inst 0x4f64f8da  // bfdot v26.4s, v6.8h, v4.h[3]\n"
+      ".inst 0x4f65f8de  // bfdot v30.4s, v6.8h, v5.h[3]\n"
+      ".inst 0x4f60f8eb  // bfdot v11.4s, v7.8h, v0.h[3]\n"
+      ".inst 0x4f61f8ef  // bfdot v15.4s, v7.8h, v1.h[3]\n"
+      ".inst 0x4f62f8f3  // bfdot v19.4s, v7.8h, v2.h[3]\n"
+      ".inst 0x4f63f8f7  // bfdot v23.4s, v7.8h, v3.h[3]\n"
+      ".inst 0x4f64f8fb  // bfdot v27.4s, v7.8h, v4.h[3]\n"
+      ".inst 0x4f65f8ff  // bfdot v31.4s, v7.8h, v5.h[3]\n"
+      "206:"  // Height 6: Multiply loop: Main loop skip
+      "cbz x11, 211f\n"
+      "cmp x11, #0x2\n"
+      "blt 208f\n"
+      "207:"  // Height 6: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
+      "ldr s3, [x24], #0x4\n"
+      "ldr s4, [x22], #0x4\n"
+      "ldr s5, [x20], #0x4\n"
+      "ldr q6, [x15, #0x0]\n"
+      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
+      "sub x11, x11, #0x2\n"
+      ".inst 0x4f42f0d0  // bfdot v16.4s, v6.8h, v2.h[0]\n"
+      "cmp x11, #0x2\n"
+      ".inst 0x4f43f0d4  // bfdot v20.4s, v6.8h, v3.h[0]\n"
+      ".inst 0x4f44f0d8  // bfdot v24.4s, v6.8h, v4.h[0]\n"
+      ".inst 0x4f45f0dc  // bfdot v28.4s, v6.8h, v5.h[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f42f0f1  // bfdot v17.4s, v7.8h, v2.h[0]\n"
+      ".inst 0x4f43f0f5  // bfdot v21.4s, v7.8h, v3.h[0]\n"
+      ".inst 0x4f44f0f9  // bfdot v25.4s, v7.8h, v4.h[0]\n"
+      ".inst 0x4f45f0fd  // bfdot v29.4s, v7.8h, v5.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
+      "add x15, x15, #0x40\n"
+      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
+      ".inst 0x4f43f0d6  // bfdot v22.4s, v6.8h, v3.h[0]\n"
+      ".inst 0x4f44f0da  // bfdot v26.4s, v6.8h, v4.h[0]\n"
+      ".inst 0x4f45f0de  // bfdot v30.4s, v6.8h, v5.h[0]\n"
+      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
+      ".inst 0x4f43f0f7  // bfdot v23.4s, v7.8h, v3.h[0]\n"
+      ".inst 0x4f44f0fb  // bfdot v27.4s, v7.8h, v4.h[0]\n"
+      ".inst 0x4f45f0ff  // bfdot v31.4s, v7.8h, v5.h[0]\n"
+      "bge 207b\n"
+      "cbz x11, 211f\n"
+      "208:"  // Height 6: Multiply loop: Skip odd blocks
+      "tbz x11, #1, 209f\n"
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
+      "ldr s3, [x24], #0x4\n"
+      "ldr s4, [x22], #0x4\n"
+      "ldr s5, [x20], #0x4\n"
+      "tbz x11, #0, 210f\n"
+      "ld1 { v0.h }[2], [x10]\n"
+      "ld1 { v1.h }[2], [x28]\n"
+      "ld1 { v2.h }[2], [x26]\n"
+      "ld1 { v3.h }[2], [x24]\n"
+      "ld1 { v4.h }[2], [x22]\n"
+      "ld1 { v5.h }[2], [x20]\n"
+      "b 210f\n"
+      "209:"  // Height 6: Multiply loop: Ragged operand read: partial_1_0
+      "ldr h0, [x10, #0x0]\n"
+      "ldr h1, [x28, #0x0]\n"
+      "ldr h2, [x26, #0x0]\n"
+      "ldr h3, [x24, #0x0]\n"
+      "ldr h4, [x22, #0x0]\n"
+      "ldr h5, [x20, #0x0]\n"
+      "210:"  // Height 6: Multiply loop: Ragged operand read: Done
+      "ldr q6, [x15, #0x0]\n"
+      ".inst 0x4f40f0c8  // bfdot v8.4s, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      ".inst 0x4f41f0cc  // bfdot v12.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f42f0d0  // bfdot v16.4s, v6.8h, v2.h[0]\n"
+      ".inst 0x4f43f0d4  // bfdot v20.4s, v6.8h, v3.h[0]\n"
+      ".inst 0x4f44f0d8  // bfdot v24.4s, v6.8h, v4.h[0]\n"
+      ".inst 0x4f45f0dc  // bfdot v28.4s, v6.8h, v5.h[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      ".inst 0x4f40f0e9  // bfdot v9.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ed  // bfdot v13.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f42f0f1  // bfdot v17.4s, v7.8h, v2.h[0]\n"
+      ".inst 0x4f43f0f5  // bfdot v21.4s, v7.8h, v3.h[0]\n"
+      ".inst 0x4f44f0f9  // bfdot v25.4s, v7.8h, v4.h[0]\n"
+      ".inst 0x4f45f0fd  // bfdot v29.4s, v7.8h, v5.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      ".inst 0x4f40f0ca  // bfdot v10.4s, v6.8h, v0.h[0]\n"
+      "add x15, x15, #0x40\n"
+      ".inst 0x4f41f0ce  // bfdot v14.4s, v6.8h, v1.h[0]\n"
+      ".inst 0x4f42f0d2  // bfdot v18.4s, v6.8h, v2.h[0]\n"
+      ".inst 0x4f43f0d6  // bfdot v22.4s, v6.8h, v3.h[0]\n"
+      ".inst 0x4f44f0da  // bfdot v26.4s, v6.8h, v4.h[0]\n"
+      ".inst 0x4f45f0de  // bfdot v30.4s, v6.8h, v5.h[0]\n"
+      ".inst 0x4f40f0eb  // bfdot v11.4s, v7.8h, v0.h[0]\n"
+      ".inst 0x4f41f0ef  // bfdot v15.4s, v7.8h, v1.h[0]\n"
+      ".inst 0x4f42f0f3  // bfdot v19.4s, v7.8h, v2.h[0]\n"
+      ".inst 0x4f43f0f7  // bfdot v23.4s, v7.8h, v3.h[0]\n"
+      ".inst 0x4f44f0fb  // bfdot v27.4s, v7.8h, v4.h[0]\n"
+      ".inst 0x4f45f0ff  // bfdot v31.4s, v7.8h, v5.h[0]\n"
+      "211:"  // Height 6: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 201b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "prfm pstl1keep, [x21, #0x0]\n"
+      "tbz %x[flags], #1, 212f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v1.4s }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v0.4s }, [x19]\n"
+      "fmin v8.4s, v8.4s, v0.4s\n"
+      "fmin v9.4s, v9.4s, v0.4s\n"
+      "fmin v10.4s, v10.4s, v0.4s\n"
+      "fmin v11.4s, v11.4s, v0.4s\n"
+      "fmax v8.4s, v8.4s, v1.4s\n"
+      "fmax v9.4s, v9.4s, v1.4s\n"
+      "fmax v10.4s, v10.4s, v1.4s\n"
+      "fmax v11.4s, v11.4s, v1.4s\n"
+      "fmin v12.4s, v12.4s, v0.4s\n"
+      "fmin v13.4s, v13.4s, v0.4s\n"
+      "fmin v14.4s, v14.4s, v0.4s\n"
+      "fmax v12.4s, v12.4s, v1.4s\n"
+      "fmax v13.4s, v13.4s, v1.4s\n"
+      "fmax v14.4s, v14.4s, v1.4s\n"
+      "fmin v15.4s, v15.4s, v0.4s\n"
+      "fmin v16.4s, v16.4s, v0.4s\n"
+      "fmin v17.4s, v17.4s, v0.4s\n"
+      "fmax v15.4s, v15.4s, v1.4s\n"
+      "fmax v16.4s, v16.4s, v1.4s\n"
+      "fmax v17.4s, v17.4s, v1.4s\n"
+      "fmin v18.4s, v18.4s, v0.4s\n"
+      "fmin v19.4s, v19.4s, v0.4s\n"
+      "fmin v20.4s, v20.4s, v0.4s\n"
+      "fmax v18.4s, v18.4s, v1.4s\n"
+      "fmax v19.4s, v19.4s, v1.4s\n"
+      "fmax v20.4s, v20.4s, v1.4s\n"
+      "fmin v21.4s, v21.4s, v0.4s\n"
+      "fmin v22.4s, v22.4s, v0.4s\n"
+      "fmin v23.4s, v23.4s, v0.4s\n"
+      "fmax v21.4s, v21.4s, v1.4s\n"
+      "fmax v22.4s, v22.4s, v1.4s\n"
+      "fmax v23.4s, v23.4s, v1.4s\n"
+      "fmin v24.4s, v24.4s, v0.4s\n"
+      "fmin v25.4s, v25.4s, v0.4s\n"
+      "fmin v26.4s, v26.4s, v0.4s\n"
+      "fmax v24.4s, v24.4s, v1.4s\n"
+      "fmax v25.4s, v25.4s, v1.4s\n"
+      "fmax v26.4s, v26.4s, v1.4s\n"
+      "fmin v27.4s, v27.4s, v0.4s\n"
+      "fmin v28.4s, v28.4s, v0.4s\n"
+      "fmin v29.4s, v29.4s, v0.4s\n"
+      "fmax v27.4s, v27.4s, v1.4s\n"
+      "fmax v28.4s, v28.4s, v1.4s\n"
+      "fmax v29.4s, v29.4s, v1.4s\n"
+      "fmin v30.4s, v30.4s, v0.4s\n"
+      "fmin v31.4s, v31.4s, v0.4s\n"
+      "fmax v30.4s, v30.4s, v1.4s\n"
+      "fmax v31.4s, v31.4s, v1.4s\n"
+      "212:"  // Height 6: No activation
+      "cmp x16, #0x10\n"
+      "bge 221f\n"
+      "tbz x16, #3, 216f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v9.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v13.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "st1 { v17.4s }, [x27], #0x10\n"
+      "st1 { v20.4s }, [x25], #0x10\n"
+      "st1 { v21.4s }, [x25], #0x10\n"
+      "st1 { v24.4s }, [x23], #0x10\n"
+      "st1 { v25.4s }, [x23], #0x10\n"
+      "st1 { v28.4s }, [x21], #0x10\n"
+      "st1 { v29.4s }, [x21], #0x10\n"
+      "tbz x16, #2, 214f\n"
+      "st1 { v10.4s }, [x13], #0x10\n"
+      "st1 { v14.4s }, [x9], #0x10\n"
+      "st1 { v18.4s }, [x27], #0x10\n"
+      "st1 { v22.4s }, [x25], #0x10\n"
+      "st1 { v26.4s }, [x23], #0x10\n"
+      "st1 { v30.4s }, [x21], #0x10\n"
+      "tbz x16, #1, 213f\n"
+      "str d11, [x13], #0x8\n"
+      "str d15, [x9], #0x8\n"
+      "str d19, [x27], #0x8\n"
+      "str d23, [x25], #0x8\n"
+      "str d27, [x23], #0x8\n"
+      "str d31, [x21], #0x8\n"
+      "tbz x16, #0, 220f\n"
+      "st1 { v11.s }[2], [x13]\n"
+      "st1 { v15.s }[2], [x9]\n"
+      "st1 { v19.s }[2], [x27]\n"
+      "st1 { v23.s }[2], [x25]\n"
+      "st1 { v27.s }[2], [x23]\n"
+      "st1 { v31.s }[2], [x21]\n"
+      "b 220f\n"
+      "213:"  // Height 6: Partial direct writeback: partial_1_12
+      "tbz x16, #0, 220f\n"
+      "str s11, [x13, #0x0]\n"
+      "str s15, [x9, #0x0]\n"
+      "str s19, [x27, #0x0]\n"
+      "str s23, [x25, #0x0]\n"
+      "str s27, [x23, #0x0]\n"
+      "str s31, [x21, #0x0]\n"
+      "b 220f\n"
+      "214:"  // Height 6: Partial direct writeback: partial_2_8
+      "tbz x16, #1, 215f\n"
+      "str d10, [x13], #0x8\n"
+      "str d14, [x9], #0x8\n"
+      "str d18, [x27], #0x8\n"
+      "str d22, [x25], #0x8\n"
+      "str d26, [x23], #0x8\n"
+      "str d30, [x21], #0x8\n"
+      "tbz x16, #0, 220f\n"
+      "st1 { v10.s }[2], [x13]\n"
+      "st1 { v14.s }[2], [x9]\n"
+      "st1 { v18.s }[2], [x27]\n"
+      "st1 { v22.s }[2], [x25]\n"
+      "st1 { v26.s }[2], [x23]\n"
+      "st1 { v30.s }[2], [x21]\n"
+      "b 220f\n"
+      "215:"  // Height 6: Partial direct writeback: partial_1_8
+      "tbz x16, #0, 220f\n"
+      "str s10, [x13, #0x0]\n"
+      "str s14, [x9, #0x0]\n"
+      "str s18, [x27, #0x0]\n"
+      "str s22, [x25, #0x0]\n"
+      "str s26, [x23, #0x0]\n"
+      "str s30, [x21, #0x0]\n"
+      "b 220f\n"
+      "216:"  // Height 6: Partial direct writeback: partial_4_0
+      "tbz x16, #2, 218f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "st1 { v20.4s }, [x25], #0x10\n"
+      "st1 { v24.4s }, [x23], #0x10\n"
+      "st1 { v28.4s }, [x21], #0x10\n"
+      "tbz x16, #1, 217f\n"
+      "str d9, [x13], #0x8\n"
+      "str d13, [x9], #0x8\n"
+      "str d17, [x27], #0x8\n"
+      "str d21, [x25], #0x8\n"
+      "str d25, [x23], #0x8\n"
+      "str d29, [x21], #0x8\n"
+      "tbz x16, #0, 220f\n"
+      "st1 { v9.s }[2], [x13]\n"
+      "st1 { v13.s }[2], [x9]\n"
+      "st1 { v17.s }[2], [x27]\n"
+      "st1 { v21.s }[2], [x25]\n"
+      "st1 { v25.s }[2], [x23]\n"
+      "st1 { v29.s }[2], [x21]\n"
+      "b 220f\n"
+      "217:"  // Height 6: Partial direct writeback: partial_1_4
+      "tbz x16, #0, 220f\n"
+      "str s9, [x13, #0x0]\n"
+      "str s13, [x9, #0x0]\n"
+      "str s17, [x27, #0x0]\n"
+      "str s21, [x25, #0x0]\n"
+      "str s25, [x23, #0x0]\n"
+      "str s29, [x21, #0x0]\n"
+      "b 220f\n"
+      "218:"  // Height 6: Partial direct writeback: partial_2_0
+      "tbz x16, #1, 219f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "str d16, [x27], #0x8\n"
+      "str d20, [x25], #0x8\n"
+      "str d24, [x23], #0x8\n"
+      "str d28, [x21], #0x8\n"
+      "tbz x16, #0, 220f\n"
+      "st1 { v8.s }[2], [x13]\n"
+      "st1 { v12.s }[2], [x9]\n"
+      "st1 { v16.s }[2], [x27]\n"
+      "st1 { v20.s }[2], [x25]\n"
+      "st1 { v24.s }[2], [x23]\n"
+      "st1 { v28.s }[2], [x21]\n"
+      "b 220f\n"
+      "219:"  // Height 6: Partial direct writeback: partial_1_0
+      "str s8, [x13, #0x0]\n"
+      "str s12, [x9, #0x0]\n"
+      "str s16, [x27, #0x0]\n"
+      "str s20, [x25, #0x0]\n"
+      "str s24, [x23, #0x0]\n"
+      "str s28, [x21, #0x0]\n"
+      "220:"  // Height 6: Partial direct writeback: Done
+      "b 222f\n"
+      "221:"  // Height 6: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "str q12, [x9, #0x0]\n"
+      "str q13, [x9, #0x10]\n"
+      "str q14, [x9, #0x20]\n"
+      "str q15, [x9, #0x30]\n"
+      "str q16, [x27, #0x0]\n"
+      "str q17, [x27, #0x10]\n"
+      "str q18, [x27, #0x20]\n"
+      "str q19, [x27, #0x30]\n"
+      "str q20, [x25, #0x0]\n"
+      "str q21, [x25, #0x10]\n"
+      "str q22, [x25, #0x20]\n"
+      "str q23, [x25, #0x30]\n"
+      "str q24, [x23, #0x0]\n"
+      "str q25, [x23, #0x10]\n"
+      "str q26, [x23, #0x20]\n"
+      "str q27, [x23, #0x30]\n"
+      "str q28, [x21, #0x0]\n"
+      "str q29, [x21, #0x10]\n"
+      "str q30, [x21, #0x20]\n"
+      "str q31, [x21, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "add x9, x9, #0x40\n"
+      "add x27, x27, #0x40\n"
+      "add x25, x25, #0x40\n"
+      "add x23, x23, #0x40\n"
+      "add x21, x21, #0x40\n"
+      "222:"  // Height 6: Writeback done
+      "subs x16, x16, #0x10\n"
+      "bgt 188b\n"
+      "subs %x[M], %x[M], #0x6\n"
+      "beq 224f\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "tbz %x[flags], #3, 223f\n"
+      "add x20, x20, #0x6\n"
+      "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "b 1b\n"
+      "223:"  // Update direct input
+      "mov x19, #0xc\n"
+      "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+      "b 1b\n"
+      "224:"  // Exit
+
+      : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+      : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+    );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp
new file mode 100644
index 0000000000..46de98504e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+
+#include "../std_transforms_fixed.hpp"
+
+#define ARGLIST /* parameter types of the kernel entry point: num_strings, string_lengths, A_arg, M, N, B_ptr, output_arg, bias, act, accumulate */ \
+   unsigned int, const unsigned int *, \
+   IndirectInputArg<__fp16>, \
+   size_t, size_t, \
+   const __fp16 *, \
+   IndirectOutputArg<__fp16>, \
+   const __fp16 *, Activation, bool
+
+namespace arm_gemm
+{
+
+// Kernel entry point; the definition lives in a64_hybrid_fp16_mla_6x32/generic.cpp
+void a64_hybrid_fp16_mla_6x32( ARGLIST );
+
+// Describes the a64_hybrid_fp16_mla_6x32 kernel: element types, blocking parameters and entry point.
+class cls_a64_hybrid_fp16_mla_6x32
+{
+public:
+    typedef __fp16 operand_type;
+    typedef __fp16 result_type;
+
+    typedef void (*kern_type)( ARGLIST ); // function pointer type taking the ARGLIST parameter list
+
+    /* Kernel blocking parameters (fixed constants, so every accessor is constexpr) */
+    static constexpr unsigned int out_height()
+    {
+        return 6;
+    }
+
+    static constexpr unsigned int out_width()
+    {
+        return 32;
+    }
+
+    static constexpr unsigned int k_unroll()
+    {
+        return 1;
+    }
+
+    static constexpr bool supports_accumulate()
+    {
+        return true;
+    }
+
+    StdTransformsFixed<operand_type, result_type, 6, 32, 1> transforms = {}; // 6, 32, 1 repeat out_height(), out_width(), k_unroll()
+
+    // Default to the generic kernel
+    kern_type kernel=a64_hybrid_fp16_mla_6x32;
+
+    // The CPUInfo pointer is ignored: the constructor leaves `kernel` at the generic default.
+    cls_a64_hybrid_fp16_mla_6x32(const CPUInfo *) {}
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp
new file mode 100644
index 0000000000..ff6cbec200
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp
@@ -0,0 +1,5400 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_hybrid_fp16_mla_6x32 (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<__fp16> A_arg,
+    size_t M, size_t N, const __fp16 *B_ptr, IndirectOutputArg<__fp16> output_arg,
+    const __fp16 *bias, Activation act, bool accumulate
+)
+{
+    struct KernelArgs {
+        __fp16 maxval = static_cast<__fp16>(std::numeric_limits<float>::infinity());
+        __fp16 minval = - static_cast<__fp16>(std::numeric_limits<float>::infinity());
+        unsigned int num_strings = {};
+        const unsigned int *string_lengths = {};
+        size_t N = {};
+        const __fp16 *B_ptr = {};
+        size_t output_offset = {};
+        size_t input_initial_col = {};
+        size_t input_offset = {};
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    if (accumulate) {
+        flags |= 0x1;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    switch(act.type) {
+        default:
+        case Activation::Type::None:
+            break;
+        case Activation::Type::BoundedReLU:
+            ka.maxval = static_cast<__fp16>(act.param1);
+            /* fall through */
+        case Activation::Type::ReLU:
+            ka.minval = 0;
+            flags |= 0x2;
+            break;
+    }
+    __asm__ __volatile__(
+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+      ".arch  armv8.2-a+fp16\n"
+#endif
+      "1:"  // Row loop
+      "cmp %x[M], #0x6\n"
+      "bge 251f\n"
+      "cmp %x[M], #0x4\n"
+      "bgt 201f\n"
+      "beq 151f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 101f\n"
+      "beq 51f\n"
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 2f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #1\n"
+      "b 3f\n"
+      "2:"  // Height 1: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "3:"  // Height 1: Column loop
+      "cbz x14, 4f\n"
+      "ldr q8, [x14, #0x0]\n"
+      "ldr q9, [x14, #0x10]\n"
+      "ldr q10, [x14, #0x20]\n"
+      "ldr q11, [x14, #0x30]\n"
+      "add x14, x14, #0x40\n"
+      "b 23f\n"
+      "4:"  // Height 1: no bias
+      "tbz %x[flags], #0, 22f\n"
+      "cmp x16, #0x20\n"
+      "bge 21f\n"
+      "tbz x16, #4, 12f\n"
+      "ld1 { v8.8h }, [x13], #0x10\n"
+      "ld1 { v9.8h }, [x13], #0x10\n"
+      "tbz x16, #3, 8f\n"
+      "ld1 { v10.8h }, [x13], #0x10\n"
+      "tbz x16, #2, 6f\n"
+      "ldr d11, [x13], #0x8\n"
+      "tbz x16, #1, 5f\n"
+      "mov x19, #0x3c\n"
+      "ld1 { v11.s }[2], [x13], #0x4\n"
+      "tbz x16, #0, 20f\n"
+      "ld1 { v11.h }[6], [x13]\n"
+      "b 20f\n"
+      "5:"  // Height 1: Partial accumulate: partial_1_28
+      "mov x19, #0x38\n"
+      "tbz x16, #0, 20f\n"
+      "ld1 { v11.h }[4], [x13]\n"
+      "b 20f\n"
+      "6:"  // Height 1: Partial accumulate: partial_2_24
+      "tbz x16, #1, 7f\n"
+      "ldr s11, [x13], #0x4\n"
+      "mov x19, #0x34\n"
+      "tbz x16, #0, 20f\n"
+      "ld1 { v11.h }[2], [x13]\n"
+      "b 20f\n"
+      "7:"  // Height 1: Partial accumulate: partial_1_24
+      "mov x19, #0x30\n"
+      "tbz x16, #0, 20f\n"
+      "ldr h11, [x13, #0x0]\n"
+      "b 20f\n"
+      "8:"  // Height 1: Partial accumulate: partial_4_16
+      "tbz x16, #2, 10f\n"
+      "ldr d10, [x13], #0x8\n"
+      "tbz x16, #1, 9f\n"
+      "mov x19, #0x2c\n"
+      "ld1 { v10.s }[2], [x13], #0x4\n"
+      "tbz x16, #0, 20f\n"
+      "ld1 { v10.h }[6], [x13]\n"
+      "b 20f\n"
+      "9:"  // Height 1: Partial accumulate: partial_1_20
+      "mov x19, #0x28\n"
+      "tbz x16, #0, 20f\n"
+      "ld1 { v10.h }[4], [x13]\n"
+      "b 20f\n"
+      "10:"  // Height 1: Partial accumulate: partial_2_16
+      "tbz x16, #1, 11f\n"
+      "ldr s10, [x13], #0x4\n"
+      "mov x19, #0x24\n"
+      "tbz x16, #0, 20f\n"
+      "ld1 { v10.h }[2], [x13]\n"
+      "b 20f\n"
+      "11:"  // Height 1: Partial accumulate: partial_1_16
+      "mov x19, #0x20\n"
+      "tbz x16, #0, 20f\n"
+      "ldr h10, [x13, #0x0]\n"
+      "b 20f\n"
+      "12:"  // Height 1: Partial accumulate: partial_8_0
+      "tbz x16, #3, 16f\n"
+      "ld1 { v8.8h }, [x13], #0x10\n"
+      "tbz x16, #2, 14f\n"
+      "ldr d9, [x13], #0x8\n"
+      "tbz x16, #1, 13f\n"
+      "mov x19, #0x1c\n"
+      "ld1 { v9.s }[2], [x13], #0x4\n"
+      "tbz x16, #0, 20f\n"
+      "ld1 { v9.h }[6], [x13]\n"
+      "b 20f\n"
+      "13:"  // Height 1: Partial accumulate: partial_1_12
+      "mov x19, #0x18\n"
+      "tbz x16, #0, 20f\n"
+      "ld1 { v9.h }[4], [x13]\n"
+      "b 20f\n"
+      "14:"  // Height 1: Partial accumulate: partial_2_8
+      "tbz x16, #1, 15f\n"
+      "ldr s9, [x13], #0x4\n"
+      "mov x19, #0x14\n"
+      "tbz x16, #0, 20f\n"
+      "ld1 { v9.h }[2], [x13]\n"
+      "b 20f\n"
+      "15:"  // Height 1: Partial accumulate: partial_1_8
+      "mov x19, #0x10\n"
+      "tbz x16, #0, 20f\n"
+      "ldr h9, [x13, #0x0]\n"
+      "b 20f\n"
+      "16:"  // Height 1: Partial accumulate: partial_4_0
+      "tbz x16, #2, 18f\n"
+      "ldr d8, [x13], #0x8\n"
+      "tbz x16, #1, 17f\n"
+      "mov x19, #0xc\n"
+      "ld1 { v8.s }[2], [x13], #0x4\n"
+      "tbz x16, #0, 20f\n"
+      "ld1 { v8.h }[6], [x13]\n"
+      "b 20f\n"
+      "17:"  // Height 1: Partial accumulate: partial_1_4
+      "mov x19, #0x8\n"
+      "tbz x16, #0, 20f\n"
+      "ld1 { v8.h }[4], [x13]\n"
+      "b 20f\n"
+      "18:"  // Height 1: Partial accumulate: partial_2_0
+      "tbz x16, #1, 19f\n"
+      "ldr s8, [x13], #0x4\n"
+      "mov x19, #0x4\n"
+      "tbz x16, #0, 20f\n"
+      "ld1 { v8.h }[2], [x13]\n"
+      "b 20f\n"
+      "19:"  // Height 1: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr h8, [x13, #0x0]\n"
+      "20:"  // Height 1: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "b 23f\n"
+      "21:"  // Height 1: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "b 23f\n"
+      "22:"  // Height 1: no accumulate
+      "movi v8.16b, #0x0\n"
+      "movi v9.16b, #0x0\n"
+      "movi v10.16b, #0x0\n"
+      "movi v11.16b, #0x0\n"
+      "23:"  // Height 1: setup done
+      "mov x12, #0x0\n"
+      "24:"  // Height 1: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 25f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "cbnz x12, 26f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #1\n"
+      "b 26f\n"
+      "25:"  // Height 1: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "26:"  // Height 1: input setup done
+      "cmp x11, #0x8\n"
+      "blt 29f\n"
+      "cmp x11, #0x10\n"
+      "blt 28f\n"
+      "27:"  // Height 1: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.8h, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.8h, v7.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.8h, v6.8h, v0.h[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      "add x10, x10, #0x10\n"
+      "fmla v11.8h, v7.8h, v0.h[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "ldr q7, [x15, #0x50]\n"
+      "fmla v8.8h, v6.8h, v0.h[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      "sub x11, x11, #0x8\n"
+      "fmla v9.8h, v7.8h, v0.h[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      "cmp x11, #0x10\n"
+      "fmla v10.8h, v6.8h, v0.h[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      "fmla v11.8h, v7.8h, v0.h[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      "fmla v8.8h, v6.8h, v0.h[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      "fmla v9.8h, v7.8h, v0.h[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      "fmla v10.8h, v6.8h, v0.h[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      "fmla v11.8h, v7.8h, v0.h[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      "fmla v8.8h, v6.8h, v0.h[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      "fmla v9.8h, v7.8h, v0.h[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "fmla v10.8h, v6.8h, v0.h[3]\n"
+      "ldr q6, [x15, #0x100]\n"
+      "fmla v11.8h, v7.8h, v0.h[3]\n"
+      "ldr q7, [x15, #0x110]\n"
+      "fmla v8.8h, v6.8h, v0.h[4]\n"
+      "ldr q6, [x15, #0x120]\n"
+      "fmla v9.8h, v7.8h, v0.h[4]\n"
+      "ldr q7, [x15, #0x130]\n"
+      "fmla v10.8h, v6.8h, v0.h[4]\n"
+      "ldr q6, [x15, #0x140]\n"
+      "fmla v11.8h, v7.8h, v0.h[4]\n"
+      "ldr q7, [x15, #0x150]\n"
+      "fmla v8.8h, v6.8h, v0.h[5]\n"
+      "ldr q6, [x15, #0x160]\n"
+      "fmla v9.8h, v7.8h, v0.h[5]\n"
+      "ldr q7, [x15, #0x170]\n"
+      "fmla v10.8h, v6.8h, v0.h[5]\n"
+      "ldr q6, [x15, #0x180]\n"
+      "fmla v11.8h, v7.8h, v0.h[5]\n"
+      "ldr q7, [x15, #0x190]\n"
+      "fmla v8.8h, v6.8h, v0.h[6]\n"
+      "ldr q6, [x15, #0x1a0]\n"
+      "fmla v9.8h, v7.8h, v0.h[6]\n"
+      "ldr q7, [x15, #0x1b0]\n"
+      "fmla v10.8h, v6.8h, v0.h[6]\n"
+      "ldr q6, [x15, #0x1c0]\n"
+      "fmla v11.8h, v7.8h, v0.h[6]\n"
+      "ldr q7, [x15, #0x1d0]\n"
+      "fmla v8.8h, v6.8h, v0.h[7]\n"
+      "ldr q6, [x15, #0x1e0]\n"
+      "fmla v9.8h, v7.8h, v0.h[7]\n"
+      "ldr q7, [x15, #0x1f0]\n"
+      "add x15, x15, #0x200\n"
+      "fmla v10.8h, v6.8h, v0.h[7]\n"
+      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "bge 27b\n"
+      "28:"  // Height 1: Multiply loop: Single iteration only
+      "sub x11, x11, #0x8\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.8h, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.8h, v7.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.8h, v6.8h, v0.h[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      "add x10, x10, #0x10\n"
+      "fmla v11.8h, v7.8h, v0.h[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "ldr q7, [x15, #0x50]\n"
+      "fmla v8.8h, v6.8h, v0.h[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      "fmla v9.8h, v7.8h, v0.h[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      "fmla v10.8h, v6.8h, v0.h[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      "fmla v11.8h, v7.8h, v0.h[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      "fmla v8.8h, v6.8h, v0.h[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      "fmla v9.8h, v7.8h, v0.h[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      "fmla v10.8h, v6.8h, v0.h[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      "fmla v11.8h, v7.8h, v0.h[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      "fmla v8.8h, v6.8h, v0.h[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      "fmla v9.8h, v7.8h, v0.h[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "fmla v10.8h, v6.8h, v0.h[3]\n"
+      "ldr q6, [x15, #0x100]\n"
+      "fmla v11.8h, v7.8h, v0.h[3]\n"
+      "ldr q7, [x15, #0x110]\n"
+      "fmla v8.8h, v6.8h, v0.h[4]\n"
+      "ldr q6, [x15, #0x120]\n"
+      "fmla v9.8h, v7.8h, v0.h[4]\n"
+      "ldr q7, [x15, #0x130]\n"
+      "fmla v10.8h, v6.8h, v0.h[4]\n"
+      "ldr q6, [x15, #0x140]\n"
+      "fmla v11.8h, v7.8h, v0.h[4]\n"
+      "ldr q7, [x15, #0x150]\n"
+      "fmla v8.8h, v6.8h, v0.h[5]\n"
+      "ldr q6, [x15, #0x160]\n"
+      "fmla v9.8h, v7.8h, v0.h[5]\n"
+      "ldr q7, [x15, #0x170]\n"
+      "fmla v10.8h, v6.8h, v0.h[5]\n"
+      "ldr q6, [x15, #0x180]\n"
+      "fmla v11.8h, v7.8h, v0.h[5]\n"
+      "ldr q7, [x15, #0x190]\n"
+      "fmla v8.8h, v6.8h, v0.h[6]\n"
+      "ldr q6, [x15, #0x1a0]\n"
+      "fmla v9.8h, v7.8h, v0.h[6]\n"
+      "ldr q7, [x15, #0x1b0]\n"
+      "fmla v10.8h, v6.8h, v0.h[6]\n"
+      "ldr q6, [x15, #0x1c0]\n"
+      "fmla v11.8h, v7.8h, v0.h[6]\n"
+      "ldr q7, [x15, #0x1d0]\n"
+      "fmla v8.8h, v6.8h, v0.h[7]\n"
+      "ldr q6, [x15, #0x1e0]\n"
+      "fmla v9.8h, v7.8h, v0.h[7]\n"
+      "ldr q7, [x15, #0x1f0]\n"
+      "add x15, x15, #0x200\n"
+      "fmla v10.8h, v6.8h, v0.h[7]\n"
+      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "29:"  // Height 1: Multiply loop: Main loop skip
+      "cbz x11, 31f\n"
+      "30:"  // Height 1: Multiply loop: Odd block loop
+      "ldr h0, [x10], #0x2\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.8h, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.8h, v7.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.8h, v6.8h, v0.h[0]\n"
+      "sub x11, x11, #0x1\n"
+      "add x15, x15, #0x40\n"
+      "fmla v11.8h, v7.8h, v0.h[0]\n"
+      "cbnz x11, 30b\n"
+      "31:"  // Height 1: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 24b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "tbz %x[flags], #1, 32f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v1.8h }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v0.8h }, [x19]\n"
+      "fmin v8.8h, v8.8h, v0.8h\n"
+      "fmin v9.8h, v9.8h, v0.8h\n"
+      "fmin v10.8h, v10.8h, v0.8h\n"
+      "fmin v11.8h, v11.8h, v0.8h\n"
+      "fmax v8.8h, v8.8h, v1.8h\n"
+      "fmax v9.8h, v9.8h, v1.8h\n"
+      "fmax v10.8h, v10.8h, v1.8h\n"
+      "fmax v11.8h, v11.8h, v1.8h\n"
+      "32:"  // Height 1: No activation
+      "cmp x16, #0x20\n"
+      "bge 49f\n"
+      "tbz x16, #4, 40f\n"
+      "st1 { v8.8h }, [x13], #0x10\n"
+      "st1 { v9.8h }, [x13], #0x10\n"
+      "tbz x16, #3, 36f\n"
+      "st1 { v10.8h }, [x13], #0x10\n"
+      "tbz x16, #2, 34f\n"
+      "str d11, [x13], #0x8\n"
+      "tbz x16, #1, 33f\n"
+      "st1 { v11.s }[2], [x13], #0x4\n"
+      "tbz x16, #0, 48f\n"
+      "st1 { v11.h }[6], [x13]\n"
+      "b 48f\n"
+      "33:"  // Height 1: Partial direct writeback: partial_1_28
+      "tbz x16, #0, 48f\n"
+      "st1 { v11.h }[4], [x13]\n"
+      "b 48f\n"
+      "34:"  // Height 1: Partial direct writeback: partial_2_24
+      "tbz x16, #1, 35f\n"
+      "str s11, [x13], #0x4\n"
+      "tbz x16, #0, 48f\n"
+      "st1 { v11.h }[2], [x13]\n"
+      "b 48f\n"
+      "35:"  // Height 1: Partial direct writeback: partial_1_24
+      "tbz x16, #0, 48f\n"
+      "str h11, [x13, #0x0]\n"
+      "b 48f\n"
+      "36:"  // Height 1: Partial direct writeback: partial_4_16
+      "tbz x16, #2, 38f\n"
+      "str d10, [x13], #0x8\n"
+      "tbz x16, #1, 37f\n"
+      "st1 { v10.s }[2], [x13], #0x4\n"
+      "tbz x16, #0, 48f\n"
+      "st1 { v10.h }[6], [x13]\n"
+      "b 48f\n"
+      "37:"  // Height 1: Partial direct writeback: partial_1_20
+      "tbz x16, #0, 48f\n"
+      "st1 { v10.h }[4], [x13]\n"
+      "b 48f\n"
+      "38:"  // Height 1: Partial direct writeback: partial_2_16
+      "tbz x16, #1, 39f\n"
+      "str s10, [x13], #0x4\n"
+      "tbz x16, #0, 48f\n"
+      "st1 { v10.h }[2], [x13]\n"
+      "b 48f\n"
+      "39:"  // Height 1: Partial direct writeback: partial_1_16
+      "tbz x16, #0, 48f\n"
+      "str h10, [x13, #0x0]\n"
+      "b 48f\n"
+      "40:"  // Height 1: Partial direct writeback: partial_8_0
+      "tbz x16, #3, 44f\n"
+      "st1 { v8.8h }, [x13], #0x10\n"
+      "tbz x16, #2, 42f\n"
+      "str d9, [x13], #0x8\n"
+      "tbz x16, #1, 41f\n"
+      "st1 { v9.s }[2], [x13], #0x4\n"
+      "tbz x16, #0, 48f\n"
+      "st1 { v9.h }[6], [x13]\n"
+      "b 48f\n"
+      "41:"  // Height 1: Partial direct writeback: partial_1_12
+      "tbz x16, #0, 48f\n"
+      "st1 { v9.h }[4], [x13]\n"
+      "b 48f\n"
+      "42:"  // Height 1: Partial direct writeback: partial_2_8
+      "tbz x16, #1, 43f\n"
+      "str s9, [x13], #0x4\n"
+      "tbz x16, #0, 48f\n"
+      "st1 { v9.h }[2], [x13]\n"
+      "b 48f\n"
+      "43:"  // Height 1: Partial direct writeback: partial_1_8
+      "tbz x16, #0, 48f\n"
+      "str h9, [x13, #0x0]\n"
+      "b 48f\n"
+      "44:"  // Height 1: Partial direct writeback: partial_4_0
+      "tbz x16, #2, 46f\n"
+      "str d8, [x13], #0x8\n"
+      "tbz x16, #1, 45f\n"
+      "st1 { v8.s }[2], [x13], #0x4\n"
+      "tbz x16, #0, 48f\n"
+      "st1 { v8.h }[6], [x13]\n"
+      "b 48f\n"
+      "45:"  // Height 1: Partial direct writeback: partial_1_4
+      "tbz x16, #0, 48f\n"
+      "st1 { v8.h }[4], [x13]\n"
+      "b 48f\n"
+      "46:"  // Height 1: Partial direct writeback: partial_2_0
+      "tbz x16, #1, 47f\n"
+      "str s8, [x13], #0x4\n"
+      "tbz x16, #0, 48f\n"
+      "st1 { v8.h }[2], [x13]\n"
+      "b 48f\n"
+      "47:"  // Height 1: Partial direct writeback: partial_1_0
+      "str h8, [x13, #0x0]\n"
+      "48:"  // Height 1: Partial direct writeback: Done
+      "b 50f\n"
+      "49:"  // Height 1: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "50:"  // Height 1: Writeback done
+      "subs x16, x16, #0x20\n"
+      "bgt 3b\n"
+      "b 302f\n"
+      "51:"  // Height 2
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 52f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #1\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "add x9, x9, x19, LSL #1\n"
+      "b 53f\n"
+      "52:"  // Height 2: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #1\n"
+      "53:"  // Height 2: Column loop
+      "cbz x14, 54f\n"
+      "ldr q8, [x14, #0x0]\n"
+      "mov v12.16b, v8.16b\n"
+      "ldr q9, [x14, #0x10]\n"
+      "ldr q10, [x14, #0x20]\n"
+      "mov v13.16b, v9.16b\n"
+      "ldr q11, [x14, #0x30]\n"
+      "mov v14.16b, v10.16b\n"
+      "add x14, x14, #0x40\n"
+      "mov v15.16b, v11.16b\n"
+      "b 73f\n"
+      "54:"  // Height 2: no bias
+      "tbz %x[flags], #0, 72f\n"
+      "cmp x16, #0x20\n"
+      "bge 71f\n"
+      "tbz x16, #4, 62f\n"
+      "ld1 { v8.8h }, [x13], #0x10\n"
+      "ld1 { v12.8h }, [x9], #0x10\n"
+      "ld1 { v9.8h }, [x13], #0x10\n"
+      "ld1 { v13.8h }, [x9], #0x10\n"
+      "tbz x16, #3, 58f\n"
+      "ld1 { v10.8h }, [x13], #0x10\n"
+      "ld1 { v14.8h }, [x9], #0x10\n"
+      "tbz x16, #2, 56f\n"
+      "ldr d11, [x13], #0x8\n"
+      "ldr d15, [x9], #0x8\n"
+      "tbz x16, #1, 55f\n"
+      "mov x19, #0x3c\n"
+      "ld1 { v11.s }[2], [x13], #0x4\n"
+      "ld1 { v15.s }[2], [x9], #0x4\n"
+      "tbz x16, #0, 70f\n"
+      "ld1 { v11.h }[6], [x13]\n"
+      "ld1 { v15.h }[6], [x9]\n"
+      "b 70f\n"
+      "55:"  // Height 2: Partial accumulate: partial_1_28
+      "mov x19, #0x38\n"
+      "tbz x16, #0, 70f\n"
+      "ld1 { v11.h }[4], [x13]\n"
+      "ld1 { v15.h }[4], [x9]\n"
+      "b 70f\n"
+      "56:"  // Height 2: Partial accumulate: partial_2_24
+      "tbz x16, #1, 57f\n"
+      "ldr s11, [x13], #0x4\n"
+      "ldr s15, [x9], #0x4\n"
+      "mov x19, #0x34\n"
+      "tbz x16, #0, 70f\n"
+      "ld1 { v11.h }[2], [x13]\n"
+      "ld1 { v15.h }[2], [x9]\n"
+      "b 70f\n"
+      "57:"  // Height 2: Partial accumulate: partial_1_24
+      "mov x19, #0x30\n"
+      "tbz x16, #0, 70f\n"
+      "ldr h11, [x13, #0x0]\n"
+      "ldr h15, [x9, #0x0]\n"
+      "b 70f\n"
+      "58:"  // Height 2: Partial accumulate: partial_4_16
+      "tbz x16, #2, 60f\n"
+      "ldr d10, [x13], #0x8\n"
+      "ldr d14, [x9], #0x8\n"
+      "tbz x16, #1, 59f\n"
+      "mov x19, #0x2c\n"
+      "ld1 { v10.s }[2], [x13], #0x4\n"
+      "ld1 { v14.s }[2], [x9], #0x4\n"
+      "tbz x16, #0, 70f\n"
+      "ld1 { v10.h }[6], [x13]\n"
+      "ld1 { v14.h }[6], [x9]\n"
+      "b 70f\n"
+      "59:"  // Height 2: Partial accumulate: partial_1_20
+      "mov x19, #0x28\n"
+      "tbz x16, #0, 70f\n"
+      "ld1 { v10.h }[4], [x13]\n"
+      "ld1 { v14.h }[4], [x9]\n"
+      "b 70f\n"
+      "60:"  // Height 2: Partial accumulate: partial_2_16
+      "tbz x16, #1, 61f\n"
+      "ldr s10, [x13], #0x4\n"
+      "ldr s14, [x9], #0x4\n"
+      "mov x19, #0x24\n"
+      "tbz x16, #0, 70f\n"
+      "ld1 { v10.h }[2], [x13]\n"
+      "ld1 { v14.h }[2], [x9]\n"
+      "b 70f\n"
+      "61:"  // Height 2: Partial accumulate: partial_1_16
+      "mov x19, #0x20\n"
+      "tbz x16, #0, 70f\n"
+      "ldr h10, [x13, #0x0]\n"
+      "ldr h14, [x9, #0x0]\n"
+      "b 70f\n"
+      "62:"  // Height 2: Partial accumulate: partial_8_0
+      "tbz x16, #3, 66f\n"
+      "ld1 { v8.8h }, [x13], #0x10\n"
+      "ld1 { v12.8h }, [x9], #0x10\n"
+      "tbz x16, #2, 64f\n"
+      "ldr d9, [x13], #0x8\n"
+      "ldr d13, [x9], #0x8\n"
+      "tbz x16, #1, 63f\n"
+      "mov x19, #0x1c\n"
+      "ld1 { v9.s }[2], [x13], #0x4\n"
+      "ld1 { v13.s }[2], [x9], #0x4\n"
+      "tbz x16, #0, 70f\n"
+      "ld1 { v9.h }[6], [x13]\n"
+      "ld1 { v13.h }[6], [x9]\n"
+      "b 70f\n"
+      "63:"  // Height 2: Partial accumulate: partial_1_12
+      "mov x19, #0x18\n"
+      "tbz x16, #0, 70f\n"
+      "ld1 { v9.h }[4], [x13]\n"
+      "ld1 { v13.h }[4], [x9]\n"
+      "b 70f\n"
+      "64:"  // Height 2: Partial accumulate: partial_2_8
+      "tbz x16, #1, 65f\n"
+      "ldr s9, [x13], #0x4\n"
+      "ldr s13, [x9], #0x4\n"
+      "mov x19, #0x14\n"
+      "tbz x16, #0, 70f\n"
+      "ld1 { v9.h }[2], [x13]\n"
+      "ld1 { v13.h }[2], [x9]\n"
+      "b 70f\n"
+      "65:"  // Height 2: Partial accumulate: partial_1_8
+      "mov x19, #0x10\n"
+      "tbz x16, #0, 70f\n"
+      "ldr h9, [x13, #0x0]\n"
+      "ldr h13, [x9, #0x0]\n"
+      "b 70f\n"
+      "66:"  // Height 2: Partial accumulate: partial_4_0
+      "tbz x16, #2, 68f\n"
+      "ldr d8, [x13], #0x8\n"
+      "ldr d12, [x9], #0x8\n"
+      "tbz x16, #1, 67f\n"
+      "mov x19, #0xc\n"
+      "ld1 { v8.s }[2], [x13], #0x4\n"
+      "ld1 { v12.s }[2], [x9], #0x4\n"
+      "tbz x16, #0, 70f\n"
+      "ld1 { v8.h }[6], [x13]\n"
+      "ld1 { v12.h }[6], [x9]\n"
+      "b 70f\n"
+      "67:"  // Height 2: Partial accumulate: partial_1_4
+      "mov x19, #0x8\n"
+      "tbz x16, #0, 70f\n"
+      "ld1 { v8.h }[4], [x13]\n"
+      "ld1 { v12.h }[4], [x9]\n"
+      "b 70f\n"
+      "68:"  // Height 2: Partial accumulate: partial_2_0
+      "tbz x16, #1, 69f\n"
+      "ldr s8, [x13], #0x4\n"
+      "ldr s12, [x9], #0x4\n"
+      "mov x19, #0x4\n"
+      "tbz x16, #0, 70f\n"
+      "ld1 { v8.h }[2], [x13]\n"
+      "ld1 { v12.h }[2], [x9]\n"
+      "b 70f\n"
+      "69:"  // Height 2: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr h8, [x13, #0x0]\n"
+      "ldr h12, [x9, #0x0]\n"
+      "70:"  // Height 2: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "sub x9, x9, x19\n"
+      "b 73f\n"
+      "71:"  // Height 2: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "ldr q12, [x9, #0x0]\n"
+      "ldr q13, [x9, #0x10]\n"
+      "ldr q14, [x9, #0x20]\n"
+      "ldr q15, [x9, #0x30]\n"
+      "b 73f\n"
+      "72:"  // Height 2: no accumulate
+      "movi v8.16b, #0x0\n"
+      "movi v9.16b, #0x0\n"
+      "movi v10.16b, #0x0\n"
+      "movi v11.16b, #0x0\n"
+      "movi v12.16b, #0x0\n"
+      "movi v13.16b, #0x0\n"
+      "movi v14.16b, #0x0\n"
+      "movi v15.16b, #0x0\n"
+      "73:"  // Height 2: setup done
+      "mov x12, #0x0\n"
+      "74:"  // Height 2: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 75f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "cbnz x12, 76f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #1\n"
+      "add x28, x28, x19, LSL #1\n"
+      "b 76f\n"
+      "75:"  // Height 2: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #1\n"
+      "76:"  // Height 2: input setup done
+      "cmp x11, #0x8\n"
+      "blt 79f\n"
+      "cmp x11, #0x10\n"
+      "blt 78f\n"
+      "77:"  // Height 2: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.8h, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.8h, v6.8h, v1.h[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "add x10, x10, #0x10\n"
+      "fmla v9.8h, v7.8h, v0.h[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x28, x28, #0x10\n"
+      "fmla v13.8h, v7.8h, v1.h[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.8h, v6.8h, v0.h[0]\n"
+      "sub x11, x11, #0x8\n"
+      "fmla v14.8h, v6.8h, v1.h[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      "cmp x11, #0x10\n"
+      "fmla v11.8h, v7.8h, v0.h[0]\n"
+      "fmla v15.8h, v7.8h, v1.h[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      "fmla v8.8h, v6.8h, v0.h[1]\n"
+      "fmla v12.8h, v6.8h, v1.h[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      "fmla v9.8h, v7.8h, v0.h[1]\n"
+      "fmla v13.8h, v7.8h, v1.h[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      "fmla v10.8h, v6.8h, v0.h[1]\n"
+      "fmla v14.8h, v6.8h, v1.h[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      "fmla v11.8h, v7.8h, v0.h[1]\n"
+      "fmla v15.8h, v7.8h, v1.h[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      "fmla v8.8h, v6.8h, v0.h[2]\n"
+      "fmla v12.8h, v6.8h, v1.h[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      "fmla v9.8h, v7.8h, v0.h[2]\n"
+      "fmla v13.8h, v7.8h, v1.h[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      "fmla v10.8h, v6.8h, v0.h[2]\n"
+      "fmla v14.8h, v6.8h, v1.h[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      "fmla v11.8h, v7.8h, v0.h[2]\n"
+      "fmla v15.8h, v7.8h, v1.h[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      "fmla v8.8h, v6.8h, v0.h[3]\n"
+      "fmla v12.8h, v6.8h, v1.h[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      "fmla v9.8h, v7.8h, v0.h[3]\n"
+      "fmla v13.8h, v7.8h, v1.h[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "fmla v10.8h, v6.8h, v0.h[3]\n"
+      "fmla v14.8h, v6.8h, v1.h[3]\n"
+      "ldr q6, [x15, #0x100]\n"
+      "fmla v11.8h, v7.8h, v0.h[3]\n"
+      "fmla v15.8h, v7.8h, v1.h[3]\n"
+      "ldr q7, [x15, #0x110]\n"
+      "fmla v8.8h, v6.8h, v0.h[4]\n"
+      "fmla v12.8h, v6.8h, v1.h[4]\n"
+      "ldr q6, [x15, #0x120]\n"
+      "fmla v9.8h, v7.8h, v0.h[4]\n"
+      "fmla v13.8h, v7.8h, v1.h[4]\n"
+      "ldr q7, [x15, #0x130]\n"
+      "fmla v10.8h, v6.8h, v0.h[4]\n"
+      "fmla v14.8h, v6.8h, v1.h[4]\n"
+      "ldr q6, [x15, #0x140]\n"
+      "fmla v11.8h, v7.8h, v0.h[4]\n"
+      "fmla v15.8h, v7.8h, v1.h[4]\n"
+      "ldr q7, [x15, #0x150]\n"
+      "fmla v8.8h, v6.8h, v0.h[5]\n"
+      "fmla v12.8h, v6.8h, v1.h[5]\n"
+      "ldr q6, [x15, #0x160]\n"
+      "fmla v9.8h, v7.8h, v0.h[5]\n"
+      "fmla v13.8h, v7.8h, v1.h[5]\n"
+      "ldr q7, [x15, #0x170]\n"
+      "fmla v10.8h, v6.8h, v0.h[5]\n"
+      "fmla v14.8h, v6.8h, v1.h[5]\n"
+      "ldr q6, [x15, #0x180]\n"
+      "fmla v11.8h, v7.8h, v0.h[5]\n"
+      "fmla v15.8h, v7.8h, v1.h[5]\n"
+      "ldr q7, [x15, #0x190]\n"
+      "fmla v8.8h, v6.8h, v0.h[6]\n"
+      "fmla v12.8h, v6.8h, v1.h[6]\n"
+      "ldr q6, [x15, #0x1a0]\n"
+      "fmla v9.8h, v7.8h, v0.h[6]\n"
+      "fmla v13.8h, v7.8h, v1.h[6]\n"
+      "ldr q7, [x15, #0x1b0]\n"
+      "fmla v10.8h, v6.8h, v0.h[6]\n"
+      "fmla v14.8h, v6.8h, v1.h[6]\n"
+      "ldr q6, [x15, #0x1c0]\n"
+      "fmla v11.8h, v7.8h, v0.h[6]\n"
+      "fmla v15.8h, v7.8h, v1.h[6]\n"
+      "ldr q7, [x15, #0x1d0]\n"
+      "fmla v8.8h, v6.8h, v0.h[7]\n"
+      "fmla v12.8h, v6.8h, v1.h[7]\n"
+      "ldr q6, [x15, #0x1e0]\n"
+      "fmla v9.8h, v7.8h, v0.h[7]\n"
+      "fmla v13.8h, v7.8h, v1.h[7]\n"
+      "ldr q7, [x15, #0x1f0]\n"
+      "add x15, x15, #0x200\n"
+      "fmla v10.8h, v6.8h, v0.h[7]\n"
+      "fmla v14.8h, v6.8h, v1.h[7]\n"
+      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "fmla v15.8h, v7.8h, v1.h[7]\n"
+      "bge 77b\n"
+      "78:"  // Height 2: Multiply loop: Single iteration only
+      "sub x11, x11, #0x8\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.8h, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.8h, v6.8h, v1.h[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "add x10, x10, #0x10\n"
+      "fmla v9.8h, v7.8h, v0.h[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x28, x28, #0x10\n"
+      "fmla v13.8h, v7.8h, v1.h[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.8h, v6.8h, v0.h[0]\n"
+      "fmla v14.8h, v6.8h, v1.h[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      "fmla v11.8h, v7.8h, v0.h[0]\n"
+      "fmla v15.8h, v7.8h, v1.h[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      "fmla v8.8h, v6.8h, v0.h[1]\n"
+      "fmla v12.8h, v6.8h, v1.h[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      "fmla v9.8h, v7.8h, v0.h[1]\n"
+      "fmla v13.8h, v7.8h, v1.h[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      "fmla v10.8h, v6.8h, v0.h[1]\n"
+      "fmla v14.8h, v6.8h, v1.h[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      "fmla v11.8h, v7.8h, v0.h[1]\n"
+      "fmla v15.8h, v7.8h, v1.h[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      "fmla v8.8h, v6.8h, v0.h[2]\n"
+      "fmla v12.8h, v6.8h, v1.h[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      "fmla v9.8h, v7.8h, v0.h[2]\n"
+      "fmla v13.8h, v7.8h, v1.h[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      "fmla v10.8h, v6.8h, v0.h[2]\n"
+      "fmla v14.8h, v6.8h, v1.h[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      "fmla v11.8h, v7.8h, v0.h[2]\n"
+      "fmla v15.8h, v7.8h, v1.h[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      "fmla v8.8h, v6.8h, v0.h[3]\n"
+      "fmla v12.8h, v6.8h, v1.h[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      "fmla v9.8h, v7.8h, v0.h[3]\n"
+      "fmla v13.8h, v7.8h, v1.h[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "fmla v10.8h, v6.8h, v0.h[3]\n"
+      "fmla v14.8h, v6.8h, v1.h[3]\n"
+      "ldr q6, [x15, #0x100]\n"
+      "fmla v11.8h, v7.8h, v0.h[3]\n"
+      "fmla v15.8h, v7.8h, v1.h[3]\n"
+      "ldr q7, [x15, #0x110]\n"
+      "fmla v8.8h, v6.8h, v0.h[4]\n"
+      "fmla v12.8h, v6.8h, v1.h[4]\n"
+      "ldr q6, [x15, #0x120]\n"
+      "fmla v9.8h, v7.8h, v0.h[4]\n"
+      "fmla v13.8h, v7.8h, v1.h[4]\n"
+      "ldr q7, [x15, #0x130]\n"
+      "fmla v10.8h, v6.8h, v0.h[4]\n"
+      "fmla v14.8h, v6.8h, v1.h[4]\n"
+      "ldr q6, [x15, #0x140]\n"
+      "fmla v11.8h, v7.8h, v0.h[4]\n"
+      "fmla v15.8h, v7.8h, v1.h[4]\n"
+      "ldr q7, [x15, #0x150]\n"
+      "fmla v8.8h, v6.8h, v0.h[5]\n"
+      "fmla v12.8h, v6.8h, v1.h[5]\n"
+      "ldr q6, [x15, #0x160]\n"
+      "fmla v9.8h, v7.8h, v0.h[5]\n"
+      "fmla v13.8h, v7.8h, v1.h[5]\n"
+      "ldr q7, [x15, #0x170]\n"
+      "fmla v10.8h, v6.8h, v0.h[5]\n"
+      "fmla v14.8h, v6.8h, v1.h[5]\n"
+      "ldr q6, [x15, #0x180]\n"
+      "fmla v11.8h, v7.8h, v0.h[5]\n"
+      "fmla v15.8h, v7.8h, v1.h[5]\n"
+      "ldr q7, [x15, #0x190]\n"
+      "fmla v8.8h, v6.8h, v0.h[6]\n"
+      "fmla v12.8h, v6.8h, v1.h[6]\n"
+      "ldr q6, [x15, #0x1a0]\n"
+      "fmla v9.8h, v7.8h, v0.h[6]\n"
+      "fmla v13.8h, v7.8h, v1.h[6]\n"
+      "ldr q7, [x15, #0x1b0]\n"
+      "fmla v10.8h, v6.8h, v0.h[6]\n"
+      "fmla v14.8h, v6.8h, v1.h[6]\n"
+      "ldr q6, [x15, #0x1c0]\n"
+      "fmla v11.8h, v7.8h, v0.h[6]\n"
+      "fmla v15.8h, v7.8h, v1.h[6]\n"
+      "ldr q7, [x15, #0x1d0]\n"
+      "fmla v8.8h, v6.8h, v0.h[7]\n"
+      "fmla v12.8h, v6.8h, v1.h[7]\n"
+      "ldr q6, [x15, #0x1e0]\n"
+      "fmla v9.8h, v7.8h, v0.h[7]\n"
+      "fmla v13.8h, v7.8h, v1.h[7]\n"
+      "ldr q7, [x15, #0x1f0]\n"
+      "add x15, x15, #0x200\n"
+      "fmla v10.8h, v6.8h, v0.h[7]\n"
+      "fmla v14.8h, v6.8h, v1.h[7]\n"
+      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "fmla v15.8h, v7.8h, v1.h[7]\n"
+      "79:"  // Height 2: Multiply loop: Main loop skip
+      "cbz x11, 81f\n"
+      "80:"  // Height 2: Multiply loop: Odd block loop
+      "ldr h0, [x10], #0x2\n"
+      "ldr h1, [x28], #0x2\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.8h, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.8h, v6.8h, v1.h[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "sub x11, x11, #0x1\n"
+      "fmla v9.8h, v7.8h, v0.h[0]\n"
+      "fmla v13.8h, v7.8h, v1.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.8h, v6.8h, v0.h[0]\n"
+      "add x15, x15, #0x40\n"
+      "fmla v14.8h, v6.8h, v1.h[0]\n"
+      "fmla v11.8h, v7.8h, v0.h[0]\n"
+      "fmla v15.8h, v7.8h, v1.h[0]\n"
+      "cbnz x11, 80b\n"
+      "81:"  // Height 2: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 74b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "tbz %x[flags], #1, 82f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v1.8h }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v0.8h }, [x19]\n"
+      "fmin v8.8h, v8.8h, v0.8h\n"
+      "fmin v9.8h, v9.8h, v0.8h\n"
+      "fmin v10.8h, v10.8h, v0.8h\n"
+      "fmin v11.8h, v11.8h, v0.8h\n"
+      "fmax v8.8h, v8.8h, v1.8h\n"
+      "fmax v9.8h, v9.8h, v1.8h\n"
+      "fmax v10.8h, v10.8h, v1.8h\n"
+      "fmax v11.8h, v11.8h, v1.8h\n"
+      "fmin v12.8h, v12.8h, v0.8h\n"
+      "fmin v13.8h, v13.8h, v0.8h\n"
+      "fmin v14.8h, v14.8h, v0.8h\n"
+      "fmax v12.8h, v12.8h, v1.8h\n"
+      "fmax v13.8h, v13.8h, v1.8h\n"
+      "fmax v14.8h, v14.8h, v1.8h\n"
+      "fmin v15.8h, v15.8h, v0.8h\n"
+      "fmax v15.8h, v15.8h, v1.8h\n"
+      "82:"  // Height 2: No activation
+      "cmp x16, #0x20\n"
+      "bge 99f\n"
+      "tbz x16, #4, 90f\n"
+      "st1 { v8.8h }, [x13], #0x10\n"
+      "st1 { v9.8h }, [x13], #0x10\n"
+      "st1 { v12.8h }, [x9], #0x10\n"
+      "st1 { v13.8h }, [x9], #0x10\n"
+      "tbz x16, #3, 86f\n"
+      "st1 { v10.8h }, [x13], #0x10\n"
+      "st1 { v14.8h }, [x9], #0x10\n"
+      "tbz x16, #2, 84f\n"
+      "str d11, [x13], #0x8\n"
+      "str d15, [x9], #0x8\n"
+      "tbz x16, #1, 83f\n"
+      "st1 { v11.s }[2], [x13], #0x4\n"
+      "st1 { v15.s }[2], [x9], #0x4\n"
+      "tbz x16, #0, 98f\n"
+      "st1 { v11.h }[6], [x13]\n"
+      "st1 { v15.h }[6], [x9]\n"
+      "b 98f\n"
+      "83:"  // Height 2: Partial direct writeback: partial_1_28
+      "tbz x16, #0, 98f\n"
+      "st1 { v11.h }[4], [x13]\n"
+      "st1 { v15.h }[4], [x9]\n"
+      "b 98f\n"
+      "84:"  // Height 2: Partial direct writeback: partial_2_24
+      "tbz x16, #1, 85f\n"
+      "str s11, [x13], #0x4\n"
+      "str s15, [x9], #0x4\n"
+      "tbz x16, #0, 98f\n"
+      "st1 { v11.h }[2], [x13]\n"
+      "st1 { v15.h }[2], [x9]\n"
+      "b 98f\n"
+      "85:"  // Height 2: Partial direct writeback: partial_1_24
+      "tbz x16, #0, 98f\n"
+      "str h11, [x13, #0x0]\n"
+      "str h15, [x9, #0x0]\n"
+      "b 98f\n"
+      "86:"  // Height 2: Partial direct writeback: partial_4_16
+      "tbz x16, #2, 88f\n"
+      "str d10, [x13], #0x8\n"
+      "str d14, [x9], #0x8\n"
+      "tbz x16, #1, 87f\n"
+      "st1 { v10.s }[2], [x13], #0x4\n"
+      "st1 { v14.s }[2], [x9], #0x4\n"
+      "tbz x16, #0, 98f\n"
+      "st1 { v10.h }[6], [x13]\n"
+      "st1 { v14.h }[6], [x9]\n"
+      "b 98f\n"
+      "87:"  // Height 2: Partial direct writeback: partial_1_20
+      "tbz x16, #0, 98f\n"
+      "st1 { v10.h }[4], [x13]\n"
+      "st1 { v14.h }[4], [x9]\n"
+      "b 98f\n"
+      "88:"  // Height 2: Partial direct writeback: partial_2_16
+      "tbz x16, #1, 89f\n"
+      "str s10, [x13], #0x4\n"
+      "str s14, [x9], #0x4\n"
+      "tbz x16, #0, 98f\n"
+      "st1 { v10.h }[2], [x13]\n"
+      "st1 { v14.h }[2], [x9]\n"
+      "b 98f\n"
+      "89:"  // Height 2: Partial direct writeback: partial_1_16
+      "tbz x16, #0, 98f\n"
+      "str h10, [x13, #0x0]\n"
+      "str h14, [x9, #0x0]\n"
+      "b 98f\n"
+      "90:"  // Height 2: Partial direct writeback: partial_8_0
+      "tbz x16, #3, 94f\n"
+      "st1 { v8.8h }, [x13], #0x10\n"
+      "st1 { v12.8h }, [x9], #0x10\n"
+      "tbz x16, #2, 92f\n"
+      "str d9, [x13], #0x8\n"
+      "str d13, [x9], #0x8\n"
+      "tbz x16, #1, 91f\n"
+      "st1 { v9.s }[2], [x13], #0x4\n"
+      "st1 { v13.s }[2], [x9], #0x4\n"
+      "tbz x16, #0, 98f\n"
+      "st1 { v9.h }[6], [x13]\n"
+      "st1 { v13.h }[6], [x9]\n"
+      "b 98f\n"
+      "91:"  // Height 2: Partial direct writeback: partial_1_12
+      "tbz x16, #0, 98f\n"
+      "st1 { v9.h }[4], [x13]\n"
+      "st1 { v13.h }[4], [x9]\n"
+      "b 98f\n"
+      "92:"  // Height 2: Partial direct writeback: partial_2_8
+      "tbz x16, #1, 93f\n"
+      "str s9, [x13], #0x4\n"
+      "str s13, [x9], #0x4\n"
+      "tbz x16, #0, 98f\n"
+      "st1 { v9.h }[2], [x13]\n"
+      "st1 { v13.h }[2], [x9]\n"
+      "b 98f\n"
+      "93:"  // Height 2: Partial direct writeback: partial_1_8
+      "tbz x16, #0, 98f\n"
+      "str h9, [x13, #0x0]\n"
+      "str h13, [x9, #0x0]\n"
+      "b 98f\n"
+      "94:"  // Height 2: Partial direct writeback: partial_4_0
+      "tbz x16, #2, 96f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "tbz x16, #1, 95f\n"
+      "st1 { v8.s }[2], [x13], #0x4\n"
+      "st1 { v12.s }[2], [x9], #0x4\n"
+      "tbz x16, #0, 98f\n"
+      "st1 { v8.h }[6], [x13]\n"
+      "st1 { v12.h }[6], [x9]\n"
+      "b 98f\n"
+      "95:"  // Height 2: Partial direct writeback: partial_1_4
+      "tbz x16, #0, 98f\n"
+      "st1 { v8.h }[4], [x13]\n"
+      "st1 { v12.h }[4], [x9]\n"
+      "b 98f\n"
+      "96:"  // Height 2: Partial direct writeback: partial_2_0
+      "tbz x16, #1, 97f\n"
+      "str s8, [x13], #0x4\n"
+      "str s12, [x9], #0x4\n"
+      "tbz x16, #0, 98f\n"
+      "st1 { v8.h }[2], [x13]\n"
+      "st1 { v12.h }[2], [x9]\n"
+      "b 98f\n"
+      "97:"  // Height 2: Partial direct writeback: partial_1_0
+      "str h8, [x13, #0x0]\n"
+      "str h12, [x9, #0x0]\n"
+      "98:"  // Height 2: Partial direct writeback: Done
+      "b 100f\n"
+      "99:"  // Height 2: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "str q12, [x9, #0x0]\n"
+      "str q13, [x9, #0x10]\n"
+      "str q14, [x9, #0x20]\n"
+      "str q15, [x9, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "add x9, x9, #0x40\n"
+      "100:"  // Height 2: Writeback done
+      "subs x16, x16, #0x20\n"
+      "bgt 53b\n"
+      "b 302f\n"
+      "101:"  // Height 3
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 102f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #1\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #1\n"
+      "add x27, x27, x19, LSL #1\n"
+      "b 103f\n"
+      "102:"  // Height 3: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #1\n"
+      "add x27, x9, x19, LSL #1\n"
+      "103:"  // Height 3: Column loop
+      "cbz x14, 104f\n"
+      "ldr q8, [x14, #0x0]\n"
+      "mov v12.16b, v8.16b\n"
+      "ldr q9, [x14, #0x10]\n"
+      "mov v16.16b, v8.16b\n"
+      "ldr q10, [x14, #0x20]\n"
+      "ldr q11, [x14, #0x30]\n"
+      "mov v13.16b, v9.16b\n"
+      "add x14, x14, #0x40\n"
+      "mov v17.16b, v9.16b\n"
+      "mov v14.16b, v10.16b\n"
+      "mov v15.16b, v11.16b\n"
+      "mov v18.16b, v10.16b\n"
+      "mov v19.16b, v11.16b\n"
+      "b 123f\n"
+      "104:"  // Height 3: no bias
+      "tbz %x[flags], #0, 122f\n"
+      "cmp x16, #0x20\n"
+      "bge 121f\n"
+      "tbz x16, #4, 112f\n"
+      "ld1 { v8.8h }, [x13], #0x10\n"
+      "ld1 { v12.8h }, [x9], #0x10\n"
+      "ld1 { v16.8h }, [x27], #0x10\n"
+      "ld1 { v9.8h }, [x13], #0x10\n"
+      "ld1 { v13.8h }, [x9], #0x10\n"
+      "ld1 { v17.8h }, [x27], #0x10\n"
+      "tbz x16, #3, 108f\n"
+      "ld1 { v10.8h }, [x13], #0x10\n"
+      "ld1 { v14.8h }, [x9], #0x10\n"
+      "ld1 { v18.8h }, [x27], #0x10\n"
+      "tbz x16, #2, 106f\n"
+      "ldr d11, [x13], #0x8\n"
+      "ldr d15, [x9], #0x8\n"
+      "ldr d19, [x27], #0x8\n"
+      "tbz x16, #1, 105f\n"
+      "mov x19, #0x3c\n"
+      "ld1 { v11.s }[2], [x13], #0x4\n"
+      "ld1 { v15.s }[2], [x9], #0x4\n"
+      "ld1 { v19.s }[2], [x27], #0x4\n"
+      "tbz x16, #0, 120f\n"
+      "ld1 { v11.h }[6], [x13]\n"
+      "ld1 { v15.h }[6], [x9]\n"
+      "ld1 { v19.h }[6], [x27]\n"
+      "b 120f\n"
+      "105:"  // Height 3: Partial accumulate: partial_1_28
+      "mov x19, #0x38\n"
+      "tbz x16, #0, 120f\n"
+      "ld1 { v11.h }[4], [x13]\n"
+      "ld1 { v15.h }[4], [x9]\n"
+      "ld1 { v19.h }[4], [x27]\n"
+      "b 120f\n"
+      "106:"  // Height 3: Partial accumulate: partial_2_24
+      "tbz x16, #1, 107f\n"
+      "ldr s11, [x13], #0x4\n"
+      "ldr s15, [x9], #0x4\n"
+      "ldr s19, [x27], #0x4\n"
+      "mov x19, #0x34\n"
+      "tbz x16, #0, 120f\n"
+      "ld1 { v11.h }[2], [x13]\n"
+      "ld1 { v15.h }[2], [x9]\n"
+      "ld1 { v19.h }[2], [x27]\n"
+      "b 120f\n"
+      "107:"  // Height 3: Partial accumulate: partial_1_24
+      "mov x19, #0x30\n"
+      "tbz x16, #0, 120f\n"
+      "ldr h11, [x13, #0x0]\n"
+      "ldr h15, [x9, #0x0]\n"
+      "ldr h19, [x27, #0x0]\n"
+      "b 120f\n"
+      "108:"  // Height 3: Partial accumulate: partial_4_16
+      "tbz x16, #2, 110f\n"
+      "ldr d10, [x13], #0x8\n"
+      "ldr d14, [x9], #0x8\n"
+      "ldr d18, [x27], #0x8\n"
+      "tbz x16, #1, 109f\n"
+      "mov x19, #0x2c\n"
+      "ld1 { v10.s }[2], [x13], #0x4\n"
+      "ld1 { v14.s }[2], [x9], #0x4\n"
+      "ld1 { v18.s }[2], [x27], #0x4\n"
+      "tbz x16, #0, 120f\n"
+      "ld1 { v10.h }[6], [x13]\n"
+      "ld1 { v14.h }[6], [x9]\n"
+      "ld1 { v18.h }[6], [x27]\n"
+      "b 120f\n"
+      "109:"  // Height 3: Partial accumulate: partial_1_20
+      "mov x19, #0x28\n"
+      "tbz x16, #0, 120f\n"
+      "ld1 { v10.h }[4], [x13]\n"
+      "ld1 { v14.h }[4], [x9]\n"
+      "ld1 { v18.h }[4], [x27]\n"
+      "b 120f\n"
+      "110:"  // Height 3: Partial accumulate: partial_2_16
+      "tbz x16, #1, 111f\n"
+      "ldr s10, [x13], #0x4\n"
+      "ldr s14, [x9], #0x4\n"
+      "ldr s18, [x27], #0x4\n"
+      "mov x19, #0x24\n"
+      "tbz x16, #0, 120f\n"
+      "ld1 { v10.h }[2], [x13]\n"
+      "ld1 { v14.h }[2], [x9]\n"
+      "ld1 { v18.h }[2], [x27]\n"
+      "b 120f\n"
+      "111:"  // Height 3: Partial accumulate: partial_1_16
+      "mov x19, #0x20\n"
+      "tbz x16, #0, 120f\n"
+      "ldr h10, [x13, #0x0]\n"
+      "ldr h14, [x9, #0x0]\n"
+      "ldr h18, [x27, #0x0]\n"
+      "b 120f\n"
+      "112:"  // Height 3: Partial accumulate: partial_8_0
+      "tbz x16, #3, 116f\n"
+      "ld1 { v8.8h }, [x13], #0x10\n"
+      "ld1 { v12.8h }, [x9], #0x10\n"
+      "ld1 { v16.8h }, [x27], #0x10\n"
+      "tbz x16, #2, 114f\n"
+      "ldr d9, [x13], #0x8\n"
+      "ldr d13, [x9], #0x8\n"
+      "ldr d17, [x27], #0x8\n"
+      "tbz x16, #1, 113f\n"
+      "mov x19, #0x1c\n"
+      "ld1 { v9.s }[2], [x13], #0x4\n"
+      "ld1 { v13.s }[2], [x9], #0x4\n"
+      "ld1 { v17.s }[2], [x27], #0x4\n"
+      "tbz x16, #0, 120f\n"
+      "ld1 { v9.h }[6], [x13]\n"
+      "ld1 { v13.h }[6], [x9]\n"
+      "ld1 { v17.h }[6], [x27]\n"
+      "b 120f\n"
+      "113:"  // Height 3: Partial accumulate: partial_1_12
+      "mov x19, #0x18\n"
+      "tbz x16, #0, 120f\n"
+      "ld1 { v9.h }[4], [x13]\n"
+      "ld1 { v13.h }[4], [x9]\n"
+      "ld1 { v17.h }[4], [x27]\n"
+      "b 120f\n"
+      "114:"  // Height 3: Partial accumulate: partial_2_8
+      "tbz x16, #1, 115f\n"
+      "ldr s9, [x13], #0x4\n"
+      "ldr s13, [x9], #0x4\n"
+      "ldr s17, [x27], #0x4\n"
+      "mov x19, #0x14\n"
+      "tbz x16, #0, 120f\n"
+      "ld1 { v9.h }[2], [x13]\n"
+      "ld1 { v13.h }[2], [x9]\n"
+      "ld1 { v17.h }[2], [x27]\n"
+      "b 120f\n"
+      "115:"  // Height 3: Partial accumulate: partial_1_8
+      "mov x19, #0x10\n"
+      "tbz x16, #0, 120f\n"
+      "ldr h9, [x13, #0x0]\n"
+      "ldr h13, [x9, #0x0]\n"
+      "ldr h17, [x27, #0x0]\n"
+      "b 120f\n"
+      "116:"  // Height 3: Partial accumulate: partial_4_0
+      "tbz x16, #2, 118f\n"
+      "ldr d8, [x13], #0x8\n"
+      "ldr d12, [x9], #0x8\n"
+      "ldr d16, [x27], #0x8\n"
+      "tbz x16, #1, 117f\n"
+      "mov x19, #0xc\n"
+      "ld1 { v8.s }[2], [x13], #0x4\n"
+      "ld1 { v12.s }[2], [x9], #0x4\n"
+      "ld1 { v16.s }[2], [x27], #0x4\n"
+      "tbz x16, #0, 120f\n"
+      "ld1 { v8.h }[6], [x13]\n"
+      "ld1 { v12.h }[6], [x9]\n"
+      "ld1 { v16.h }[6], [x27]\n"
+      "b 120f\n"
+      "117:"  // Height 3: Partial accumulate: partial_1_4
+      "mov x19, #0x8\n"
+      "tbz x16, #0, 120f\n"
+      "ld1 { v8.h }[4], [x13]\n"
+      "ld1 { v12.h }[4], [x9]\n"
+      "ld1 { v16.h }[4], [x27]\n"
+      "b 120f\n"
+      "118:"  // Height 3: Partial accumulate: partial_2_0
+      "tbz x16, #1, 119f\n"
+      "ldr s8, [x13], #0x4\n"
+      "ldr s12, [x9], #0x4\n"
+      "ldr s16, [x27], #0x4\n"
+      "mov x19, #0x4\n"
+      "tbz x16, #0, 120f\n"
+      "ld1 { v8.h }[2], [x13]\n"
+      "ld1 { v12.h }[2], [x9]\n"
+      "ld1 { v16.h }[2], [x27]\n"
+      "b 120f\n"
+      "119:"  // Height 3: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr h8, [x13, #0x0]\n"
+      "ldr h12, [x9, #0x0]\n"
+      "ldr h16, [x27, #0x0]\n"
+      "120:"  // Height 3: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "sub x9, x9, x19\n"
+      "sub x27, x27, x19\n"
+      "b 123f\n"
+      "121:"  // Height 3: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "ldr q12, [x9, #0x0]\n"
+      "ldr q13, [x9, #0x10]\n"
+      "ldr q14, [x9, #0x20]\n"
+      "ldr q15, [x9, #0x30]\n"
+      "ldr q16, [x27, #0x0]\n"
+      "ldr q17, [x27, #0x10]\n"
+      "ldr q18, [x27, #0x20]\n"
+      "ldr q19, [x27, #0x30]\n"
+      "b 123f\n"
+      "122:"  // Height 3: no accumulate
+      "movi v8.16b, #0x0\n"
+      "movi v9.16b, #0x0\n"
+      "movi v10.16b, #0x0\n"
+      "movi v11.16b, #0x0\n"
+      "movi v12.16b, #0x0\n"
+      "movi v13.16b, #0x0\n"
+      "movi v14.16b, #0x0\n"
+      "movi v15.16b, #0x0\n"
+      "movi v16.16b, #0x0\n"
+      "movi v17.16b, #0x0\n"
+      "movi v18.16b, #0x0\n"
+      "movi v19.16b, #0x0\n"
+      "123:"  // Height 3: setup done
+      "mov x12, #0x0\n"
+      "124:"  // Height 3: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 125f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "cbnz x12, 126f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #1\n"
+      "add x28, x28, x19, LSL #1\n"
+      "add x26, x26, x19, LSL #1\n"
+      "b 126f\n"
+      "125:"  // Height 3: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #1\n"
+      "add x26, x28, x19, LSL #1\n"
+      "126:"  // Height 3: input setup done
+      "cmp x11, #0x8\n"
+      "blt 129f\n"
+      "cmp x11, #0x10\n"
+      "blt 128f\n"
+      "127:"  // Height 3: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.8h, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.8h, v6.8h, v1.h[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla v16.8h, v6.8h, v2.h[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.8h, v7.8h, v0.h[0]\n"
+      "add x28, x28, #0x10\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "fmla v13.8h, v7.8h, v1.h[0]\n"
+      "add x26, x26, #0x10\n"
+      "fmla v17.8h, v7.8h, v2.h[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.8h, v6.8h, v0.h[0]\n"
+      "sub x11, x11, #0x8\n"
+      "fmla v14.8h, v6.8h, v1.h[0]\n"
+      "cmp x11, #0x10\n"
+      "fmla v18.8h, v6.8h, v2.h[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      "fmla v11.8h, v7.8h, v0.h[0]\n"
+      "fmla v15.8h, v7.8h, v1.h[0]\n"
+      "fmla v19.8h, v7.8h, v2.h[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      "fmla v8.8h, v6.8h, v0.h[1]\n"
+      "fmla v12.8h, v6.8h, v1.h[1]\n"
+      "fmla v16.8h, v6.8h, v2.h[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      "fmla v9.8h, v7.8h, v0.h[1]\n"
+      "fmla v13.8h, v7.8h, v1.h[1]\n"
+      "fmla v17.8h, v7.8h, v2.h[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      "fmla v10.8h, v6.8h, v0.h[1]\n"
+      "fmla v14.8h, v6.8h, v1.h[1]\n"
+      "fmla v18.8h, v6.8h, v2.h[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      "fmla v11.8h, v7.8h, v0.h[1]\n"
+      "fmla v15.8h, v7.8h, v1.h[1]\n"
+      "fmla v19.8h, v7.8h, v2.h[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      "fmla v8.8h, v6.8h, v0.h[2]\n"
+      "fmla v12.8h, v6.8h, v1.h[2]\n"
+      "fmla v16.8h, v6.8h, v2.h[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      "fmla v9.8h, v7.8h, v0.h[2]\n"
+      "fmla v13.8h, v7.8h, v1.h[2]\n"
+      "fmla v17.8h, v7.8h, v2.h[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      "fmla v10.8h, v6.8h, v0.h[2]\n"
+      "fmla v14.8h, v6.8h, v1.h[2]\n"
+      "fmla v18.8h, v6.8h, v2.h[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      "fmla v11.8h, v7.8h, v0.h[2]\n"
+      "fmla v15.8h, v7.8h, v1.h[2]\n"
+      "fmla v19.8h, v7.8h, v2.h[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      "fmla v8.8h, v6.8h, v0.h[3]\n"
+      "fmla v12.8h, v6.8h, v1.h[3]\n"
+      "fmla v16.8h, v6.8h, v2.h[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      "fmla v9.8h, v7.8h, v0.h[3]\n"
+      "fmla v13.8h, v7.8h, v1.h[3]\n"
+      "fmla v17.8h, v7.8h, v2.h[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "fmla v10.8h, v6.8h, v0.h[3]\n"
+      "fmla v14.8h, v6.8h, v1.h[3]\n"
+      "fmla v18.8h, v6.8h, v2.h[3]\n"
+      "ldr q6, [x15, #0x100]\n"
+      "fmla v11.8h, v7.8h, v0.h[3]\n"
+      "fmla v15.8h, v7.8h, v1.h[3]\n"
+      "fmla v19.8h, v7.8h, v2.h[3]\n"
+      "ldr q7, [x15, #0x110]\n"
+      "fmla v8.8h, v6.8h, v0.h[4]\n"
+      "fmla v12.8h, v6.8h, v1.h[4]\n"
+      "fmla v16.8h, v6.8h, v2.h[4]\n"
+      "ldr q6, [x15, #0x120]\n"
+      "fmla v9.8h, v7.8h, v0.h[4]\n"
+      "fmla v13.8h, v7.8h, v1.h[4]\n"
+      "fmla v17.8h, v7.8h, v2.h[4]\n"
+      "ldr q7, [x15, #0x130]\n"
+      "fmla v10.8h, v6.8h, v0.h[4]\n"
+      "fmla v14.8h, v6.8h, v1.h[4]\n"
+      "fmla v18.8h, v6.8h, v2.h[4]\n"
+      "ldr q6, [x15, #0x140]\n"
+      "fmla v11.8h, v7.8h, v0.h[4]\n"
+      "fmla v15.8h, v7.8h, v1.h[4]\n"
+      "fmla v19.8h, v7.8h, v2.h[4]\n"
+      "ldr q7, [x15, #0x150]\n"
+      "fmla v8.8h, v6.8h, v0.h[5]\n"
+      "fmla v12.8h, v6.8h, v1.h[5]\n"
+      "fmla v16.8h, v6.8h, v2.h[5]\n"
+      "ldr q6, [x15, #0x160]\n"
+      "fmla v9.8h, v7.8h, v0.h[5]\n"
+      "fmla v13.8h, v7.8h, v1.h[5]\n"
+      "fmla v17.8h, v7.8h, v2.h[5]\n"
+      "ldr q7, [x15, #0x170]\n"
+      "fmla v10.8h, v6.8h, v0.h[5]\n"
+      "fmla v14.8h, v6.8h, v1.h[5]\n"
+      "fmla v18.8h, v6.8h, v2.h[5]\n"
+      "ldr q6, [x15, #0x180]\n"
+      "fmla v11.8h, v7.8h, v0.h[5]\n"
+      "fmla v15.8h, v7.8h, v1.h[5]\n"
+      "fmla v19.8h, v7.8h, v2.h[5]\n"
+      "ldr q7, [x15, #0x190]\n"
+      "fmla v8.8h, v6.8h, v0.h[6]\n"
+      "fmla v12.8h, v6.8h, v1.h[6]\n"
+      "fmla v16.8h, v6.8h, v2.h[6]\n"
+      "ldr q6, [x15, #0x1a0]\n"
+      "fmla v9.8h, v7.8h, v0.h[6]\n"
+      "fmla v13.8h, v7.8h, v1.h[6]\n"
+      "fmla v17.8h, v7.8h, v2.h[6]\n"
+      "ldr q7, [x15, #0x1b0]\n"
+      "fmla v10.8h, v6.8h, v0.h[6]\n"
+      "fmla v14.8h, v6.8h, v1.h[6]\n"
+      "fmla v18.8h, v6.8h, v2.h[6]\n"
+      "ldr q6, [x15, #0x1c0]\n"
+      "fmla v11.8h, v7.8h, v0.h[6]\n"
+      "fmla v15.8h, v7.8h, v1.h[6]\n"
+      "fmla v19.8h, v7.8h, v2.h[6]\n"
+      "ldr q7, [x15, #0x1d0]\n"
+      "fmla v8.8h, v6.8h, v0.h[7]\n"
+      "fmla v12.8h, v6.8h, v1.h[7]\n"
+      "fmla v16.8h, v6.8h, v2.h[7]\n"
+      "ldr q6, [x15, #0x1e0]\n"
+      "fmla v9.8h, v7.8h, v0.h[7]\n"
+      "fmla v13.8h, v7.8h, v1.h[7]\n"
+      "fmla v17.8h, v7.8h, v2.h[7]\n"
+      "ldr q7, [x15, #0x1f0]\n"
+      "add x15, x15, #0x200\n"
+      "fmla v10.8h, v6.8h, v0.h[7]\n"
+      "fmla v14.8h, v6.8h, v1.h[7]\n"
+      "fmla v18.8h, v6.8h, v2.h[7]\n"
+      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "fmla v15.8h, v7.8h, v1.h[7]\n"
+      "fmla v19.8h, v7.8h, v2.h[7]\n"
+      "bge 127b\n"
+      "128:"  // Height 3: Multiply loop: Single iteration only
+      "sub x11, x11, #0x8\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.8h, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.8h, v6.8h, v1.h[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla v16.8h, v6.8h, v2.h[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.8h, v7.8h, v0.h[0]\n"
+      "add x28, x28, #0x10\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "fmla v13.8h, v7.8h, v1.h[0]\n"
+      "add x26, x26, #0x10\n"
+      "fmla v17.8h, v7.8h, v2.h[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.8h, v6.8h, v0.h[0]\n"
+      "fmla v14.8h, v6.8h, v1.h[0]\n"
+      "fmla v18.8h, v6.8h, v2.h[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      "fmla v11.8h, v7.8h, v0.h[0]\n"
+      "fmla v15.8h, v7.8h, v1.h[0]\n"
+      "fmla v19.8h, v7.8h, v2.h[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      "fmla v8.8h, v6.8h, v0.h[1]\n"
+      "fmla v12.8h, v6.8h, v1.h[1]\n"
+      "fmla v16.8h, v6.8h, v2.h[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      "fmla v9.8h, v7.8h, v0.h[1]\n"
+      "fmla v13.8h, v7.8h, v1.h[1]\n"
+      "fmla v17.8h, v7.8h, v2.h[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      "fmla v10.8h, v6.8h, v0.h[1]\n"
+      "fmla v14.8h, v6.8h, v1.h[1]\n"
+      "fmla v18.8h, v6.8h, v2.h[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      "fmla v11.8h, v7.8h, v0.h[1]\n"
+      "fmla v15.8h, v7.8h, v1.h[1]\n"
+      "fmla v19.8h, v7.8h, v2.h[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      "fmla v8.8h, v6.8h, v0.h[2]\n"
+      "fmla v12.8h, v6.8h, v1.h[2]\n"
+      "fmla v16.8h, v6.8h, v2.h[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      "fmla v9.8h, v7.8h, v0.h[2]\n"
+      "fmla v13.8h, v7.8h, v1.h[2]\n"
+      "fmla v17.8h, v7.8h, v2.h[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      "fmla v10.8h, v6.8h, v0.h[2]\n"
+      "fmla v14.8h, v6.8h, v1.h[2]\n"
+      "fmla v18.8h, v6.8h, v2.h[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      "fmla v11.8h, v7.8h, v0.h[2]\n"
+      "fmla v15.8h, v7.8h, v1.h[2]\n"
+      "fmla v19.8h, v7.8h, v2.h[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      "fmla v8.8h, v6.8h, v0.h[3]\n"
+      "fmla v12.8h, v6.8h, v1.h[3]\n"
+      "fmla v16.8h, v6.8h, v2.h[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      "fmla v9.8h, v7.8h, v0.h[3]\n"
+      "fmla v13.8h, v7.8h, v1.h[3]\n"
+      "fmla v17.8h, v7.8h, v2.h[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "fmla v10.8h, v6.8h, v0.h[3]\n"
+      "fmla v14.8h, v6.8h, v1.h[3]\n"
+      "fmla v18.8h, v6.8h, v2.h[3]\n"
+      "ldr q6, [x15, #0x100]\n"
+      "fmla v11.8h, v7.8h, v0.h[3]\n"
+      "fmla v15.8h, v7.8h, v1.h[3]\n"
+      "fmla v19.8h, v7.8h, v2.h[3]\n"
+      "ldr q7, [x15, #0x110]\n"
+      "fmla v8.8h, v6.8h, v0.h[4]\n"
+      "fmla v12.8h, v6.8h, v1.h[4]\n"
+      "fmla v16.8h, v6.8h, v2.h[4]\n"
+      "ldr q6, [x15, #0x120]\n"
+      "fmla v9.8h, v7.8h, v0.h[4]\n"
+      "fmla v13.8h, v7.8h, v1.h[4]\n"
+      "fmla v17.8h, v7.8h, v2.h[4]\n"
+      "ldr q7, [x15, #0x130]\n"
+      "fmla v10.8h, v6.8h, v0.h[4]\n"
+      "fmla v14.8h, v6.8h, v1.h[4]\n"
+      "fmla v18.8h, v6.8h, v2.h[4]\n"
+      "ldr q6, [x15, #0x140]\n"
+      "fmla v11.8h, v7.8h, v0.h[4]\n"
+      "fmla v15.8h, v7.8h, v1.h[4]\n"
+      "fmla v19.8h, v7.8h, v2.h[4]\n"
+      "ldr q7, [x15, #0x150]\n"
+      "fmla v8.8h, v6.8h, v0.h[5]\n"
+      "fmla v12.8h, v6.8h, v1.h[5]\n"
+      "fmla v16.8h, v6.8h, v2.h[5]\n"
+      "ldr q6, [x15, #0x160]\n"
+      "fmla v9.8h, v7.8h, v0.h[5]\n"
+      "fmla v13.8h, v7.8h, v1.h[5]\n"
+      "fmla v17.8h, v7.8h, v2.h[5]\n"
+      "ldr q7, [x15, #0x170]\n"
+      "fmla v10.8h, v6.8h, v0.h[5]\n"
+      "fmla v14.8h, v6.8h, v1.h[5]\n"
+      "fmla v18.8h, v6.8h, v2.h[5]\n"
+      "ldr q6, [x15, #0x180]\n"
+      "fmla v11.8h, v7.8h, v0.h[5]\n"
+      "fmla v15.8h, v7.8h, v1.h[5]\n"
+      "fmla v19.8h, v7.8h, v2.h[5]\n"
+      "ldr q7, [x15, #0x190]\n"
+      "fmla v8.8h, v6.8h, v0.h[6]\n"
+      "fmla v12.8h, v6.8h, v1.h[6]\n"
+      "fmla v16.8h, v6.8h, v2.h[6]\n"
+      "ldr q6, [x15, #0x1a0]\n"
+      "fmla v9.8h, v7.8h, v0.h[6]\n"
+      "fmla v13.8h, v7.8h, v1.h[6]\n"
+      "fmla v17.8h, v7.8h, v2.h[6]\n"
+      "ldr q7, [x15, #0x1b0]\n"
+      "fmla v10.8h, v6.8h, v0.h[6]\n"
+      "fmla v14.8h, v6.8h, v1.h[6]\n"
+      "fmla v18.8h, v6.8h, v2.h[6]\n"
+      "ldr q6, [x15, #0x1c0]\n"
+      "fmla v11.8h, v7.8h, v0.h[6]\n"
+      "fmla v15.8h, v7.8h, v1.h[6]\n"
+      "fmla v19.8h, v7.8h, v2.h[6]\n"
+      "ldr q7, [x15, #0x1d0]\n"
+      "fmla v8.8h, v6.8h, v0.h[7]\n"
+      "fmla v12.8h, v6.8h, v1.h[7]\n"
+      "fmla v16.8h, v6.8h, v2.h[7]\n"
+      "ldr q6, [x15, #0x1e0]\n"
+      "fmla v9.8h, v7.8h, v0.h[7]\n"
+      "fmla v13.8h, v7.8h, v1.h[7]\n"
+      "fmla v17.8h, v7.8h, v2.h[7]\n"
+      "ldr q7, [x15, #0x1f0]\n"
+      "add x15, x15, #0x200\n"
+      "fmla v10.8h, v6.8h, v0.h[7]\n"
+      "fmla v14.8h, v6.8h, v1.h[7]\n"
+      "fmla v18.8h, v6.8h, v2.h[7]\n"
+      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "fmla v15.8h, v7.8h, v1.h[7]\n"
+      "fmla v19.8h, v7.8h, v2.h[7]\n"
+      "129:"  // Height 3: Multiply loop: Main loop skip
+      "cbz x11, 131f\n"
+      "130:"  // Height 3: Multiply loop: Odd block loop
+      "ldr h0, [x10], #0x2\n"
+      "ldr h1, [x28], #0x2\n"
+      "ldr h2, [x26], #0x2\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.8h, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.8h, v6.8h, v1.h[0]\n"
+      "sub x11, x11, #0x1\n"
+      "fmla v16.8h, v6.8h, v2.h[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.8h, v7.8h, v0.h[0]\n"
+      "fmla v13.8h, v7.8h, v1.h[0]\n"
+      "fmla v17.8h, v7.8h, v2.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.8h, v6.8h, v0.h[0]\n"
+      "add x15, x15, #0x40\n"
+      "fmla v14.8h, v6.8h, v1.h[0]\n"
+      "fmla v18.8h, v6.8h, v2.h[0]\n"
+      "fmla v11.8h, v7.8h, v0.h[0]\n"
+      "fmla v15.8h, v7.8h, v1.h[0]\n"
+      "fmla v19.8h, v7.8h, v2.h[0]\n"
+      "cbnz x11, 130b\n"
+      "131:"  // Height 3: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 124b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "tbz %x[flags], #1, 132f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v1.8h }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v0.8h }, [x19]\n"
+      "fmin v8.8h, v8.8h, v0.8h\n"
+      "fmin v9.8h, v9.8h, v0.8h\n"
+      "fmin v10.8h, v10.8h, v0.8h\n"
+      "fmin v11.8h, v11.8h, v0.8h\n"
+      "fmax v8.8h, v8.8h, v1.8h\n"
+      "fmax v9.8h, v9.8h, v1.8h\n"
+      "fmax v10.8h, v10.8h, v1.8h\n"
+      "fmax v11.8h, v11.8h, v1.8h\n"
+      "fmin v12.8h, v12.8h, v0.8h\n"
+      "fmin v13.8h, v13.8h, v0.8h\n"
+      "fmin v14.8h, v14.8h, v0.8h\n"
+      "fmax v12.8h, v12.8h, v1.8h\n"
+      "fmax v13.8h, v13.8h, v1.8h\n"
+      "fmax v14.8h, v14.8h, v1.8h\n"
+      "fmin v15.8h, v15.8h, v0.8h\n"
+      "fmin v16.8h, v16.8h, v0.8h\n"
+      "fmin v17.8h, v17.8h, v0.8h\n"
+      "fmax v15.8h, v15.8h, v1.8h\n"
+      "fmax v16.8h, v16.8h, v1.8h\n"
+      "fmax v17.8h, v17.8h, v1.8h\n"
+      "fmin v18.8h, v18.8h, v0.8h\n"
+      "fmin v19.8h, v19.8h, v0.8h\n"
+      "fmax v18.8h, v18.8h, v1.8h\n"
+      "fmax v19.8h, v19.8h, v1.8h\n"
+      "132:"  // Height 3: No activation
+      "cmp x16, #0x20\n"
+      "bge 149f\n"
+      "tbz x16, #4, 140f\n"
+      "st1 { v8.8h }, [x13], #0x10\n"
+      "st1 { v9.8h }, [x13], #0x10\n"
+      "st1 { v12.8h }, [x9], #0x10\n"
+      "st1 { v13.8h }, [x9], #0x10\n"
+      "st1 { v16.8h }, [x27], #0x10\n"
+      "st1 { v17.8h }, [x27], #0x10\n"
+      "tbz x16, #3, 136f\n"
+      "st1 { v10.8h }, [x13], #0x10\n"
+      "st1 { v14.8h }, [x9], #0x10\n"
+      "st1 { v18.8h }, [x27], #0x10\n"
+      "tbz x16, #2, 134f\n"
+      "str d11, [x13], #0x8\n"
+      "str d15, [x9], #0x8\n"
+      "str d19, [x27], #0x8\n"
+      "tbz x16, #1, 133f\n"
+      "st1 { v11.s }[2], [x13], #0x4\n"
+      "st1 { v15.s }[2], [x9], #0x4\n"
+      "st1 { v19.s }[2], [x27], #0x4\n"
+      "tbz x16, #0, 148f\n"
+      "st1 { v11.h }[6], [x13]\n"
+      "st1 { v15.h }[6], [x9]\n"
+      "st1 { v19.h }[6], [x27]\n"
+      "b 148f\n"
+      "133:"  // Height 3: Partial direct writeback: partial_1_28
+      "tbz x16, #0, 148f\n"
+      "st1 { v11.h }[4], [x13]\n"
+      "st1 { v15.h }[4], [x9]\n"
+      "st1 { v19.h }[4], [x27]\n"
+      "b 148f\n"
+      "134:"  // Height 3: Partial direct writeback: partial_2_24
+      "tbz x16, #1, 135f\n"
+      "str s11, [x13], #0x4\n"
+      "str s15, [x9], #0x4\n"
+      "str s19, [x27], #0x4\n"
+      "tbz x16, #0, 148f\n"
+      "st1 { v11.h }[2], [x13]\n"
+      "st1 { v15.h }[2], [x9]\n"
+      "st1 { v19.h }[2], [x27]\n"
+      "b 148f\n"
+      "135:"  // Height 3: Partial direct writeback: partial_1_24
+      "tbz x16, #0, 148f\n"
+      "str h11, [x13, #0x0]\n"
+      "str h15, [x9, #0x0]\n"
+      "str h19, [x27, #0x0]\n"
+      "b 148f\n"
+      "136:"  // Height 3: Partial direct writeback: partial_4_16
+      "tbz x16, #2, 138f\n"
+      "str d10, [x13], #0x8\n"
+      "str d14, [x9], #0x8\n"
+      "str d18, [x27], #0x8\n"
+      "tbz x16, #1, 137f\n"
+      "st1 { v10.s }[2], [x13], #0x4\n"
+      "st1 { v14.s }[2], [x9], #0x4\n"
+      "st1 { v18.s }[2], [x27], #0x4\n"
+      "tbz x16, #0, 148f\n"
+      "st1 { v10.h }[6], [x13]\n"
+      "st1 { v14.h }[6], [x9]\n"
+      "st1 { v18.h }[6], [x27]\n"
+      "b 148f\n"
+      "137:"  // Height 3: Partial direct writeback: partial_1_20
+      "tbz x16, #0, 148f\n"
+      "st1 { v10.h }[4], [x13]\n"
+      "st1 { v14.h }[4], [x9]\n"
+      "st1 { v18.h }[4], [x27]\n"
+      "b 148f\n"
+      "138:"  // Height 3: Partial direct writeback: partial_2_16
+      "tbz x16, #1, 139f\n"
+      "str s10, [x13], #0x4\n"
+      "str s14, [x9], #0x4\n"
+      "str s18, [x27], #0x4\n"
+      "tbz x16, #0, 148f\n"
+      "st1 { v10.h }[2], [x13]\n"
+      "st1 { v14.h }[2], [x9]\n"
+      "st1 { v18.h }[2], [x27]\n"
+      "b 148f\n"
+      "139:"  // Height 3: Partial direct writeback: partial_1_16
+      "tbz x16, #0, 148f\n"
+      "str h10, [x13, #0x0]\n"
+      "str h14, [x9, #0x0]\n"
+      "str h18, [x27, #0x0]\n"
+      "b 148f\n"
+      "140:"  // Height 3: Partial direct writeback: partial_8_0
+      "tbz x16, #3, 144f\n"
+      "st1 { v8.8h }, [x13], #0x10\n"
+      "st1 { v12.8h }, [x9], #0x10\n"
+      "st1 { v16.8h }, [x27], #0x10\n"
+      "tbz x16, #2, 142f\n"
+      "str d9, [x13], #0x8\n"
+      "str d13, [x9], #0x8\n"
+      "str d17, [x27], #0x8\n"
+      "tbz x16, #1, 141f\n"
+      "st1 { v9.s }[2], [x13], #0x4\n"
+      "st1 { v13.s }[2], [x9], #0x4\n"
+      "st1 { v17.s }[2], [x27], #0x4\n"
+      "tbz x16, #0, 148f\n"
+      "st1 { v9.h }[6], [x13]\n"
+      "st1 { v13.h }[6], [x9]\n"
+      "st1 { v17.h }[6], [x27]\n"
+      "b 148f\n"
+      "141:"  // Height 3: Partial direct writeback: partial_1_12
+      "tbz x16, #0, 148f\n"
+      "st1 { v9.h }[4], [x13]\n"
+      "st1 { v13.h }[4], [x9]\n"
+      "st1 { v17.h }[4], [x27]\n"
+      "b 148f\n"
+      "142:"  // Height 3: Partial direct writeback: partial_2_8
+      "tbz x16, #1, 143f\n"
+      "str s9, [x13], #0x4\n"
+      "str s13, [x9], #0x4\n"
+      "str s17, [x27], #0x4\n"
+      "tbz x16, #0, 148f\n"
+      "st1 { v9.h }[2], [x13]\n"
+      "st1 { v13.h }[2], [x9]\n"
+      "st1 { v17.h }[2], [x27]\n"
+      "b 148f\n"
+      "143:"  // Height 3: Partial direct writeback: partial_1_8
+      "tbz x16, #0, 148f\n"
+      "str h9, [x13, #0x0]\n"
+      "str h13, [x9, #0x0]\n"
+      "str h17, [x27, #0x0]\n"
+      "b 148f\n"
+      "144:"  // Height 3: Partial direct writeback: partial_4_0
+      "tbz x16, #2, 146f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "str d16, [x27], #0x8\n"
+      "tbz x16, #1, 145f\n"
+      "st1 { v8.s }[2], [x13], #0x4\n"
+      "st1 { v12.s }[2], [x9], #0x4\n"
+      "st1 { v16.s }[2], [x27], #0x4\n"
+      "tbz x16, #0, 148f\n"
+      "st1 { v8.h }[6], [x13]\n"
+      "st1 { v12.h }[6], [x9]\n"
+      "st1 { v16.h }[6], [x27]\n"
+      "b 148f\n"
+      "145:"  // Height 3: Partial direct writeback: partial_1_4
+      "tbz x16, #0, 148f\n"
+      "st1 { v8.h }[4], [x13]\n"
+      "st1 { v12.h }[4], [x9]\n"
+      "st1 { v16.h }[4], [x27]\n"
+      "b 148f\n"
+      "146:"  // Height 3: Partial direct writeback: partial_2_0
+      "tbz x16, #1, 147f\n"
+      "str s8, [x13], #0x4\n"
+      "str s12, [x9], #0x4\n"
+      "str s16, [x27], #0x4\n"
+      "tbz x16, #0, 148f\n"
+      "st1 { v8.h }[2], [x13]\n"
+      "st1 { v12.h }[2], [x9]\n"
+      "st1 { v16.h }[2], [x27]\n"
+      "b 148f\n"
+      "147:"  // Height 3: Partial direct writeback: partial_1_0
+      "str h8, [x13, #0x0]\n"
+      "str h12, [x9, #0x0]\n"
+      "str h16, [x27, #0x0]\n"
+      "148:"  // Height 3: Partial direct writeback: Done
+      "b 150f\n"
+      "149:"  // Height 3: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "str q12, [x9, #0x0]\n"
+      "str q13, [x9, #0x10]\n"
+      "str q14, [x9, #0x20]\n"
+      "str q15, [x9, #0x30]\n"
+      "str q16, [x27, #0x0]\n"
+      "str q17, [x27, #0x10]\n"
+      "str q18, [x27, #0x20]\n"
+      "str q19, [x27, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "add x9, x9, #0x40\n"
+      "add x27, x27, #0x40\n"
+      "150:"  // Height 3: Writeback done
+      "subs x16, x16, #0x20\n"
+      "bgt 103b\n"
+      "b 302f\n"
+      "151:"  // Height 4
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 152f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #1\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #1\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "add x27, x27, x19, LSL #1\n"
+      "add x25, x25, x19, LSL #1\n"
+      "b 153f\n"
+      "152:"  // Height 4: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #1\n"
+      "add x27, x9, x19, LSL #1\n"
+      "add x25, x27, x19, LSL #1\n"
+      "153:"  // Height 4: Column loop
+      "cbz x14, 154f\n"
+      "ldr q8, [x14, #0x0]\n"
+      "mov v12.16b, v8.16b\n"
+      "ldr q9, [x14, #0x10]\n"
+      "mov v16.16b, v8.16b\n"
+      "ldr q10, [x14, #0x20]\n"
+      "mov v20.16b, v8.16b\n"
+      "ldr q11, [x14, #0x30]\n"
+      "add x14, x14, #0x40\n"
+      "mov v13.16b, v9.16b\n"
+      "mov v17.16b, v9.16b\n"
+      "mov v14.16b, v10.16b\n"
+      "mov v15.16b, v11.16b\n"
+      "mov v18.16b, v10.16b\n"
+      "mov v19.16b, v11.16b\n"
+      "mov v21.16b, v9.16b\n"
+      "mov v22.16b, v10.16b\n"
+      "mov v23.16b, v11.16b\n"
+      "b 173f\n"
+      "154:"  // Height 4: no bias
+      "tbz %x[flags], #0, 172f\n"
+      "cmp x16, #0x20\n"
+      "bge 171f\n"
+      "tbz x16, #4, 162f\n"
+      "ld1 { v8.8h }, [x13], #0x10\n"
+      "ld1 { v12.8h }, [x9], #0x10\n"
+      "ld1 { v16.8h }, [x27], #0x10\n"
+      "ld1 { v20.8h }, [x25], #0x10\n"
+      "ld1 { v9.8h }, [x13], #0x10\n"
+      "ld1 { v13.8h }, [x9], #0x10\n"
+      "ld1 { v17.8h }, [x27], #0x10\n"
+      "ld1 { v21.8h }, [x25], #0x10\n"
+      "tbz x16, #3, 158f\n"
+      "ld1 { v10.8h }, [x13], #0x10\n"
+      "ld1 { v14.8h }, [x9], #0x10\n"
+      "ld1 { v18.8h }, [x27], #0x10\n"
+      "ld1 { v22.8h }, [x25], #0x10\n"
+      "tbz x16, #2, 156f\n"
+      "ldr d11, [x13], #0x8\n"
+      "ldr d15, [x9], #0x8\n"
+      "ldr d19, [x27], #0x8\n"
+      "ldr d23, [x25], #0x8\n"
+      "tbz x16, #1, 155f\n"
+      "mov x19, #0x3c\n"
+      "ld1 { v11.s }[2], [x13], #0x4\n"
+      "ld1 { v15.s }[2], [x9], #0x4\n"
+      "ld1 { v19.s }[2], [x27], #0x4\n"
+      "ld1 { v23.s }[2], [x25], #0x4\n"
+      "tbz x16, #0, 170f\n"
+      "ld1 { v11.h }[6], [x13]\n"
+      "ld1 { v15.h }[6], [x9]\n"
+      "ld1 { v19.h }[6], [x27]\n"
+      "ld1 { v23.h }[6], [x25]\n"
+      "b 170f\n"
+      "155:"  // Height 4: Partial accumulate: partial_1_28
+      "mov x19, #0x38\n"
+      "tbz x16, #0, 170f\n"
+      "ld1 { v11.h }[4], [x13]\n"
+      "ld1 { v15.h }[4], [x9]\n"
+      "ld1 { v19.h }[4], [x27]\n"
+      "ld1 { v23.h }[4], [x25]\n"
+      "b 170f\n"
+      "156:"  // Height 4: Partial accumulate: partial_2_24
+      "tbz x16, #1, 157f\n"
+      "ldr s11, [x13], #0x4\n"
+      "ldr s15, [x9], #0x4\n"
+      "ldr s19, [x27], #0x4\n"
+      "ldr s23, [x25], #0x4\n"
+      "mov x19, #0x34\n"
+      "tbz x16, #0, 170f\n"
+      "ld1 { v11.h }[2], [x13]\n"
+      "ld1 { v15.h }[2], [x9]\n"
+      "ld1 { v19.h }[2], [x27]\n"
+      "ld1 { v23.h }[2], [x25]\n"
+      "b 170f\n"
+      "157:"  // Height 4: Partial accumulate: partial_1_24
+      "mov x19, #0x30\n"
+      "tbz x16, #0, 170f\n"
+      "ldr h11, [x13, #0x0]\n"
+      "ldr h15, [x9, #0x0]\n"
+      "ldr h19, [x27, #0x0]\n"
+      "ldr h23, [x25, #0x0]\n"
+      "b 170f\n"
+      "158:"  // Height 4: Partial accumulate: partial_4_16
+      "tbz x16, #2, 160f\n"
+      "ldr d10, [x13], #0x8\n"
+      "ldr d14, [x9], #0x8\n"
+      "ldr d18, [x27], #0x8\n"
+      "ldr d22, [x25], #0x8\n"
+      "tbz x16, #1, 159f\n"
+      "ld1 { v10.s }[2], [x13], #0x4\n"
+      "ld1 { v14.s }[2], [x9], #0x4\n"
+      "ld1 { v18.s }[2], [x27], #0x4\n"
+      "ld1 { v22.s }[2], [x25], #0x4\n"
+      "mov x19, #0x2c\n"
+      "tbz x16, #0, 170f\n"
+      "ld1 { v10.h }[6], [x13]\n"
+      "ld1 { v14.h }[6], [x9]\n"
+      "ld1 { v18.h }[6], [x27]\n"
+      "ld1 { v22.h }[6], [x25]\n"
+      "b 170f\n"
+      "159:"  // Height 4: Partial accumulate: partial_1_20
+      "mov x19, #0x28\n"
+      "tbz x16, #0, 170f\n"
+      "ld1 { v10.h }[4], [x13]\n"
+      "ld1 { v14.h }[4], [x9]\n"
+      "ld1 { v18.h }[4], [x27]\n"
+      "ld1 { v22.h }[4], [x25]\n"
+      "b 170f\n"
+      "160:"  // Height 4: Partial accumulate: partial_2_16
+      "tbz x16, #1, 161f\n"
+      "ldr s10, [x13], #0x4\n"
+      "ldr s14, [x9], #0x4\n"
+      "ldr s18, [x27], #0x4\n"
+      "ldr s22, [x25], #0x4\n"
+      "mov x19, #0x24\n"
+      "tbz x16, #0, 170f\n"
+      "ld1 { v10.h }[2], [x13]\n"
+      "ld1 { v14.h }[2], [x9]\n"
+      "ld1 { v18.h }[2], [x27]\n"
+      "ld1 { v22.h }[2], [x25]\n"
+      "b 170f\n"
+      "161:"  // Height 4: Partial accumulate: partial_1_16
+      "mov x19, #0x20\n"
+      "tbz x16, #0, 170f\n"
+      "ldr h10, [x13, #0x0]\n"
+      "ldr h14, [x9, #0x0]\n"
+      "ldr h18, [x27, #0x0]\n"
+      "ldr h22, [x25, #0x0]\n"
+      "b 170f\n"
+      "162:"  // Height 4: Partial accumulate: partial_8_0
+      "tbz x16, #3, 166f\n"
+      "ld1 { v8.8h }, [x13], #0x10\n"
+      "ld1 { v12.8h }, [x9], #0x10\n"
+      "ld1 { v16.8h }, [x27], #0x10\n"
+      "ld1 { v20.8h }, [x25], #0x10\n"
+      "tbz x16, #2, 164f\n"
+      "ldr d9, [x13], #0x8\n"
+      "ldr d13, [x9], #0x8\n"
+      "ldr d17, [x27], #0x8\n"
+      "ldr d21, [x25], #0x8\n"
+      "tbz x16, #1, 163f\n"
+      "mov x19, #0x1c\n"
+      "ld1 { v9.s }[2], [x13], #0x4\n"
+      "ld1 { v13.s }[2], [x9], #0x4\n"
+      "ld1 { v17.s }[2], [x27], #0x4\n"
+      "ld1 { v21.s }[2], [x25], #0x4\n"
+      "tbz x16, #0, 170f\n"
+      "ld1 { v9.h }[6], [x13]\n"
+      "ld1 { v13.h }[6], [x9]\n"
+      "ld1 { v17.h }[6], [x27]\n"
+      "ld1 { v21.h }[6], [x25]\n"
+      "b 170f\n"
+      "163:"  // Height 4: Partial accumulate: partial_1_12
+      "mov x19, #0x18\n"
+      "tbz x16, #0, 170f\n"
+      "ld1 { v9.h }[4], [x13]\n"
+      "ld1 { v13.h }[4], [x9]\n"
+      "ld1 { v17.h }[4], [x27]\n"
+      "ld1 { v21.h }[4], [x25]\n"
+      "b 170f\n"
+      "164:"  // Height 4: Partial accumulate: partial_2_8
+      "tbz x16, #1, 165f\n"
+      "ldr s9, [x13], #0x4\n"
+      "ldr s13, [x9], #0x4\n"
+      "ldr s17, [x27], #0x4\n"
+      "ldr s21, [x25], #0x4\n"
+      "mov x19, #0x14\n"
+      "tbz x16, #0, 170f\n"
+      "ld1 { v9.h }[2], [x13]\n"
+      "ld1 { v13.h }[2], [x9]\n"
+      "ld1 { v17.h }[2], [x27]\n"
+      "ld1 { v21.h }[2], [x25]\n"
+      "b 170f\n"
+      "165:"  // Height 4: Partial accumulate: partial_1_8
+      "mov x19, #0x10\n"
+      "tbz x16, #0, 170f\n"
+      "ldr h9, [x13, #0x0]\n"
+      "ldr h13, [x9, #0x0]\n"
+      "ldr h17, [x27, #0x0]\n"
+      "ldr h21, [x25, #0x0]\n"
+      "b 170f\n"
+      "166:"  // Height 4: Partial accumulate: partial_4_0
+      "tbz x16, #2, 168f\n"
+      "ldr d8, [x13], #0x8\n"
+      "ldr d12, [x9], #0x8\n"
+      "ldr d16, [x27], #0x8\n"
+      "ldr d20, [x25], #0x8\n"
+      "tbz x16, #1, 167f\n"
+      "ld1 { v8.s }[2], [x13], #0x4\n"
+      "ld1 { v12.s }[2], [x9], #0x4\n"
+      "ld1 { v16.s }[2], [x27], #0x4\n"
+      "ld1 { v20.s }[2], [x25], #0x4\n"
+      "mov x19, #0xc\n"
+      "tbz x16, #0, 170f\n"
+      "ld1 { v8.h }[6], [x13]\n"
+      "ld1 { v12.h }[6], [x9]\n"
+      "ld1 { v16.h }[6], [x27]\n"
+      "ld1 { v20.h }[6], [x25]\n"
+      "b 170f\n"
+      "167:"  // Height 4: Partial accumulate: partial_1_4
+      "mov x19, #0x8\n"
+      "tbz x16, #0, 170f\n"
+      "ld1 { v8.h }[4], [x13]\n"
+      "ld1 { v12.h }[4], [x9]\n"
+      "ld1 { v16.h }[4], [x27]\n"
+      "ld1 { v20.h }[4], [x25]\n"
+      "b 170f\n"
+      "168:"  // Height 4: Partial accumulate: partial_2_0
+      "tbz x16, #1, 169f\n"
+      "ldr s8, [x13], #0x4\n"
+      "ldr s12, [x9], #0x4\n"
+      "ldr s16, [x27], #0x4\n"
+      "ldr s20, [x25], #0x4\n"
+      "mov x19, #0x4\n"
+      "tbz x16, #0, 170f\n"
+      "ld1 { v8.h }[2], [x13]\n"
+      "ld1 { v12.h }[2], [x9]\n"
+      "ld1 { v16.h }[2], [x27]\n"
+      "ld1 { v20.h }[2], [x25]\n"
+      "b 170f\n"
+      "169:"  // Height 4: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr h8, [x13, #0x0]\n"
+      "ldr h12, [x9, #0x0]\n"
+      "ldr h16, [x27, #0x0]\n"
+      "ldr h20, [x25, #0x0]\n"
+      "170:"  // Height 4: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "sub x9, x9, x19\n"
+      "sub x27, x27, x19\n"
+      "sub x25, x25, x19\n"
+      "b 173f\n"
+      "171:"  // Height 4: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "ldr q12, [x9, #0x0]\n"
+      "ldr q13, [x9, #0x10]\n"
+      "ldr q14, [x9, #0x20]\n"
+      "ldr q15, [x9, #0x30]\n"
+      "ldr q16, [x27, #0x0]\n"
+      "ldr q17, [x27, #0x10]\n"
+      "ldr q18, [x27, #0x20]\n"
+      "ldr q19, [x27, #0x30]\n"
+      "ldr q20, [x25, #0x0]\n"
+      "ldr q21, [x25, #0x10]\n"
+      "ldr q22, [x25, #0x20]\n"
+      "ldr q23, [x25, #0x30]\n"
+      "b 173f\n"
+      "172:"  // Height 4: no accumulate
+      "movi v8.16b, #0x0\n"
+      "movi v9.16b, #0x0\n"
+      "movi v10.16b, #0x0\n"
+      "movi v11.16b, #0x0\n"
+      "movi v12.16b, #0x0\n"
+      "movi v13.16b, #0x0\n"
+      "movi v14.16b, #0x0\n"
+      "movi v15.16b, #0x0\n"
+      "movi v16.16b, #0x0\n"
+      "movi v17.16b, #0x0\n"
+      "movi v18.16b, #0x0\n"
+      "movi v19.16b, #0x0\n"
+      "movi v20.16b, #0x0\n"
+      "movi v21.16b, #0x0\n"
+      "movi v22.16b, #0x0\n"
+      "movi v23.16b, #0x0\n"
+      "173:"  // Height 4: setup done
+      "mov x12, #0x0\n"
+      "174:"  // Height 4: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 175f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "cbnz x12, 176f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #1\n"
+      "add x28, x28, x19, LSL #1\n"
+      "add x26, x26, x19, LSL #1\n"
+      "add x24, x24, x19, LSL #1\n"
+      "b 176f\n"
+      "175:"  // Height 4: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #1\n"
+      "add x26, x28, x19, LSL #1\n"
+      "add x24, x26, x19, LSL #1\n"
+      "176:"  // Height 4: input setup done
+      "cmp x11, #0x8\n"
+      "blt 179f\n"
+      "cmp x11, #0x10\n"
+      "blt 178f\n"
+      "177:"  // Height 4: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.8h, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.8h, v6.8h, v1.h[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla v16.8h, v6.8h, v2.h[0]\n"
+      "add x28, x28, #0x10\n"
+      "fmla v20.8h, v6.8h, v3.h[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.8h, v7.8h, v0.h[0]\n"
+      "add x26, x26, #0x10\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "fmla v13.8h, v7.8h, v1.h[0]\n"
+      "add x24, x24, #0x10\n"
+      "fmla v17.8h, v7.8h, v2.h[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "sub x11, x11, #0x8\n"
+      "fmla v21.8h, v7.8h, v3.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.8h, v6.8h, v0.h[0]\n"
+      "cmp x11, #0x10\n"
+      "fmla v14.8h, v6.8h, v1.h[0]\n"
+      "fmla v18.8h, v6.8h, v2.h[0]\n"
+      "fmla v22.8h, v6.8h, v3.h[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      "fmla v11.8h, v7.8h, v0.h[0]\n"
+      "fmla v15.8h, v7.8h, v1.h[0]\n"
+      "fmla v19.8h, v7.8h, v2.h[0]\n"
+      "fmla v23.8h, v7.8h, v3.h[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      "fmla v8.8h, v6.8h, v0.h[1]\n"
+      "fmla v12.8h, v6.8h, v1.h[1]\n"
+      "fmla v16.8h, v6.8h, v2.h[1]\n"
+      "fmla v20.8h, v6.8h, v3.h[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      "fmla v9.8h, v7.8h, v0.h[1]\n"
+      "fmla v13.8h, v7.8h, v1.h[1]\n"
+      "fmla v17.8h, v7.8h, v2.h[1]\n"
+      "fmla v21.8h, v7.8h, v3.h[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      "fmla v10.8h, v6.8h, v0.h[1]\n"
+      "fmla v14.8h, v6.8h, v1.h[1]\n"
+      "fmla v18.8h, v6.8h, v2.h[1]\n"
+      "fmla v22.8h, v6.8h, v3.h[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      "fmla v11.8h, v7.8h, v0.h[1]\n"
+      "fmla v15.8h, v7.8h, v1.h[1]\n"
+      "fmla v19.8h, v7.8h, v2.h[1]\n"
+      "fmla v23.8h, v7.8h, v3.h[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      "fmla v8.8h, v6.8h, v0.h[2]\n"
+      "fmla v12.8h, v6.8h, v1.h[2]\n"
+      "fmla v16.8h, v6.8h, v2.h[2]\n"
+      "fmla v20.8h, v6.8h, v3.h[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      "fmla v9.8h, v7.8h, v0.h[2]\n"
+      "fmla v13.8h, v7.8h, v1.h[2]\n"
+      "fmla v17.8h, v7.8h, v2.h[2]\n"
+      "fmla v21.8h, v7.8h, v3.h[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      "fmla v10.8h, v6.8h, v0.h[2]\n"
+      "fmla v14.8h, v6.8h, v1.h[2]\n"
+      "fmla v18.8h, v6.8h, v2.h[2]\n"
+      "fmla v22.8h, v6.8h, v3.h[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      "fmla v11.8h, v7.8h, v0.h[2]\n"
+      "fmla v15.8h, v7.8h, v1.h[2]\n"
+      "fmla v19.8h, v7.8h, v2.h[2]\n"
+      "fmla v23.8h, v7.8h, v3.h[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      "fmla v8.8h, v6.8h, v0.h[3]\n"
+      "fmla v12.8h, v6.8h, v1.h[3]\n"
+      "fmla v16.8h, v6.8h, v2.h[3]\n"
+      "fmla v20.8h, v6.8h, v3.h[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      "fmla v9.8h, v7.8h, v0.h[3]\n"
+      "fmla v13.8h, v7.8h, v1.h[3]\n"
+      "fmla v17.8h, v7.8h, v2.h[3]\n"
+      "fmla v21.8h, v7.8h, v3.h[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "fmla v10.8h, v6.8h, v0.h[3]\n"
+      "fmla v14.8h, v6.8h, v1.h[3]\n"
+      "fmla v18.8h, v6.8h, v2.h[3]\n"
+      "fmla v22.8h, v6.8h, v3.h[3]\n"
+      "ldr q6, [x15, #0x100]\n"
+      "fmla v11.8h, v7.8h, v0.h[3]\n"
+      "fmla v15.8h, v7.8h, v1.h[3]\n"
+      "fmla v19.8h, v7.8h, v2.h[3]\n"
+      "fmla v23.8h, v7.8h, v3.h[3]\n"
+      "ldr q7, [x15, #0x110]\n"
+      "fmla v8.8h, v6.8h, v0.h[4]\n"
+      "fmla v12.8h, v6.8h, v1.h[4]\n"
+      "fmla v16.8h, v6.8h, v2.h[4]\n"
+      "fmla v20.8h, v6.8h, v3.h[4]\n"
+      "ldr q6, [x15, #0x120]\n"
+      "fmla v9.8h, v7.8h, v0.h[4]\n"
+      "fmla v13.8h, v7.8h, v1.h[4]\n"
+      "fmla v17.8h, v7.8h, v2.h[4]\n"
+      "fmla v21.8h, v7.8h, v3.h[4]\n"
+      "ldr q7, [x15, #0x130]\n"
+      "fmla v10.8h, v6.8h, v0.h[4]\n"
+      "fmla v14.8h, v6.8h, v1.h[4]\n"
+      "fmla v18.8h, v6.8h, v2.h[4]\n"
+      "fmla v22.8h, v6.8h, v3.h[4]\n"
+      "ldr q6, [x15, #0x140]\n"
+      "fmla v11.8h, v7.8h, v0.h[4]\n"
+      "fmla v15.8h, v7.8h, v1.h[4]\n"
+      "fmla v19.8h, v7.8h, v2.h[4]\n"
+      "fmla v23.8h, v7.8h, v3.h[4]\n"
+      "ldr q7, [x15, #0x150]\n"
+      "fmla v8.8h, v6.8h, v0.h[5]\n"
+      "fmla v12.8h, v6.8h, v1.h[5]\n"
+      "fmla v16.8h, v6.8h, v2.h[5]\n"
+      "fmla v20.8h, v6.8h, v3.h[5]\n"
+      "ldr q6, [x15, #0x160]\n"
+      "fmla v9.8h, v7.8h, v0.h[5]\n"
+      "fmla v13.8h, v7.8h, v1.h[5]\n"
+      "fmla v17.8h, v7.8h, v2.h[5]\n"
+      "fmla v21.8h, v7.8h, v3.h[5]\n"
+      "ldr q7, [x15, #0x170]\n"
+      "fmla v10.8h, v6.8h, v0.h[5]\n"
+      "fmla v14.8h, v6.8h, v1.h[5]\n"
+      "fmla v18.8h, v6.8h, v2.h[5]\n"
+      "fmla v22.8h, v6.8h, v3.h[5]\n"
+      "ldr q6, [x15, #0x180]\n"
+      "fmla v11.8h, v7.8h, v0.h[5]\n"
+      "fmla v15.8h, v7.8h, v1.h[5]\n"
+      "fmla v19.8h, v7.8h, v2.h[5]\n"
+      "fmla v23.8h, v7.8h, v3.h[5]\n"
+      "ldr q7, [x15, #0x190]\n"
+      "fmla v8.8h, v6.8h, v0.h[6]\n"
+      "fmla v12.8h, v6.8h, v1.h[6]\n"
+      "fmla v16.8h, v6.8h, v2.h[6]\n"
+      "fmla v20.8h, v6.8h, v3.h[6]\n"
+      "ldr q6, [x15, #0x1a0]\n"
+      "fmla v9.8h, v7.8h, v0.h[6]\n"
+      "fmla v13.8h, v7.8h, v1.h[6]\n"
+      "fmla v17.8h, v7.8h, v2.h[6]\n"
+      "fmla v21.8h, v7.8h, v3.h[6]\n"
+      "ldr q7, [x15, #0x1b0]\n"
+      "fmla v10.8h, v6.8h, v0.h[6]\n"
+      "fmla v14.8h, v6.8h, v1.h[6]\n"
+      "fmla v18.8h, v6.8h, v2.h[6]\n"
+      "fmla v22.8h, v6.8h, v3.h[6]\n"
+      "ldr q6, [x15, #0x1c0]\n"
+      "fmla v11.8h, v7.8h, v0.h[6]\n"
+      "fmla v15.8h, v7.8h, v1.h[6]\n"
+      "fmla v19.8h, v7.8h, v2.h[6]\n"
+      "fmla v23.8h, v7.8h, v3.h[6]\n"
+      "ldr q7, [x15, #0x1d0]\n"
+      "fmla v8.8h, v6.8h, v0.h[7]\n"
+      "fmla v12.8h, v6.8h, v1.h[7]\n"
+      "fmla v16.8h, v6.8h, v2.h[7]\n"
+      "fmla v20.8h, v6.8h, v3.h[7]\n"
+      "ldr q6, [x15, #0x1e0]\n"
+      "fmla v9.8h, v7.8h, v0.h[7]\n"
+      "fmla v13.8h, v7.8h, v1.h[7]\n"
+      "fmla v17.8h, v7.8h, v2.h[7]\n"
+      "fmla v21.8h, v7.8h, v3.h[7]\n"
+      "ldr q7, [x15, #0x1f0]\n"
+      "add x15, x15, #0x200\n"
+      "fmla v10.8h, v6.8h, v0.h[7]\n"
+      "fmla v14.8h, v6.8h, v1.h[7]\n"
+      "fmla v18.8h, v6.8h, v2.h[7]\n"
+      "fmla v22.8h, v6.8h, v3.h[7]\n"
+      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "fmla v15.8h, v7.8h, v1.h[7]\n"
+      "fmla v19.8h, v7.8h, v2.h[7]\n"
+      "fmla v23.8h, v7.8h, v3.h[7]\n"
+      "bge 177b\n"
+      "178:"  // Height 4: Multiply loop: Single iteration only
+      "sub x11, x11, #0x8\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.8h, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.8h, v6.8h, v1.h[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla v16.8h, v6.8h, v2.h[0]\n"
+      "add x28, x28, #0x10\n"
+      "fmla v20.8h, v6.8h, v3.h[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.8h, v7.8h, v0.h[0]\n"
+      "add x26, x26, #0x10\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "fmla v13.8h, v7.8h, v1.h[0]\n"
+      "add x24, x24, #0x10\n"
+      "fmla v17.8h, v7.8h, v2.h[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "fmla v21.8h, v7.8h, v3.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.8h, v6.8h, v0.h[0]\n"
+      "fmla v14.8h, v6.8h, v1.h[0]\n"
+      "fmla v18.8h, v6.8h, v2.h[0]\n"
+      "fmla v22.8h, v6.8h, v3.h[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      "fmla v11.8h, v7.8h, v0.h[0]\n"
+      "fmla v15.8h, v7.8h, v1.h[0]\n"
+      "fmla v19.8h, v7.8h, v2.h[0]\n"
+      "fmla v23.8h, v7.8h, v3.h[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      "fmla v8.8h, v6.8h, v0.h[1]\n"
+      "fmla v12.8h, v6.8h, v1.h[1]\n"
+      "fmla v16.8h, v6.8h, v2.h[1]\n"
+      "fmla v20.8h, v6.8h, v3.h[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      "fmla v9.8h, v7.8h, v0.h[1]\n"
+      "fmla v13.8h, v7.8h, v1.h[1]\n"
+      "fmla v17.8h, v7.8h, v2.h[1]\n"
+      "fmla v21.8h, v7.8h, v3.h[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      "fmla v10.8h, v6.8h, v0.h[1]\n"
+      "fmla v14.8h, v6.8h, v1.h[1]\n"
+      "fmla v18.8h, v6.8h, v2.h[1]\n"
+      "fmla v22.8h, v6.8h, v3.h[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      "fmla v11.8h, v7.8h, v0.h[1]\n"
+      "fmla v15.8h, v7.8h, v1.h[1]\n"
+      "fmla v19.8h, v7.8h, v2.h[1]\n"
+      "fmla v23.8h, v7.8h, v3.h[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      "fmla v8.8h, v6.8h, v0.h[2]\n"
+      "fmla v12.8h, v6.8h, v1.h[2]\n"
+      "fmla v16.8h, v6.8h, v2.h[2]\n"
+      "fmla v20.8h, v6.8h, v3.h[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      "fmla v9.8h, v7.8h, v0.h[2]\n"
+      "fmla v13.8h, v7.8h, v1.h[2]\n"
+      "fmla v17.8h, v7.8h, v2.h[2]\n"
+      "fmla v21.8h, v7.8h, v3.h[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      "fmla v10.8h, v6.8h, v0.h[2]\n"
+      "fmla v14.8h, v6.8h, v1.h[2]\n"
+      "fmla v18.8h, v6.8h, v2.h[2]\n"
+      "fmla v22.8h, v6.8h, v3.h[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      "fmla v11.8h, v7.8h, v0.h[2]\n"
+      "fmla v15.8h, v7.8h, v1.h[2]\n"
+      "fmla v19.8h, v7.8h, v2.h[2]\n"
+      "fmla v23.8h, v7.8h, v3.h[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      "fmla v8.8h, v6.8h, v0.h[3]\n"
+      "fmla v12.8h, v6.8h, v1.h[3]\n"
+      "fmla v16.8h, v6.8h, v2.h[3]\n"
+      "fmla v20.8h, v6.8h, v3.h[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      "fmla v9.8h, v7.8h, v0.h[3]\n"
+      "fmla v13.8h, v7.8h, v1.h[3]\n"
+      "fmla v17.8h, v7.8h, v2.h[3]\n"
+      "fmla v21.8h, v7.8h, v3.h[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "fmla v10.8h, v6.8h, v0.h[3]\n"
+      "fmla v14.8h, v6.8h, v1.h[3]\n"
+      "fmla v18.8h, v6.8h, v2.h[3]\n"
+      "fmla v22.8h, v6.8h, v3.h[3]\n"
+      "ldr q6, [x15, #0x100]\n"
+      "fmla v11.8h, v7.8h, v0.h[3]\n"
+      "fmla v15.8h, v7.8h, v1.h[3]\n"
+      "fmla v19.8h, v7.8h, v2.h[3]\n"
+      "fmla v23.8h, v7.8h, v3.h[3]\n"
+      "ldr q7, [x15, #0x110]\n"
+      "fmla v8.8h, v6.8h, v0.h[4]\n"
+      "fmla v12.8h, v6.8h, v1.h[4]\n"
+      "fmla v16.8h, v6.8h, v2.h[4]\n"
+      "fmla v20.8h, v6.8h, v3.h[4]\n"
+      "ldr q6, [x15, #0x120]\n"
+      "fmla v9.8h, v7.8h, v0.h[4]\n"
+      "fmla v13.8h, v7.8h, v1.h[4]\n"
+      "fmla v17.8h, v7.8h, v2.h[4]\n"
+      "fmla v21.8h, v7.8h, v3.h[4]\n"
+      "ldr q7, [x15, #0x130]\n"
+      "fmla v10.8h, v6.8h, v0.h[4]\n"
+      "fmla v14.8h, v6.8h, v1.h[4]\n"
+      "fmla v18.8h, v6.8h, v2.h[4]\n"
+      "fmla v22.8h, v6.8h, v3.h[4]\n"
+      "ldr q6, [x15, #0x140]\n"
+      "fmla v11.8h, v7.8h, v0.h[4]\n"
+      "fmla v15.8h, v7.8h, v1.h[4]\n"
+      "fmla v19.8h, v7.8h, v2.h[4]\n"
+      "fmla v23.8h, v7.8h, v3.h[4]\n"
+      "ldr q7, [x15, #0x150]\n"
+      "fmla v8.8h, v6.8h, v0.h[5]\n"
+      "fmla v12.8h, v6.8h, v1.h[5]\n"
+      "fmla v16.8h, v6.8h, v2.h[5]\n"
+      "fmla v20.8h, v6.8h, v3.h[5]\n"
+      "ldr q6, [x15, #0x160]\n"
+      "fmla v9.8h, v7.8h, v0.h[5]\n"
+      "fmla v13.8h, v7.8h, v1.h[5]\n"
+      "fmla v17.8h, v7.8h, v2.h[5]\n"
+      "fmla v21.8h, v7.8h, v3.h[5]\n"
+      "ldr q7, [x15, #0x170]\n"
+      "fmla v10.8h, v6.8h, v0.h[5]\n"
+      "fmla v14.8h, v6.8h, v1.h[5]\n"
+      "fmla v18.8h, v6.8h, v2.h[5]\n"
+      "fmla v22.8h, v6.8h, v3.h[5]\n"
+      "ldr q6, [x15, #0x180]\n"
+      "fmla v11.8h, v7.8h, v0.h[5]\n"
+      "fmla v15.8h, v7.8h, v1.h[5]\n"
+      "fmla v19.8h, v7.8h, v2.h[5]\n"
+      "fmla v23.8h, v7.8h, v3.h[5]\n"
+      "ldr q7, [x15, #0x190]\n"
+      "fmla v8.8h, v6.8h, v0.h[6]\n"
+      "fmla v12.8h, v6.8h, v1.h[6]\n"
+      "fmla v16.8h, v6.8h, v2.h[6]\n"
+      "fmla v20.8h, v6.8h, v3.h[6]\n"
+      "ldr q6, [x15, #0x1a0]\n"
+      "fmla v9.8h, v7.8h, v0.h[6]\n"
+      "fmla v13.8h, v7.8h, v1.h[6]\n"
+      "fmla v17.8h, v7.8h, v2.h[6]\n"
+      "fmla v21.8h, v7.8h, v3.h[6]\n"
+      "ldr q7, [x15, #0x1b0]\n"
+      "fmla v10.8h, v6.8h, v0.h[6]\n"
+      "fmla v14.8h, v6.8h, v1.h[6]\n"
+      "fmla v18.8h, v6.8h, v2.h[6]\n"
+      "fmla v22.8h, v6.8h, v3.h[6]\n"
+      "ldr q6, [x15, #0x1c0]\n"
+      "fmla v11.8h, v7.8h, v0.h[6]\n"
+      "fmla v15.8h, v7.8h, v1.h[6]\n"
+      "fmla v19.8h, v7.8h, v2.h[6]\n"
+      "fmla v23.8h, v7.8h, v3.h[6]\n"
+      "ldr q7, [x15, #0x1d0]\n"
+      "fmla v8.8h, v6.8h, v0.h[7]\n"
+      "fmla v12.8h, v6.8h, v1.h[7]\n"
+      "fmla v16.8h, v6.8h, v2.h[7]\n"
+      "fmla v20.8h, v6.8h, v3.h[7]\n"
+      "ldr q6, [x15, #0x1e0]\n"
+      "fmla v9.8h, v7.8h, v0.h[7]\n"
+      "fmla v13.8h, v7.8h, v1.h[7]\n"
+      "fmla v17.8h, v7.8h, v2.h[7]\n"
+      "fmla v21.8h, v7.8h, v3.h[7]\n"
+      "ldr q7, [x15, #0x1f0]\n"
+      "add x15, x15, #0x200\n"
+      "fmla v10.8h, v6.8h, v0.h[7]\n"
+      "fmla v14.8h, v6.8h, v1.h[7]\n"
+      "fmla v18.8h, v6.8h, v2.h[7]\n"
+      "fmla v22.8h, v6.8h, v3.h[7]\n"
+      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "fmla v15.8h, v7.8h, v1.h[7]\n"
+      "fmla v19.8h, v7.8h, v2.h[7]\n"
+      "fmla v23.8h, v7.8h, v3.h[7]\n"
+      "179:"  // Height 4: Multiply loop: Main loop skip
+      "cbz x11, 181f\n"
+      "180:"  // Height 4: Multiply loop: Odd block loop
+      "ldr h0, [x10], #0x2\n"
+      "ldr h1, [x28], #0x2\n"
+      "ldr h2, [x26], #0x2\n"
+      "ldr h3, [x24], #0x2\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.8h, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.8h, v6.8h, v1.h[0]\n"
+      "sub x11, x11, #0x1\n"
+      "fmla v16.8h, v6.8h, v2.h[0]\n"
+      "fmla v20.8h, v6.8h, v3.h[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.8h, v7.8h, v0.h[0]\n"
+      "fmla v13.8h, v7.8h, v1.h[0]\n"
+      "fmla v17.8h, v7.8h, v2.h[0]\n"
+      "fmla v21.8h, v7.8h, v3.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.8h, v6.8h, v0.h[0]\n"
+      "add x15, x15, #0x40\n"
+      "fmla v14.8h, v6.8h, v1.h[0]\n"
+      "fmla v18.8h, v6.8h, v2.h[0]\n"
+      "fmla v22.8h, v6.8h, v3.h[0]\n"
+      "fmla v11.8h, v7.8h, v0.h[0]\n"
+      "fmla v15.8h, v7.8h, v1.h[0]\n"
+      "fmla v19.8h, v7.8h, v2.h[0]\n"
+      "fmla v23.8h, v7.8h, v3.h[0]\n"
+      "cbnz x11, 180b\n"
+      "181:"  // Height 4: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 174b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "tbz %x[flags], #1, 182f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v1.8h }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v0.8h }, [x19]\n"
+      "fmin v8.8h, v8.8h, v0.8h\n"
+      "fmin v9.8h, v9.8h, v0.8h\n"
+      "fmin v10.8h, v10.8h, v0.8h\n"
+      "fmin v11.8h, v11.8h, v0.8h\n"
+      "fmax v8.8h, v8.8h, v1.8h\n"
+      "fmax v9.8h, v9.8h, v1.8h\n"
+      "fmax v10.8h, v10.8h, v1.8h\n"
+      "fmax v11.8h, v11.8h, v1.8h\n"
+      "fmin v12.8h, v12.8h, v0.8h\n"
+      "fmin v13.8h, v13.8h, v0.8h\n"
+      "fmin v14.8h, v14.8h, v0.8h\n"
+      "fmax v12.8h, v12.8h, v1.8h\n"
+      "fmax v13.8h, v13.8h, v1.8h\n"
+      "fmax v14.8h, v14.8h, v1.8h\n"
+      "fmin v15.8h, v15.8h, v0.8h\n"
+      "fmin v16.8h, v16.8h, v0.8h\n"
+      "fmin v17.8h, v17.8h, v0.8h\n"
+      "fmax v15.8h, v15.8h, v1.8h\n"
+      "fmax v16.8h, v16.8h, v1.8h\n"
+      "fmax v17.8h, v17.8h, v1.8h\n"
+      "fmin v18.8h, v18.8h, v0.8h\n"
+      "fmin v19.8h, v19.8h, v0.8h\n"
+      "fmin v20.8h, v20.8h, v0.8h\n"
+      "fmax v18.8h, v18.8h, v1.8h\n"
+      "fmax v19.8h, v19.8h, v1.8h\n"
+      "fmax v20.8h, v20.8h, v1.8h\n"
+      "fmin v21.8h, v21.8h, v0.8h\n"
+      "fmin v22.8h, v22.8h, v0.8h\n"
+      "fmin v23.8h, v23.8h, v0.8h\n"
+      "fmax v21.8h, v21.8h, v1.8h\n"
+      "fmax v22.8h, v22.8h, v1.8h\n"
+      "fmax v23.8h, v23.8h, v1.8h\n"
+      "182:"  // Height 4: No activation
+      "cmp x16, #0x20\n"
+      "bge 199f\n"
+      "tbz x16, #4, 190f\n"
+      "st1 { v8.8h }, [x13], #0x10\n"
+      "st1 { v9.8h }, [x13], #0x10\n"
+      "st1 { v12.8h }, [x9], #0x10\n"
+      "st1 { v13.8h }, [x9], #0x10\n"
+      "st1 { v16.8h }, [x27], #0x10\n"
+      "st1 { v17.8h }, [x27], #0x10\n"
+      "st1 { v20.8h }, [x25], #0x10\n"
+      "st1 { v21.8h }, [x25], #0x10\n"
+      "tbz x16, #3, 186f\n"
+      "st1 { v10.8h }, [x13], #0x10\n"
+      "st1 { v14.8h }, [x9], #0x10\n"
+      "st1 { v18.8h }, [x27], #0x10\n"
+      "st1 { v22.8h }, [x25], #0x10\n"
+      "tbz x16, #2, 184f\n"
+      "str d11, [x13], #0x8\n"
+      "str d15, [x9], #0x8\n"
+      "str d19, [x27], #0x8\n"
+      "str d23, [x25], #0x8\n"
+      "tbz x16, #1, 183f\n"
+      "st1 { v11.s }[2], [x13], #0x4\n"
+      "st1 { v15.s }[2], [x9], #0x4\n"
+      "st1 { v19.s }[2], [x27], #0x4\n"
+      "st1 { v23.s }[2], [x25], #0x4\n"
+      "tbz x16, #0, 198f\n"
+      "st1 { v11.h }[6], [x13]\n"
+      "st1 { v15.h }[6], [x9]\n"
+      "st1 { v19.h }[6], [x27]\n"
+      "st1 { v23.h }[6], [x25]\n"
+      "b 198f\n"
+      "183:"  // Height 4: Partial direct writeback: partial_1_28
+      "tbz x16, #0, 198f\n"
+      "st1 { v11.h }[4], [x13]\n"
+      "st1 { v15.h }[4], [x9]\n"
+      "st1 { v19.h }[4], [x27]\n"
+      "st1 { v23.h }[4], [x25]\n"
+      "b 198f\n"
+      "184:"  // Height 4: Partial direct writeback: partial_2_24
+      "tbz x16, #1, 185f\n"
+      "str s11, [x13], #0x4\n"
+      "str s15, [x9], #0x4\n"
+      "str s19, [x27], #0x4\n"
+      "str s23, [x25], #0x4\n"
+      "tbz x16, #0, 198f\n"
+      "st1 { v11.h }[2], [x13]\n"
+      "st1 { v15.h }[2], [x9]\n"
+      "st1 { v19.h }[2], [x27]\n"
+      "st1 { v23.h }[2], [x25]\n"
+      "b 198f\n"
+      "185:"  // Height 4: Partial direct writeback: partial_1_24
+      "tbz x16, #0, 198f\n"
+      "str h11, [x13, #0x0]\n"
+      "str h15, [x9, #0x0]\n"
+      "str h19, [x27, #0x0]\n"
+      "str h23, [x25, #0x0]\n"
+      "b 198f\n"
+      "186:"  // Height 4: Partial direct writeback: partial_4_16
+      "tbz x16, #2, 188f\n"
+      "str d10, [x13], #0x8\n"
+      "str d14, [x9], #0x8\n"
+      "str d18, [x27], #0x8\n"
+      "str d22, [x25], #0x8\n"
+      "tbz x16, #1, 187f\n"
+      "st1 { v10.s }[2], [x13], #0x4\n"
+      "st1 { v14.s }[2], [x9], #0x4\n"
+      "st1 { v18.s }[2], [x27], #0x4\n"
+      "st1 { v22.s }[2], [x25], #0x4\n"
+      "tbz x16, #0, 198f\n"
+      "st1 { v10.h }[6], [x13]\n"
+      "st1 { v14.h }[6], [x9]\n"
+      "st1 { v18.h }[6], [x27]\n"
+      "st1 { v22.h }[6], [x25]\n"
+      "b 198f\n"
+      "187:"  // Height 4: Partial direct writeback: partial_1_20
+      "tbz x16, #0, 198f\n"
+      "st1 { v10.h }[4], [x13]\n"
+      "st1 { v14.h }[4], [x9]\n"
+      "st1 { v18.h }[4], [x27]\n"
+      "st1 { v22.h }[4], [x25]\n"
+      "b 198f\n"
+      "188:"  // Height 4: Partial direct writeback: partial_2_16
+      "tbz x16, #1, 189f\n"
+      "str s10, [x13], #0x4\n"
+      "str s14, [x9], #0x4\n"
+      "str s18, [x27], #0x4\n"
+      "str s22, [x25], #0x4\n"
+      "tbz x16, #0, 198f\n"
+      "st1 { v10.h }[2], [x13]\n"
+      "st1 { v14.h }[2], [x9]\n"
+      "st1 { v18.h }[2], [x27]\n"
+      "st1 { v22.h }[2], [x25]\n"
+      "b 198f\n"
+      "189:"  // Height 4: Partial direct writeback: partial_1_16
+      "tbz x16, #0, 198f\n"
+      "str h10, [x13, #0x0]\n"
+      "str h14, [x9, #0x0]\n"
+      "str h18, [x27, #0x0]\n"
+      "str h22, [x25, #0x0]\n"
+      "b 198f\n"
+      "190:"  // Height 4: Partial direct writeback: partial_8_0
+      "tbz x16, #3, 194f\n"
+      "st1 { v8.8h }, [x13], #0x10\n"
+      "st1 { v12.8h }, [x9], #0x10\n"
+      "st1 { v16.8h }, [x27], #0x10\n"
+      "st1 { v20.8h }, [x25], #0x10\n"
+      "tbz x16, #2, 192f\n"
+      "str d9, [x13], #0x8\n"
+      "str d13, [x9], #0x8\n"
+      "str d17, [x27], #0x8\n"
+      "str d21, [x25], #0x8\n"
+      "tbz x16, #1, 191f\n"
+      "st1 { v9.s }[2], [x13], #0x4\n"
+      "st1 { v13.s }[2], [x9], #0x4\n"
+      "st1 { v17.s }[2], [x27], #0x4\n"
+      "st1 { v21.s }[2], [x25], #0x4\n"
+      "tbz x16, #0, 198f\n"
+      "st1 { v9.h }[6], [x13]\n"
+      "st1 { v13.h }[6], [x9]\n"
+      "st1 { v17.h }[6], [x27]\n"
+      "st1 { v21.h }[6], [x25]\n"
+      "b 198f\n"
+      "191:"  // Height 4: Partial direct writeback: partial_1_12
+      "tbz x16, #0, 198f\n"
+      "st1 { v9.h }[4], [x13]\n"
+      "st1 { v13.h }[4], [x9]\n"
+      "st1 { v17.h }[4], [x27]\n"
+      "st1 { v21.h }[4], [x25]\n"
+      "b 198f\n"
+      "192:"  // Height 4: Partial direct writeback: partial_2_8
+      "tbz x16, #1, 193f\n"
+      "str s9, [x13], #0x4\n"
+      "str s13, [x9], #0x4\n"
+      "str s17, [x27], #0x4\n"
+      "str s21, [x25], #0x4\n"
+      "tbz x16, #0, 198f\n"
+      "st1 { v9.h }[2], [x13]\n"
+      "st1 { v13.h }[2], [x9]\n"
+      "st1 { v17.h }[2], [x27]\n"
+      "st1 { v21.h }[2], [x25]\n"
+      "b 198f\n"
+      "193:"  // Height 4: Partial direct writeback: partial_1_8
+      "tbz x16, #0, 198f\n"
+      "str h9, [x13, #0x0]\n"
+      "str h13, [x9, #0x0]\n"
+      "str h17, [x27, #0x0]\n"
+      "str h21, [x25, #0x0]\n"
+      "b 198f\n"
+      "194:"  // Height 4: Partial direct writeback: partial_4_0
+      "tbz x16, #2, 196f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "str d16, [x27], #0x8\n"
+      "str d20, [x25], #0x8\n"
+      "tbz x16, #1, 195f\n"
+      "st1 { v8.s }[2], [x13], #0x4\n"
+      "st1 { v12.s }[2], [x9], #0x4\n"
+      "st1 { v16.s }[2], [x27], #0x4\n"
+      "st1 { v20.s }[2], [x25], #0x4\n"
+      "tbz x16, #0, 198f\n"
+      "st1 { v8.h }[6], [x13]\n"
+      "st1 { v12.h }[6], [x9]\n"
+      "st1 { v16.h }[6], [x27]\n"
+      "st1 { v20.h }[6], [x25]\n"
+      "b 198f\n"
+      "195:"  // Height 4: Partial direct writeback: partial_1_4
+      "tbz x16, #0, 198f\n"
+      "st1 { v8.h }[4], [x13]\n"
+      "st1 { v12.h }[4], [x9]\n"
+      "st1 { v16.h }[4], [x27]\n"
+      "st1 { v20.h }[4], [x25]\n"
+      "b 198f\n"
+      "196:"  // Height 4: Partial direct writeback: partial_2_0
+      "tbz x16, #1, 197f\n"
+      "str s8, [x13], #0x4\n"
+      "str s12, [x9], #0x4\n"
+      "str s16, [x27], #0x4\n"
+      "str s20, [x25], #0x4\n"
+      "tbz x16, #0, 198f\n"
+      "st1 { v8.h }[2], [x13]\n"
+      "st1 { v12.h }[2], [x9]\n"
+      "st1 { v16.h }[2], [x27]\n"
+      "st1 { v20.h }[2], [x25]\n"
+      "b 198f\n"
+      "197:"  // Height 4: Partial direct writeback: partial_1_0
+      "str h8, [x13, #0x0]\n"
+      "str h12, [x9, #0x0]\n"
+      "str h16, [x27, #0x0]\n"
+      "str h20, [x25, #0x0]\n"
+      "198:"  // Height 4: Partial direct writeback: Done
+      "b 200f\n"
+      "199:"  // Height 4: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "str q12, [x9, #0x0]\n"
+      "str q13, [x9, #0x10]\n"
+      "str q14, [x9, #0x20]\n"
+      "str q15, [x9, #0x30]\n"
+      "str q16, [x27, #0x0]\n"
+      "str q17, [x27, #0x10]\n"
+      "str q18, [x27, #0x20]\n"
+      "str q19, [x27, #0x30]\n"
+      "str q20, [x25, #0x0]\n"
+      "str q21, [x25, #0x10]\n"
+      "str q22, [x25, #0x20]\n"
+      "str q23, [x25, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "add x9, x9, #0x40\n"
+      "add x27, x27, #0x40\n"
+      "add x25, x25, #0x40\n"
+      "200:"  // Height 4: Writeback done
+      "subs x16, x16, #0x20\n"
+      "bgt 153b\n"
+      "b 302f\n"
+      "201:"  // Height 5
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 202f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #1\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #1\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "ldr x23, [%x[output_ptr], #0x20]\n"
+      "add x27, x27, x19, LSL #1\n"
+      "add x25, x25, x19, LSL #1\n"
+      "add x23, x23, x19, LSL #1\n"
+      "b 203f\n"
+      "202:"  // Height 5: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #1\n"
+      "add x27, x9, x19, LSL #1\n"
+      "add x25, x27, x19, LSL #1\n"
+      "add x23, x25, x19, LSL #1\n"
+      "203:"  // Height 5: Column loop
+      "cbz x14, 204f\n"
+      "ldr q8, [x14, #0x0]\n"
+      "mov v12.16b, v8.16b\n"
+      "ldr q9, [x14, #0x10]\n"
+      "mov v16.16b, v8.16b\n"
+      "ldr q10, [x14, #0x20]\n"
+      "mov v20.16b, v8.16b\n"
+      "ldr q11, [x14, #0x30]\n"
+      "mov v24.16b, v8.16b\n"
+      "add x14, x14, #0x40\n"
+      "mov v13.16b, v9.16b\n"
+      "mov v17.16b, v9.16b\n"
+      "mov v14.16b, v10.16b\n"
+      "mov v15.16b, v11.16b\n"
+      "mov v18.16b, v10.16b\n"
+      "mov v19.16b, v11.16b\n"
+      "mov v21.16b, v9.16b\n"
+      "mov v22.16b, v10.16b\n"
+      "mov v23.16b, v11.16b\n"
+      "mov v25.16b, v9.16b\n"
+      "mov v26.16b, v10.16b\n"
+      "mov v27.16b, v11.16b\n"
+      "b 223f\n"
+      "204:"  // Height 5: no bias
+      "tbz %x[flags], #0, 222f\n"
+      "cmp x16, #0x20\n"
+      "bge 221f\n"
+      "tbz x16, #4, 212f\n"
+      "ld1 { v8.8h }, [x13], #0x10\n"
+      "ld1 { v12.8h }, [x9], #0x10\n"
+      "ld1 { v16.8h }, [x27], #0x10\n"
+      "ld1 { v20.8h }, [x25], #0x10\n"
+      "ld1 { v24.8h }, [x23], #0x10\n"
+      "ld1 { v9.8h }, [x13], #0x10\n"
+      "ld1 { v13.8h }, [x9], #0x10\n"
+      "ld1 { v17.8h }, [x27], #0x10\n"
+      "ld1 { v21.8h }, [x25], #0x10\n"
+      "ld1 { v25.8h }, [x23], #0x10\n"
+      "tbz x16, #3, 208f\n"
+      "ld1 { v10.8h }, [x13], #0x10\n"
+      "ld1 { v14.8h }, [x9], #0x10\n"
+      "ld1 { v18.8h }, [x27], #0x10\n"
+      "ld1 { v22.8h }, [x25], #0x10\n"
+      "ld1 { v26.8h }, [x23], #0x10\n"
+      "tbz x16, #2, 206f\n"
+      "ldr d11, [x13], #0x8\n"
+      "ldr d15, [x9], #0x8\n"
+      "ldr d19, [x27], #0x8\n"
+      "ldr d23, [x25], #0x8\n"
+      "ldr d27, [x23], #0x8\n"
+      "tbz x16, #1, 205f\n"
+      "ld1 { v11.s }[2], [x13], #0x4\n"
+      "ld1 { v15.s }[2], [x9], #0x4\n"
+      "ld1 { v19.s }[2], [x27], #0x4\n"
+      "ld1 { v23.s }[2], [x25], #0x4\n"
+      "ld1 { v27.s }[2], [x23], #0x4\n"
+      "mov x19, #0x3c\n"
+      "tbz x16, #0, 220f\n"
+      "ld1 { v11.h }[6], [x13]\n"
+      "ld1 { v15.h }[6], [x9]\n"
+      "ld1 { v19.h }[6], [x27]\n"
+      "ld1 { v23.h }[6], [x25]\n"
+      "ld1 { v27.h }[6], [x23]\n"
+      "b 220f\n"
+      "205:"  // Height 5: Partial accumulate: partial_1_28
+      "mov x19, #0x38\n"
+      "tbz x16, #0, 220f\n"
+      "ld1 { v11.h }[4], [x13]\n"
+      "ld1 { v15.h }[4], [x9]\n"
+      "ld1 { v19.h }[4], [x27]\n"
+      "ld1 { v23.h }[4], [x25]\n"
+      "ld1 { v27.h }[4], [x23]\n"
+      "b 220f\n"
+      "206:"  // Height 5: Partial accumulate: partial_2_24
+      "tbz x16, #1, 207f\n"
+      "ldr s11, [x13], #0x4\n"
+      "ldr s15, [x9], #0x4\n"
+      "ldr s19, [x27], #0x4\n"
+      "ldr s23, [x25], #0x4\n"
+      "ldr s27, [x23], #0x4\n"
+      "mov x19, #0x34\n"
+      "tbz x16, #0, 220f\n"
+      "ld1 { v11.h }[2], [x13]\n"
+      "ld1 { v15.h }[2], [x9]\n"
+      "ld1 { v19.h }[2], [x27]\n"
+      "ld1 { v23.h }[2], [x25]\n"
+      "ld1 { v27.h }[2], [x23]\n"
+      "b 220f\n"
+      "207:"  // Height 5: Partial accumulate: partial_1_24
+      "mov x19, #0x30\n"
+      "tbz x16, #0, 220f\n"
+      "ldr h11, [x13, #0x0]\n"
+      "ldr h15, [x9, #0x0]\n"
+      "ldr h19, [x27, #0x0]\n"
+      "ldr h23, [x25, #0x0]\n"
+      "ldr h27, [x23, #0x0]\n"
+      "b 220f\n"
+      "208:"  // Height 5: Partial accumulate: partial_4_16
+      "tbz x16, #2, 210f\n"
+      "ldr d10, [x13], #0x8\n"
+      "ldr d14, [x9], #0x8\n"
+      "ldr d18, [x27], #0x8\n"
+      "ldr d22, [x25], #0x8\n"
+      "ldr d26, [x23], #0x8\n"
+      "tbz x16, #1, 209f\n"
+      "ld1 { v10.s }[2], [x13], #0x4\n"
+      "ld1 { v14.s }[2], [x9], #0x4\n"
+      "ld1 { v18.s }[2], [x27], #0x4\n"
+      "ld1 { v22.s }[2], [x25], #0x4\n"
+      "ld1 { v26.s }[2], [x23], #0x4\n"
+      "mov x19, #0x2c\n"
+      "tbz x16, #0, 220f\n"
+      "ld1 { v10.h }[6], [x13]\n"
+      "ld1 { v14.h }[6], [x9]\n"
+      "ld1 { v18.h }[6], [x27]\n"
+      "ld1 { v22.h }[6], [x25]\n"
+      "ld1 { v26.h }[6], [x23]\n"
+      "b 220f\n"
+      "209:"  // Height 5: Partial accumulate: partial_1_20
+      "mov x19, #0x28\n"
+      "tbz x16, #0, 220f\n"
+      "ld1 { v10.h }[4], [x13]\n"
+      "ld1 { v14.h }[4], [x9]\n"
+      "ld1 { v18.h }[4], [x27]\n"
+      "ld1 { v22.h }[4], [x25]\n"
+      "ld1 { v26.h }[4], [x23]\n"
+      "b 220f\n"
+      "210:"  // Height 5: Partial accumulate: partial_2_16
+      "tbz x16, #1, 211f\n"
+      "ldr s10, [x13], #0x4\n"
+      "ldr s14, [x9], #0x4\n"
+      "ldr s18, [x27], #0x4\n"
+      "ldr s22, [x25], #0x4\n"
+      "ldr s26, [x23], #0x4\n"
+      "mov x19, #0x24\n"
+      "tbz x16, #0, 220f\n"
+      "ld1 { v10.h }[2], [x13]\n"
+      "ld1 { v14.h }[2], [x9]\n"
+      "ld1 { v18.h }[2], [x27]\n"
+      "ld1 { v22.h }[2], [x25]\n"
+      "ld1 { v26.h }[2], [x23]\n"
+      "b 220f\n"
+      "211:"  // Height 5: Partial accumulate: partial_1_16
+      "mov x19, #0x20\n"
+      "tbz x16, #0, 220f\n"
+      "ldr h10, [x13, #0x0]\n"
+      "ldr h14, [x9, #0x0]\n"
+      "ldr h18, [x27, #0x0]\n"
+      "ldr h22, [x25, #0x0]\n"
+      "ldr h26, [x23, #0x0]\n"
+      "b 220f\n"
+      "212:"  // Height 5: Partial accumulate: partial_8_0
+      "tbz x16, #3, 216f\n"
+      "ld1 { v8.8h }, [x13], #0x10\n"
+      "ld1 { v12.8h }, [x9], #0x10\n"
+      "ld1 { v16.8h }, [x27], #0x10\n"
+      "ld1 { v20.8h }, [x25], #0x10\n"
+      "ld1 { v24.8h }, [x23], #0x10\n"
+      "tbz x16, #2, 214f\n"
+      "ldr d9, [x13], #0x8\n"
+      "ldr d13, [x9], #0x8\n"
+      "ldr d17, [x27], #0x8\n"
+      "ldr d21, [x25], #0x8\n"
+      "ldr d25, [x23], #0x8\n"
+      "tbz x16, #1, 213f\n"
+      "ld1 { v9.s }[2], [x13], #0x4\n"
+      "ld1 { v13.s }[2], [x9], #0x4\n"
+      "ld1 { v17.s }[2], [x27], #0x4\n"
+      "ld1 { v21.s }[2], [x25], #0x4\n"
+      "ld1 { v25.s }[2], [x23], #0x4\n"
+      "mov x19, #0x1c\n"
+      "tbz x16, #0, 220f\n"
+      "ld1 { v9.h }[6], [x13]\n"
+      "ld1 { v13.h }[6], [x9]\n"
+      "ld1 { v17.h }[6], [x27]\n"
+      "ld1 { v21.h }[6], [x25]\n"
+      "ld1 { v25.h }[6], [x23]\n"
+      "b 220f\n"
+      "213:"  // Height 5: Partial accumulate: partial_1_12
+      "mov x19, #0x18\n"
+      "tbz x16, #0, 220f\n"
+      "ld1 { v9.h }[4], [x13]\n"
+      "ld1 { v13.h }[4], [x9]\n"
+      "ld1 { v17.h }[4], [x27]\n"
+      "ld1 { v21.h }[4], [x25]\n"
+      "ld1 { v25.h }[4], [x23]\n"
+      "b 220f\n"
+      "214:"  // Height 5: Partial accumulate: partial_2_8
+      "tbz x16, #1, 215f\n"
+      "ldr s9, [x13], #0x4\n"
+      "ldr s13, [x9], #0x4\n"
+      "ldr s17, [x27], #0x4\n"
+      "ldr s21, [x25], #0x4\n"
+      "ldr s25, [x23], #0x4\n"
+      "mov x19, #0x14\n"
+      "tbz x16, #0, 220f\n"
+      "ld1 { v9.h }[2], [x13]\n"
+      "ld1 { v13.h }[2], [x9]\n"
+      "ld1 { v17.h }[2], [x27]\n"
+      "ld1 { v21.h }[2], [x25]\n"
+      "ld1 { v25.h }[2], [x23]\n"
+      "b 220f\n"
+      "215:"  // Height 5: Partial accumulate: partial_1_8
+      "mov x19, #0x10\n"
+      "tbz x16, #0, 220f\n"
+      "ldr h9, [x13, #0x0]\n"
+      "ldr h13, [x9, #0x0]\n"
+      "ldr h17, [x27, #0x0]\n"
+      "ldr h21, [x25, #0x0]\n"
+      "ldr h25, [x23, #0x0]\n"
+      "b 220f\n"
+      "216:"  // Height 5: Partial accumulate: partial_4_0
+      "tbz x16, #2, 218f\n"
+      "ldr d8, [x13], #0x8\n"
+      "ldr d12, [x9], #0x8\n"
+      "ldr d16, [x27], #0x8\n"
+      "ldr d20, [x25], #0x8\n"
+      "ldr d24, [x23], #0x8\n"
+      "tbz x16, #1, 217f\n"
+      "ld1 { v8.s }[2], [x13], #0x4\n"
+      "ld1 { v12.s }[2], [x9], #0x4\n"
+      "ld1 { v16.s }[2], [x27], #0x4\n"
+      "ld1 { v20.s }[2], [x25], #0x4\n"
+      "ld1 { v24.s }[2], [x23], #0x4\n"
+      "mov x19, #0xc\n"
+      "tbz x16, #0, 220f\n"
+      "ld1 { v8.h }[6], [x13]\n"
+      "ld1 { v12.h }[6], [x9]\n"
+      "ld1 { v16.h }[6], [x27]\n"
+      "ld1 { v20.h }[6], [x25]\n"
+      "ld1 { v24.h }[6], [x23]\n"
+      "b 220f\n"
+      "217:"  // Height 5: Partial accumulate: partial_1_4
+      "mov x19, #0x8\n"
+      "tbz x16, #0, 220f\n"
+      "ld1 { v8.h }[4], [x13]\n"
+      "ld1 { v12.h }[4], [x9]\n"
+      "ld1 { v16.h }[4], [x27]\n"
+      "ld1 { v20.h }[4], [x25]\n"
+      "ld1 { v24.h }[4], [x23]\n"
+      "b 220f\n"
+      "218:"  // Height 5: Partial accumulate: partial_2_0
+      "tbz x16, #1, 219f\n"
+      "ldr s8, [x13], #0x4\n"
+      "ldr s12, [x9], #0x4\n"
+      "ldr s16, [x27], #0x4\n"
+      "ldr s20, [x25], #0x4\n"
+      "ldr s24, [x23], #0x4\n"
+      "mov x19, #0x4\n"
+      "tbz x16, #0, 220f\n"
+      "ld1 { v8.h }[2], [x13]\n"
+      "ld1 { v12.h }[2], [x9]\n"
+      "ld1 { v16.h }[2], [x27]\n"
+      "ld1 { v20.h }[2], [x25]\n"
+      "ld1 { v24.h }[2], [x23]\n"
+      "b 220f\n"
+      "219:"  // Height 5: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr h8, [x13, #0x0]\n"
+      "ldr h12, [x9, #0x0]\n"
+      "ldr h16, [x27, #0x0]\n"
+      "ldr h20, [x25, #0x0]\n"
+      "ldr h24, [x23, #0x0]\n"
+      "220:"  // Height 5: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "sub x9, x9, x19\n"
+      "sub x27, x27, x19\n"
+      "sub x25, x25, x19\n"
+      "sub x23, x23, x19\n"
+      "b 223f\n"
+      "221:"  // Height 5: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "ldr q12, [x9, #0x0]\n"
+      "ldr q13, [x9, #0x10]\n"
+      "ldr q14, [x9, #0x20]\n"
+      "ldr q15, [x9, #0x30]\n"
+      "ldr q16, [x27, #0x0]\n"
+      "ldr q17, [x27, #0x10]\n"
+      "ldr q18, [x27, #0x20]\n"
+      "ldr q19, [x27, #0x30]\n"
+      "ldr q20, [x25, #0x0]\n"
+      "ldr q21, [x25, #0x10]\n"
+      "ldr q22, [x25, #0x20]\n"
+      "ldr q23, [x25, #0x30]\n"
+      "ldr q24, [x23, #0x0]\n"
+      "ldr q25, [x23, #0x10]\n"
+      "ldr q26, [x23, #0x20]\n"
+      "ldr q27, [x23, #0x30]\n"
+      "b 223f\n"
+      "222:"  // Height 5: no accumulate
+      "movi v8.16b, #0x0\n"
+      "movi v9.16b, #0x0\n"
+      "movi v10.16b, #0x0\n"
+      "movi v11.16b, #0x0\n"
+      "movi v12.16b, #0x0\n"
+      "movi v13.16b, #0x0\n"
+      "movi v14.16b, #0x0\n"
+      "movi v15.16b, #0x0\n"
+      "movi v16.16b, #0x0\n"
+      "movi v17.16b, #0x0\n"
+      "movi v18.16b, #0x0\n"
+      "movi v19.16b, #0x0\n"
+      "movi v20.16b, #0x0\n"
+      "movi v21.16b, #0x0\n"
+      "movi v22.16b, #0x0\n"
+      "movi v23.16b, #0x0\n"
+      "movi v24.16b, #0x0\n"
+      "movi v25.16b, #0x0\n"
+      "movi v26.16b, #0x0\n"
+      "movi v27.16b, #0x0\n"
+      "223:"  // Height 5: setup done
+      "mov x12, #0x0\n"
+      "224:"  // Height 5: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 225f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "cbnz x12, 226f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #1\n"
+      "add x28, x28, x19, LSL #1\n"
+      "add x26, x26, x19, LSL #1\n"
+      "add x24, x24, x19, LSL #1\n"
+      "add x22, x22, x19, LSL #1\n"
+      "b 226f\n"
+      "225:"  // Height 5: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #1\n"
+      "add x26, x28, x19, LSL #1\n"
+      "add x24, x26, x19, LSL #1\n"
+      "add x22, x24, x19, LSL #1\n"
+      "226:"  // Height 5: input setup done
+      "cmp x11, #0x8\n"
+      "blt 229f\n"
+      "cmp x11, #0x10\n"
+      "blt 228f\n"
+      "227:"  // Height 5: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q4, [x22, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.8h, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.8h, v6.8h, v1.h[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla v16.8h, v6.8h, v2.h[0]\n"
+      "add x28, x28, #0x10\n"
+      "fmla v20.8h, v6.8h, v3.h[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      "fmla v24.8h, v6.8h, v4.h[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.8h, v7.8h, v0.h[0]\n"
+      "add x24, x24, #0x10\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "fmla v13.8h, v7.8h, v1.h[0]\n"
+      "add x22, x22, #0x10\n"
+      "fmla v17.8h, v7.8h, v2.h[0]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "sub x11, x11, #0x8\n"
+      "fmla v21.8h, v7.8h, v3.h[0]\n"
+      "cmp x11, #0x10\n"
+      "fmla v25.8h, v7.8h, v4.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.8h, v6.8h, v0.h[0]\n"
+      "fmla v14.8h, v6.8h, v1.h[0]\n"
+      "fmla v18.8h, v6.8h, v2.h[0]\n"
+      "fmla v22.8h, v6.8h, v3.h[0]\n"
+      "fmla v26.8h, v6.8h, v4.h[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      "fmla v11.8h, v7.8h, v0.h[0]\n"
+      "fmla v15.8h, v7.8h, v1.h[0]\n"
+      "fmla v19.8h, v7.8h, v2.h[0]\n"
+      "fmla v23.8h, v7.8h, v3.h[0]\n"
+      "fmla v27.8h, v7.8h, v4.h[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      "fmla v8.8h, v6.8h, v0.h[1]\n"
+      "fmla v12.8h, v6.8h, v1.h[1]\n"
+      "fmla v16.8h, v6.8h, v2.h[1]\n"
+      "fmla v20.8h, v6.8h, v3.h[1]\n"
+      "fmla v24.8h, v6.8h, v4.h[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      "fmla v9.8h, v7.8h, v0.h[1]\n"
+      "fmla v13.8h, v7.8h, v1.h[1]\n"
+      "fmla v17.8h, v7.8h, v2.h[1]\n"
+      "fmla v21.8h, v7.8h, v3.h[1]\n"
+      "fmla v25.8h, v7.8h, v4.h[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      "fmla v10.8h, v6.8h, v0.h[1]\n"
+      "fmla v14.8h, v6.8h, v1.h[1]\n"
+      "fmla v18.8h, v6.8h, v2.h[1]\n"
+      "fmla v22.8h, v6.8h, v3.h[1]\n"
+      "fmla v26.8h, v6.8h, v4.h[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      "fmla v11.8h, v7.8h, v0.h[1]\n"
+      "fmla v15.8h, v7.8h, v1.h[1]\n"
+      "fmla v19.8h, v7.8h, v2.h[1]\n"
+      "fmla v23.8h, v7.8h, v3.h[1]\n"
+      "fmla v27.8h, v7.8h, v4.h[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      "fmla v8.8h, v6.8h, v0.h[2]\n"
+      "fmla v12.8h, v6.8h, v1.h[2]\n"
+      "fmla v16.8h, v6.8h, v2.h[2]\n"
+      "fmla v20.8h, v6.8h, v3.h[2]\n"
+      "fmla v24.8h, v6.8h, v4.h[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      "fmla v9.8h, v7.8h, v0.h[2]\n"
+      "fmla v13.8h, v7.8h, v1.h[2]\n"
+      "fmla v17.8h, v7.8h, v2.h[2]\n"
+      "fmla v21.8h, v7.8h, v3.h[2]\n"
+      "fmla v25.8h, v7.8h, v4.h[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      "fmla v10.8h, v6.8h, v0.h[2]\n"
+      "fmla v14.8h, v6.8h, v1.h[2]\n"
+      "fmla v18.8h, v6.8h, v2.h[2]\n"
+      "fmla v22.8h, v6.8h, v3.h[2]\n"
+      "fmla v26.8h, v6.8h, v4.h[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      "fmla v11.8h, v7.8h, v0.h[2]\n"
+      "fmla v15.8h, v7.8h, v1.h[2]\n"
+      "fmla v19.8h, v7.8h, v2.h[2]\n"
+      "fmla v23.8h, v7.8h, v3.h[2]\n"
+      "fmla v27.8h, v7.8h, v4.h[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      "fmla v8.8h, v6.8h, v0.h[3]\n"
+      "fmla v12.8h, v6.8h, v1.h[3]\n"
+      "fmla v16.8h, v6.8h, v2.h[3]\n"
+      "fmla v20.8h, v6.8h, v3.h[3]\n"
+      "fmla v24.8h, v6.8h, v4.h[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      "fmla v9.8h, v7.8h, v0.h[3]\n"
+      "fmla v13.8h, v7.8h, v1.h[3]\n"
+      "fmla v17.8h, v7.8h, v2.h[3]\n"
+      "fmla v21.8h, v7.8h, v3.h[3]\n"
+      "fmla v25.8h, v7.8h, v4.h[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "fmla v10.8h, v6.8h, v0.h[3]\n"
+      "fmla v14.8h, v6.8h, v1.h[3]\n"
+      "fmla v18.8h, v6.8h, v2.h[3]\n"
+      "fmla v22.8h, v6.8h, v3.h[3]\n"
+      "fmla v26.8h, v6.8h, v4.h[3]\n"
+      "ldr q6, [x15, #0x100]\n"
+      "fmla v11.8h, v7.8h, v0.h[3]\n"
+      "fmla v15.8h, v7.8h, v1.h[3]\n"
+      "fmla v19.8h, v7.8h, v2.h[3]\n"
+      "fmla v23.8h, v7.8h, v3.h[3]\n"
+      "fmla v27.8h, v7.8h, v4.h[3]\n"
+      "ldr q7, [x15, #0x110]\n"
+      "fmla v8.8h, v6.8h, v0.h[4]\n"
+      "fmla v12.8h, v6.8h, v1.h[4]\n"
+      "fmla v16.8h, v6.8h, v2.h[4]\n"
+      "fmla v20.8h, v6.8h, v3.h[4]\n"
+      "fmla v24.8h, v6.8h, v4.h[4]\n"
+      "ldr q6, [x15, #0x120]\n"
+      "fmla v9.8h, v7.8h, v0.h[4]\n"
+      "fmla v13.8h, v7.8h, v1.h[4]\n"
+      "fmla v17.8h, v7.8h, v2.h[4]\n"
+      "fmla v21.8h, v7.8h, v3.h[4]\n"
+      "fmla v25.8h, v7.8h, v4.h[4]\n"
+      "ldr q7, [x15, #0x130]\n"
+      "fmla v10.8h, v6.8h, v0.h[4]\n"
+      "fmla v14.8h, v6.8h, v1.h[4]\n"
+      "fmla v18.8h, v6.8h, v2.h[4]\n"
+      "fmla v22.8h, v6.8h, v3.h[4]\n"
+      "fmla v26.8h, v6.8h, v4.h[4]\n"
+      "ldr q6, [x15, #0x140]\n"
+      "fmla v11.8h, v7.8h, v0.h[4]\n"
+      "fmla v15.8h, v7.8h, v1.h[4]\n"
+      "fmla v19.8h, v7.8h, v2.h[4]\n"
+      "fmla v23.8h, v7.8h, v3.h[4]\n"
+      "fmla v27.8h, v7.8h, v4.h[4]\n"
+      "ldr q7, [x15, #0x150]\n"
+      "fmla v8.8h, v6.8h, v0.h[5]\n"
+      "fmla v12.8h, v6.8h, v1.h[5]\n"
+      "fmla v16.8h, v6.8h, v2.h[5]\n"
+      "fmla v20.8h, v6.8h, v3.h[5]\n"
+      "fmla v24.8h, v6.8h, v4.h[5]\n"
+      "ldr q6, [x15, #0x160]\n"
+      "fmla v9.8h, v7.8h, v0.h[5]\n"
+      "fmla v13.8h, v7.8h, v1.h[5]\n"
+      "fmla v17.8h, v7.8h, v2.h[5]\n"
+      "fmla v21.8h, v7.8h, v3.h[5]\n"
+      "fmla v25.8h, v7.8h, v4.h[5]\n"
+      "ldr q7, [x15, #0x170]\n"
+      "fmla v10.8h, v6.8h, v0.h[5]\n"
+      "fmla v14.8h, v6.8h, v1.h[5]\n"
+      "fmla v18.8h, v6.8h, v2.h[5]\n"
+      "fmla v22.8h, v6.8h, v3.h[5]\n"
+      "fmla v26.8h, v6.8h, v4.h[5]\n"
+      "ldr q6, [x15, #0x180]\n"
+      "fmla v11.8h, v7.8h, v0.h[5]\n"
+      "fmla v15.8h, v7.8h, v1.h[5]\n"
+      "fmla v19.8h, v7.8h, v2.h[5]\n"
+      "fmla v23.8h, v7.8h, v3.h[5]\n"
+      "fmla v27.8h, v7.8h, v4.h[5]\n"
+      "ldr q7, [x15, #0x190]\n"
+      "fmla v8.8h, v6.8h, v0.h[6]\n"
+      "fmla v12.8h, v6.8h, v1.h[6]\n"
+      "fmla v16.8h, v6.8h, v2.h[6]\n"
+      "fmla v20.8h, v6.8h, v3.h[6]\n"
+      "fmla v24.8h, v6.8h, v4.h[6]\n"
+      "ldr q6, [x15, #0x1a0]\n"
+      "fmla v9.8h, v7.8h, v0.h[6]\n"
+      "fmla v13.8h, v7.8h, v1.h[6]\n"
+      "fmla v17.8h, v7.8h, v2.h[6]\n"
+      "fmla v21.8h, v7.8h, v3.h[6]\n"
+      "fmla v25.8h, v7.8h, v4.h[6]\n"
+      "ldr q7, [x15, #0x1b0]\n"
+      "fmla v10.8h, v6.8h, v0.h[6]\n"
+      "fmla v14.8h, v6.8h, v1.h[6]\n"
+      "fmla v18.8h, v6.8h, v2.h[6]\n"
+      "fmla v22.8h, v6.8h, v3.h[6]\n"
+      "fmla v26.8h, v6.8h, v4.h[6]\n"
+      "ldr q6, [x15, #0x1c0]\n"
+      "fmla v11.8h, v7.8h, v0.h[6]\n"
+      "fmla v15.8h, v7.8h, v1.h[6]\n"
+      "fmla v19.8h, v7.8h, v2.h[6]\n"
+      "fmla v23.8h, v7.8h, v3.h[6]\n"
+      "fmla v27.8h, v7.8h, v4.h[6]\n"
+      "ldr q7, [x15, #0x1d0]\n"
+      "fmla v8.8h, v6.8h, v0.h[7]\n"
+      "fmla v12.8h, v6.8h, v1.h[7]\n"
+      "fmla v16.8h, v6.8h, v2.h[7]\n"
+      "fmla v20.8h, v6.8h, v3.h[7]\n"
+      "fmla v24.8h, v6.8h, v4.h[7]\n"
+      "ldr q6, [x15, #0x1e0]\n"
+      "fmla v9.8h, v7.8h, v0.h[7]\n"
+      "fmla v13.8h, v7.8h, v1.h[7]\n"
+      "fmla v17.8h, v7.8h, v2.h[7]\n"
+      "fmla v21.8h, v7.8h, v3.h[7]\n"
+      "fmla v25.8h, v7.8h, v4.h[7]\n"
+      "ldr q7, [x15, #0x1f0]\n"
+      "fmla v10.8h, v6.8h, v0.h[7]\n"
+      "add x15, x15, #0x200\n"
+      "fmla v14.8h, v6.8h, v1.h[7]\n"
+      "fmla v18.8h, v6.8h, v2.h[7]\n"
+      "fmla v22.8h, v6.8h, v3.h[7]\n"
+      "fmla v26.8h, v6.8h, v4.h[7]\n"
+      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "fmla v15.8h, v7.8h, v1.h[7]\n"
+      "fmla v19.8h, v7.8h, v2.h[7]\n"
+      "fmla v23.8h, v7.8h, v3.h[7]\n"
+      "fmla v27.8h, v7.8h, v4.h[7]\n"
+      "bge 227b\n"
+      "228:"  // Height 5: Multiply loop: Single iteration only
+      "sub x11, x11, #0x8\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q4, [x22, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.8h, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.8h, v6.8h, v1.h[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla v16.8h, v6.8h, v2.h[0]\n"
+      "add x28, x28, #0x10\n"
+      "fmla v20.8h, v6.8h, v3.h[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      "fmla v24.8h, v6.8h, v4.h[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.8h, v7.8h, v0.h[0]\n"
+      "add x24, x24, #0x10\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "fmla v13.8h, v7.8h, v1.h[0]\n"
+      "add x22, x22, #0x10\n"
+      "fmla v17.8h, v7.8h, v2.h[0]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "fmla v21.8h, v7.8h, v3.h[0]\n"
+      "fmla v25.8h, v7.8h, v4.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.8h, v6.8h, v0.h[0]\n"
+      "fmla v14.8h, v6.8h, v1.h[0]\n"
+      "fmla v18.8h, v6.8h, v2.h[0]\n"
+      "fmla v22.8h, v6.8h, v3.h[0]\n"
+      "fmla v26.8h, v6.8h, v4.h[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      "fmla v11.8h, v7.8h, v0.h[0]\n"
+      "fmla v15.8h, v7.8h, v1.h[0]\n"
+      "fmla v19.8h, v7.8h, v2.h[0]\n"
+      "fmla v23.8h, v7.8h, v3.h[0]\n"
+      "fmla v27.8h, v7.8h, v4.h[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      "fmla v8.8h, v6.8h, v0.h[1]\n"
+      "fmla v12.8h, v6.8h, v1.h[1]\n"
+      "fmla v16.8h, v6.8h, v2.h[1]\n"
+      "fmla v20.8h, v6.8h, v3.h[1]\n"
+      "fmla v24.8h, v6.8h, v4.h[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      "fmla v9.8h, v7.8h, v0.h[1]\n"
+      "fmla v13.8h, v7.8h, v1.h[1]\n"
+      "fmla v17.8h, v7.8h, v2.h[1]\n"
+      "fmla v21.8h, v7.8h, v3.h[1]\n"
+      "fmla v25.8h, v7.8h, v4.h[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      "fmla v10.8h, v6.8h, v0.h[1]\n"
+      "fmla v14.8h, v6.8h, v1.h[1]\n"
+      "fmla v18.8h, v6.8h, v2.h[1]\n"
+      "fmla v22.8h, v6.8h, v3.h[1]\n"
+      "fmla v26.8h, v6.8h, v4.h[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      "fmla v11.8h, v7.8h, v0.h[1]\n"
+      "fmla v15.8h, v7.8h, v1.h[1]\n"
+      "fmla v19.8h, v7.8h, v2.h[1]\n"
+      "fmla v23.8h, v7.8h, v3.h[1]\n"
+      "fmla v27.8h, v7.8h, v4.h[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      "fmla v8.8h, v6.8h, v0.h[2]\n"
+      "fmla v12.8h, v6.8h, v1.h[2]\n"
+      "fmla v16.8h, v6.8h, v2.h[2]\n"
+      "fmla v20.8h, v6.8h, v3.h[2]\n"
+      "fmla v24.8h, v6.8h, v4.h[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      "fmla v9.8h, v7.8h, v0.h[2]\n"
+      "fmla v13.8h, v7.8h, v1.h[2]\n"
+      "fmla v17.8h, v7.8h, v2.h[2]\n"
+      "fmla v21.8h, v7.8h, v3.h[2]\n"
+      "fmla v25.8h, v7.8h, v4.h[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      "fmla v10.8h, v6.8h, v0.h[2]\n"
+      "fmla v14.8h, v6.8h, v1.h[2]\n"
+      "fmla v18.8h, v6.8h, v2.h[2]\n"
+      "fmla v22.8h, v6.8h, v3.h[2]\n"
+      "fmla v26.8h, v6.8h, v4.h[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      "fmla v11.8h, v7.8h, v0.h[2]\n"
+      "fmla v15.8h, v7.8h, v1.h[2]\n"
+      "fmla v19.8h, v7.8h, v2.h[2]\n"
+      "fmla v23.8h, v7.8h, v3.h[2]\n"
+      "fmla v27.8h, v7.8h, v4.h[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      "fmla v8.8h, v6.8h, v0.h[3]\n"
+      "fmla v12.8h, v6.8h, v1.h[3]\n"
+      "fmla v16.8h, v6.8h, v2.h[3]\n"
+      "fmla v20.8h, v6.8h, v3.h[3]\n"
+      "fmla v24.8h, v6.8h, v4.h[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      "fmla v9.8h, v7.8h, v0.h[3]\n"
+      "fmla v13.8h, v7.8h, v1.h[3]\n"
+      "fmla v17.8h, v7.8h, v2.h[3]\n"
+      "fmla v21.8h, v7.8h, v3.h[3]\n"
+      "fmla v25.8h, v7.8h, v4.h[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "fmla v10.8h, v6.8h, v0.h[3]\n"
+      "fmla v14.8h, v6.8h, v1.h[3]\n"
+      "fmla v18.8h, v6.8h, v2.h[3]\n"
+      "fmla v22.8h, v6.8h, v3.h[3]\n"
+      "fmla v26.8h, v6.8h, v4.h[3]\n"
+      "ldr q6, [x15, #0x100]\n"
+      "fmla v11.8h, v7.8h, v0.h[3]\n"
+      "fmla v15.8h, v7.8h, v1.h[3]\n"
+      "fmla v19.8h, v7.8h, v2.h[3]\n"
+      "fmla v23.8h, v7.8h, v3.h[3]\n"
+      "fmla v27.8h, v7.8h, v4.h[3]\n"
+      "ldr q7, [x15, #0x110]\n"
+      "fmla v8.8h, v6.8h, v0.h[4]\n"
+      "fmla v12.8h, v6.8h, v1.h[4]\n"
+      "fmla v16.8h, v6.8h, v2.h[4]\n"
+      "fmla v20.8h, v6.8h, v3.h[4]\n"
+      "fmla v24.8h, v6.8h, v4.h[4]\n"
+      "ldr q6, [x15, #0x120]\n"
+      "fmla v9.8h, v7.8h, v0.h[4]\n"
+      "fmla v13.8h, v7.8h, v1.h[4]\n"
+      "fmla v17.8h, v7.8h, v2.h[4]\n"
+      "fmla v21.8h, v7.8h, v3.h[4]\n"
+      "fmla v25.8h, v7.8h, v4.h[4]\n"
+      "ldr q7, [x15, #0x130]\n"
+      "fmla v10.8h, v6.8h, v0.h[4]\n"
+      "fmla v14.8h, v6.8h, v1.h[4]\n"
+      "fmla v18.8h, v6.8h, v2.h[4]\n"
+      "fmla v22.8h, v6.8h, v3.h[4]\n"
+      "fmla v26.8h, v6.8h, v4.h[4]\n"
+      "ldr q6, [x15, #0x140]\n"
+      "fmla v11.8h, v7.8h, v0.h[4]\n"
+      "fmla v15.8h, v7.8h, v1.h[4]\n"
+      "fmla v19.8h, v7.8h, v2.h[4]\n"
+      "fmla v23.8h, v7.8h, v3.h[4]\n"
+      "fmla v27.8h, v7.8h, v4.h[4]\n"
+      "ldr q7, [x15, #0x150]\n"
+      "fmla v8.8h, v6.8h, v0.h[5]\n"
+      "fmla v12.8h, v6.8h, v1.h[5]\n"
+      "fmla v16.8h, v6.8h, v2.h[5]\n"
+      "fmla v20.8h, v6.8h, v3.h[5]\n"
+      "fmla v24.8h, v6.8h, v4.h[5]\n"
+      "ldr q6, [x15, #0x160]\n"
+      "fmla v9.8h, v7.8h, v0.h[5]\n"
+      "fmla v13.8h, v7.8h, v1.h[5]\n"
+      "fmla v17.8h, v7.8h, v2.h[5]\n"
+      "fmla v21.8h, v7.8h, v3.h[5]\n"
+      "fmla v25.8h, v7.8h, v4.h[5]\n"
+      "ldr q7, [x15, #0x170]\n"
+      "fmla v10.8h, v6.8h, v0.h[5]\n"
+      "fmla v14.8h, v6.8h, v1.h[5]\n"
+      "fmla v18.8h, v6.8h, v2.h[5]\n"
+      "fmla v22.8h, v6.8h, v3.h[5]\n"
+      "fmla v26.8h, v6.8h, v4.h[5]\n"
+      "ldr q6, [x15, #0x180]\n"
+      "fmla v11.8h, v7.8h, v0.h[5]\n"
+      "fmla v15.8h, v7.8h, v1.h[5]\n"
+      "fmla v19.8h, v7.8h, v2.h[5]\n"
+      "fmla v23.8h, v7.8h, v3.h[5]\n"
+      "fmla v27.8h, v7.8h, v4.h[5]\n"
+      "ldr q7, [x15, #0x190]\n"
+      "fmla v8.8h, v6.8h, v0.h[6]\n"
+      "fmla v12.8h, v6.8h, v1.h[6]\n"
+      "fmla v16.8h, v6.8h, v2.h[6]\n"
+      "fmla v20.8h, v6.8h, v3.h[6]\n"
+      "fmla v24.8h, v6.8h, v4.h[6]\n"
+      "ldr q6, [x15, #0x1a0]\n"
+      "fmla v9.8h, v7.8h, v0.h[6]\n"
+      "fmla v13.8h, v7.8h, v1.h[6]\n"
+      "fmla v17.8h, v7.8h, v2.h[6]\n"
+      "fmla v21.8h, v7.8h, v3.h[6]\n"
+      "fmla v25.8h, v7.8h, v4.h[6]\n"
+      "ldr q7, [x15, #0x1b0]\n"
+      "fmla v10.8h, v6.8h, v0.h[6]\n"
+      "fmla v14.8h, v6.8h, v1.h[6]\n"
+      "fmla v18.8h, v6.8h, v2.h[6]\n"
+      "fmla v22.8h, v6.8h, v3.h[6]\n"
+      "fmla v26.8h, v6.8h, v4.h[6]\n"
+      "ldr q6, [x15, #0x1c0]\n"
+      "fmla v11.8h, v7.8h, v0.h[6]\n"
+      "fmla v15.8h, v7.8h, v1.h[6]\n"
+      "fmla v19.8h, v7.8h, v2.h[6]\n"
+      "fmla v23.8h, v7.8h, v3.h[6]\n"
+      "fmla v27.8h, v7.8h, v4.h[6]\n"
+      "ldr q7, [x15, #0x1d0]\n"
+      "fmla v8.8h, v6.8h, v0.h[7]\n"
+      "fmla v12.8h, v6.8h, v1.h[7]\n"
+      "fmla v16.8h, v6.8h, v2.h[7]\n"
+      "fmla v20.8h, v6.8h, v3.h[7]\n"
+      "fmla v24.8h, v6.8h, v4.h[7]\n"
+      "ldr q6, [x15, #0x1e0]\n"
+      "fmla v9.8h, v7.8h, v0.h[7]\n"
+      "fmla v13.8h, v7.8h, v1.h[7]\n"
+      "fmla v17.8h, v7.8h, v2.h[7]\n"
+      "fmla v21.8h, v7.8h, v3.h[7]\n"
+      "fmla v25.8h, v7.8h, v4.h[7]\n"
+      "ldr q7, [x15, #0x1f0]\n"
+      "fmla v10.8h, v6.8h, v0.h[7]\n"
+      "add x15, x15, #0x200\n"
+      "fmla v14.8h, v6.8h, v1.h[7]\n"
+      "fmla v18.8h, v6.8h, v2.h[7]\n"
+      "fmla v22.8h, v6.8h, v3.h[7]\n"
+      "fmla v26.8h, v6.8h, v4.h[7]\n"
+      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "fmla v15.8h, v7.8h, v1.h[7]\n"
+      "fmla v19.8h, v7.8h, v2.h[7]\n"
+      "fmla v23.8h, v7.8h, v3.h[7]\n"
+      "fmla v27.8h, v7.8h, v4.h[7]\n"
+      "229:"  // Height 5: Multiply loop: Main loop skip
+      "cbz x11, 231f\n"
+      "230:"  // Height 5: Multiply loop: Odd block loop
+      "ldr h0, [x10], #0x2\n"
+      "ldr h1, [x28], #0x2\n"
+      "ldr h2, [x26], #0x2\n"
+      "ldr h3, [x24], #0x2\n"
+      "ldr h4, [x22], #0x2\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.8h, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.8h, v6.8h, v1.h[0]\n"
+      "sub x11, x11, #0x1\n"
+      "fmla v16.8h, v6.8h, v2.h[0]\n"
+      "fmla v20.8h, v6.8h, v3.h[0]\n"
+      "fmla v24.8h, v6.8h, v4.h[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.8h, v7.8h, v0.h[0]\n"
+      "fmla v13.8h, v7.8h, v1.h[0]\n"
+      "fmla v17.8h, v7.8h, v2.h[0]\n"
+      "fmla v21.8h, v7.8h, v3.h[0]\n"
+      "fmla v25.8h, v7.8h, v4.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.8h, v6.8h, v0.h[0]\n"
+      "add x15, x15, #0x40\n"
+      "fmla v14.8h, v6.8h, v1.h[0]\n"
+      "fmla v18.8h, v6.8h, v2.h[0]\n"
+      "fmla v22.8h, v6.8h, v3.h[0]\n"
+      "fmla v26.8h, v6.8h, v4.h[0]\n"
+      "fmla v11.8h, v7.8h, v0.h[0]\n"
+      "fmla v15.8h, v7.8h, v1.h[0]\n"
+      "fmla v19.8h, v7.8h, v2.h[0]\n"
+      "fmla v23.8h, v7.8h, v3.h[0]\n"
+      "fmla v27.8h, v7.8h, v4.h[0]\n"
+      "cbnz x11, 230b\n"
+      "231:"  // Height 5: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 224b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "tbz %x[flags], #1, 232f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v1.8h }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v0.8h }, [x19]\n"
+      "fmin v8.8h, v8.8h, v0.8h\n"
+      "fmin v9.8h, v9.8h, v0.8h\n"
+      "fmin v10.8h, v10.8h, v0.8h\n"
+      "fmin v11.8h, v11.8h, v0.8h\n"
+      "fmax v8.8h, v8.8h, v1.8h\n"
+      "fmax v9.8h, v9.8h, v1.8h\n"
+      "fmax v10.8h, v10.8h, v1.8h\n"
+      "fmax v11.8h, v11.8h, v1.8h\n"
+      "fmin v12.8h, v12.8h, v0.8h\n"
+      "fmin v13.8h, v13.8h, v0.8h\n"
+      "fmin v14.8h, v14.8h, v0.8h\n"
+      "fmax v12.8h, v12.8h, v1.8h\n"
+      "fmax v13.8h, v13.8h, v1.8h\n"
+      "fmax v14.8h, v14.8h, v1.8h\n"
+      "fmin v15.8h, v15.8h, v0.8h\n"
+      "fmin v16.8h, v16.8h, v0.8h\n"
+      "fmin v17.8h, v17.8h, v0.8h\n"
+      "fmax v15.8h, v15.8h, v1.8h\n"
+      "fmax v16.8h, v16.8h, v1.8h\n"
+      "fmax v17.8h, v17.8h, v1.8h\n"
+      "fmin v18.8h, v18.8h, v0.8h\n"
+      "fmin v19.8h, v19.8h, v0.8h\n"
+      "fmin v20.8h, v20.8h, v0.8h\n"
+      "fmax v18.8h, v18.8h, v1.8h\n"
+      "fmax v19.8h, v19.8h, v1.8h\n"
+      "fmax v20.8h, v20.8h, v1.8h\n"
+      "fmin v21.8h, v21.8h, v0.8h\n"
+      "fmin v22.8h, v22.8h, v0.8h\n"
+      "fmin v23.8h, v23.8h, v0.8h\n"
+      "fmax v21.8h, v21.8h, v1.8h\n"
+      "fmax v22.8h, v22.8h, v1.8h\n"
+      "fmax v23.8h, v23.8h, v1.8h\n"
+      "fmin v24.8h, v24.8h, v0.8h\n"
+      "fmin v25.8h, v25.8h, v0.8h\n"
+      "fmin v26.8h, v26.8h, v0.8h\n"
+      "fmax v24.8h, v24.8h, v1.8h\n"
+      "fmax v25.8h, v25.8h, v1.8h\n"
+      "fmax v26.8h, v26.8h, v1.8h\n"
+      "fmin v27.8h, v27.8h, v0.8h\n"
+      "fmax v27.8h, v27.8h, v1.8h\n"
+      "232:"  // Height 5: No activation
+      "cmp x16, #0x20\n"
+      "bge 249f\n"
+      "tbz x16, #4, 240f\n"
+      "st1 { v8.8h }, [x13], #0x10\n"
+      "st1 { v9.8h }, [x13], #0x10\n"
+      "st1 { v12.8h }, [x9], #0x10\n"
+      "st1 { v13.8h }, [x9], #0x10\n"
+      "st1 { v16.8h }, [x27], #0x10\n"
+      "st1 { v17.8h }, [x27], #0x10\n"
+      "st1 { v20.8h }, [x25], #0x10\n"
+      "st1 { v21.8h }, [x25], #0x10\n"
+      "st1 { v24.8h }, [x23], #0x10\n"
+      "st1 { v25.8h }, [x23], #0x10\n"
+      "tbz x16, #3, 236f\n"
+      "st1 { v10.8h }, [x13], #0x10\n"
+      "st1 { v14.8h }, [x9], #0x10\n"
+      "st1 { v18.8h }, [x27], #0x10\n"
+      "st1 { v22.8h }, [x25], #0x10\n"
+      "st1 { v26.8h }, [x23], #0x10\n"
+      "tbz x16, #2, 234f\n"
+      "str d11, [x13], #0x8\n"
+      "str d15, [x9], #0x8\n"
+      "str d19, [x27], #0x8\n"
+      "str d23, [x25], #0x8\n"
+      "str d27, [x23], #0x8\n"
+      "tbz x16, #1, 233f\n"
+      "st1 { v11.s }[2], [x13], #0x4\n"
+      "st1 { v15.s }[2], [x9], #0x4\n"
+      "st1 { v19.s }[2], [x27], #0x4\n"
+      "st1 { v23.s }[2], [x25], #0x4\n"
+      "st1 { v27.s }[2], [x23], #0x4\n"
+      "tbz x16, #0, 248f\n"
+      "st1 { v11.h }[6], [x13]\n"
+      "st1 { v15.h }[6], [x9]\n"
+      "st1 { v19.h }[6], [x27]\n"
+      "st1 { v23.h }[6], [x25]\n"
+      "st1 { v27.h }[6], [x23]\n"
+      "b 248f\n"
+      "233:"  // Height 5: Partial direct writeback: partial_1_28
+      "tbz x16, #0, 248f\n"
+      "st1 { v11.h }[4], [x13]\n"
+      "st1 { v15.h }[4], [x9]\n"
+      "st1 { v19.h }[4], [x27]\n"
+      "st1 { v23.h }[4], [x25]\n"
+      "st1 { v27.h }[4], [x23]\n"
+      "b 248f\n"
+      "234:"  // Height 5: Partial direct writeback: partial_2_24
+      "tbz x16, #1, 235f\n"
+      "str s11, [x13], #0x4\n"
+      "str s15, [x9], #0x4\n"
+      "str s19, [x27], #0x4\n"
+      "str s23, [x25], #0x4\n"
+      "str s27, [x23], #0x4\n"
+      "tbz x16, #0, 248f\n"
+      "st1 { v11.h }[2], [x13]\n"
+      "st1 { v15.h }[2], [x9]\n"
+      "st1 { v19.h }[2], [x27]\n"
+      "st1 { v23.h }[2], [x25]\n"
+      "st1 { v27.h }[2], [x23]\n"
+      "b 248f\n"
+      "235:"  // Height 5: Partial direct writeback: partial_1_24
+      "tbz x16, #0, 248f\n"
+      "str h11, [x13, #0x0]\n"
+      "str h15, [x9, #0x0]\n"
+      "str h19, [x27, #0x0]\n"
+      "str h23, [x25, #0x0]\n"
+      "str h27, [x23, #0x0]\n"
+      "b 248f\n"
+      "236:"  // Height 5: Partial direct writeback: partial_4_16
+      "tbz x16, #2, 238f\n"
+      "str d10, [x13], #0x8\n"
+      "str d14, [x9], #0x8\n"
+      "str d18, [x27], #0x8\n"
+      "str d22, [x25], #0x8\n"
+      "str d26, [x23], #0x8\n"
+      "tbz x16, #1, 237f\n"
+      "st1 { v10.s }[2], [x13], #0x4\n"
+      "st1 { v14.s }[2], [x9], #0x4\n"
+      "st1 { v18.s }[2], [x27], #0x4\n"
+      "st1 { v22.s }[2], [x25], #0x4\n"
+      "st1 { v26.s }[2], [x23], #0x4\n"
+      "tbz x16, #0, 248f\n"
+      "st1 { v10.h }[6], [x13]\n"
+      "st1 { v14.h }[6], [x9]\n"
+      "st1 { v18.h }[6], [x27]\n"
+      "st1 { v22.h }[6], [x25]\n"
+      "st1 { v26.h }[6], [x23]\n"
+      "b 248f\n"
+      "237:"  // Height 5: Partial direct writeback: partial_1_20
+      "tbz x16, #0, 248f\n"
+      "st1 { v10.h }[4], [x13]\n"
+      "st1 { v14.h }[4], [x9]\n"
+      "st1 { v18.h }[4], [x27]\n"
+      "st1 { v22.h }[4], [x25]\n"
+      "st1 { v26.h }[4], [x23]\n"
+      "b 248f\n"
+      "238:"  // Height 5: Partial direct writeback: partial_2_16
+      "tbz x16, #1, 239f\n"
+      "str s10, [x13], #0x4\n"
+      "str s14, [x9], #0x4\n"
+      "str s18, [x27], #0x4\n"
+      "str s22, [x25], #0x4\n"
+      "str s26, [x23], #0x4\n"
+      "tbz x16, #0, 248f\n"
+      "st1 { v10.h }[2], [x13]\n"
+      "st1 { v14.h }[2], [x9]\n"
+      "st1 { v18.h }[2], [x27]\n"
+      "st1 { v22.h }[2], [x25]\n"
+      "st1 { v26.h }[2], [x23]\n"
+      "b 248f\n"
+      "239:"  // Height 5: Partial direct writeback: partial_1_16
+      "tbz x16, #0, 248f\n"
+      "str h10, [x13, #0x0]\n"
+      "str h14, [x9, #0x0]\n"
+      "str h18, [x27, #0x0]\n"
+      "str h22, [x25, #0x0]\n"
+      "str h26, [x23, #0x0]\n"
+      "b 248f\n"
+      "240:"  // Height 5: Partial direct writeback: partial_8_0
+      "tbz x16, #3, 244f\n"
+      "st1 { v8.8h }, [x13], #0x10\n"
+      "st1 { v12.8h }, [x9], #0x10\n"
+      "st1 { v16.8h }, [x27], #0x10\n"
+      "st1 { v20.8h }, [x25], #0x10\n"
+      "st1 { v24.8h }, [x23], #0x10\n"
+      "tbz x16, #2, 242f\n"
+      "str d9, [x13], #0x8\n"
+      "str d13, [x9], #0x8\n"
+      "str d17, [x27], #0x8\n"
+      "str d21, [x25], #0x8\n"
+      "str d25, [x23], #0x8\n"
+      "tbz x16, #1, 241f\n"
+      "st1 { v9.s }[2], [x13], #0x4\n"
+      "st1 { v13.s }[2], [x9], #0x4\n"
+      "st1 { v17.s }[2], [x27], #0x4\n"
+      "st1 { v21.s }[2], [x25], #0x4\n"
+      "st1 { v25.s }[2], [x23], #0x4\n"
+      "tbz x16, #0, 248f\n"
+      "st1 { v9.h }[6], [x13]\n"
+      "st1 { v13.h }[6], [x9]\n"
+      "st1 { v17.h }[6], [x27]\n"
+      "st1 { v21.h }[6], [x25]\n"
+      "st1 { v25.h }[6], [x23]\n"
+      "b 248f\n"
+      "241:"  // Height 5: Partial direct writeback: partial_1_12
+      "tbz x16, #0, 248f\n"
+      "st1 { v9.h }[4], [x13]\n"
+      "st1 { v13.h }[4], [x9]\n"
+      "st1 { v17.h }[4], [x27]\n"
+      "st1 { v21.h }[4], [x25]\n"
+      "st1 { v25.h }[4], [x23]\n"
+      "b 248f\n"
+      "242:"  // Height 5: Partial direct writeback: partial_2_8
+      "tbz x16, #1, 243f\n"
+      "str s9, [x13], #0x4\n"
+      "str s13, [x9], #0x4\n"
+      "str s17, [x27], #0x4\n"
+      "str s21, [x25], #0x4\n"
+      "str s25, [x23], #0x4\n"
+      "tbz x16, #0, 248f\n"
+      "st1 { v9.h }[2], [x13]\n"
+      "st1 { v13.h }[2], [x9]\n"
+      "st1 { v17.h }[2], [x27]\n"
+      "st1 { v21.h }[2], [x25]\n"
+      "st1 { v25.h }[2], [x23]\n"
+      "b 248f\n"
+      "243:"  // Height 5: Partial direct writeback: partial_1_8
+      "tbz x16, #0, 248f\n"
+      "str h9, [x13, #0x0]\n"
+      "str h13, [x9, #0x0]\n"
+      "str h17, [x27, #0x0]\n"
+      "str h21, [x25, #0x0]\n"
+      "str h25, [x23, #0x0]\n"
+      "b 248f\n"
+      "244:"  // Height 5: Partial direct writeback: partial_4_0
+      "tbz x16, #2, 246f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "str d16, [x27], #0x8\n"
+      "str d20, [x25], #0x8\n"
+      "str d24, [x23], #0x8\n"
+      "tbz x16, #1, 245f\n"
+      "st1 { v8.s }[2], [x13], #0x4\n"
+      "st1 { v12.s }[2], [x9], #0x4\n"
+      "st1 { v16.s }[2], [x27], #0x4\n"
+      "st1 { v20.s }[2], [x25], #0x4\n"
+      "st1 { v24.s }[2], [x23], #0x4\n"
+      "tbz x16, #0, 248f\n"
+      "st1 { v8.h }[6], [x13]\n"
+      "st1 { v12.h }[6], [x9]\n"
+      "st1 { v16.h }[6], [x27]\n"
+      "st1 { v20.h }[6], [x25]\n"
+      "st1 { v24.h }[6], [x23]\n"
+      "b 248f\n"
+      "245:"  // Height 5: Partial direct writeback: partial_1_4
+      "tbz x16, #0, 248f\n"
+      "st1 { v8.h }[4], [x13]\n"
+      "st1 { v12.h }[4], [x9]\n"
+      "st1 { v16.h }[4], [x27]\n"
+      "st1 { v20.h }[4], [x25]\n"
+      "st1 { v24.h }[4], [x23]\n"
+      "b 248f\n"
+      "246:"  // Height 5: Partial direct writeback: partial_2_0
+      "tbz x16, #1, 247f\n"
+      "str s8, [x13], #0x4\n"
+      "str s12, [x9], #0x4\n"
+      "str s16, [x27], #0x4\n"
+      "str s20, [x25], #0x4\n"
+      "str s24, [x23], #0x4\n"
+      "tbz x16, #0, 248f\n"
+      "st1 { v8.h }[2], [x13]\n"
+      "st1 { v12.h }[2], [x9]\n"
+      "st1 { v16.h }[2], [x27]\n"
+      "st1 { v20.h }[2], [x25]\n"
+      "st1 { v24.h }[2], [x23]\n"
+      "b 248f\n"
+      "247:"  // Height 5: Partial direct writeback: partial_1_0
+      "str h8, [x13, #0x0]\n"
+      "str h12, [x9, #0x0]\n"
+      "str h16, [x27, #0x0]\n"
+      "str h20, [x25, #0x0]\n"
+      "str h24, [x23, #0x0]\n"
+      "248:"  // Height 5: Partial direct writeback: Done
+      "b 250f\n"
+      "249:"  // Height 5: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "str q12, [x9, #0x0]\n"
+      "str q13, [x9, #0x10]\n"
+      "str q14, [x9, #0x20]\n"
+      "str q15, [x9, #0x30]\n"
+      "str q16, [x27, #0x0]\n"
+      "str q17, [x27, #0x10]\n"
+      "str q18, [x27, #0x20]\n"
+      "str q19, [x27, #0x30]\n"
+      "str q20, [x25, #0x0]\n"
+      "str q21, [x25, #0x10]\n"
+      "str q22, [x25, #0x20]\n"
+      "str q23, [x25, #0x30]\n"
+      "str q24, [x23, #0x0]\n"
+      "str q25, [x23, #0x10]\n"
+      "str q26, [x23, #0x20]\n"
+      "str q27, [x23, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "add x9, x9, #0x40\n"
+      "add x27, x27, #0x40\n"
+      "add x25, x25, #0x40\n"
+      "add x23, x23, #0x40\n"
+      "250:"  // Height 5: Writeback done
+      "subs x16, x16, #0x20\n"
+      "bgt 203b\n"
+      "b 302f\n"
+      "251:"  // Height 6
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 252f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #1\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #1\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "ldr x23, [%x[output_ptr], #0x20]\n"
+      "add x27, x27, x19, LSL #1\n"
+      "ldr x21, [%x[output_ptr], #0x28]\n"
+      "add %x[output_ptr], %x[output_ptr], #0x30\n"
+      "add x25, x25, x19, LSL #1\n"
+      "add x23, x23, x19, LSL #1\n"
+      "add x21, x21, x19, LSL #1\n"
+      "b 253f\n"
+      "252:"  // Height 6: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #1\n"
+      "add x27, x9, x19, LSL #1\n"
+      "add x25, x27, x19, LSL #1\n"
+      "add x23, x25, x19, LSL #1\n"
+      "add x21, x23, x19, LSL #1\n"
+      "add %x[output_ptr], x21, x19, LSL #1\n"
+      "253:"  // Height 6: Column loop
+      "cbz x14, 254f\n"
+      "ldr q8, [x14, #0x0]\n"
+      "mov v12.16b, v8.16b\n"
+      "ldr q9, [x14, #0x10]\n"
+      "mov v16.16b, v8.16b\n"
+      "ldr q10, [x14, #0x20]\n"
+      "mov v20.16b, v8.16b\n"
+      "ldr q11, [x14, #0x30]\n"
+      "mov v24.16b, v8.16b\n"
+      "add x14, x14, #0x40\n"
+      "mov v28.16b, v8.16b\n"
+      "mov v13.16b, v9.16b\n"
+      "mov v17.16b, v9.16b\n"
+      "mov v14.16b, v10.16b\n"
+      "mov v15.16b, v11.16b\n"
+      "mov v18.16b, v10.16b\n"
+      "mov v19.16b, v11.16b\n"
+      "mov v21.16b, v9.16b\n"
+      "mov v22.16b, v10.16b\n"
+      "mov v23.16b, v11.16b\n"
+      "mov v25.16b, v9.16b\n"
+      "mov v26.16b, v10.16b\n"
+      "mov v27.16b, v11.16b\n"
+      "mov v29.16b, v9.16b\n"
+      "mov v30.16b, v10.16b\n"
+      "mov v31.16b, v11.16b\n"
+      "b 273f\n"
+      "254:"  // Height 6: no bias
+      "tbz %x[flags], #0, 272f\n"
+      "cmp x16, #0x20\n"
+      "bge 271f\n"
+      "tbz x16, #4, 262f\n"
+      "ld1 { v8.8h }, [x13], #0x10\n"
+      "ld1 { v12.8h }, [x9], #0x10\n"
+      "ld1 { v16.8h }, [x27], #0x10\n"
+      "ld1 { v20.8h }, [x25], #0x10\n"
+      "ld1 { v24.8h }, [x23], #0x10\n"
+      "ld1 { v28.8h }, [x21], #0x10\n"
+      "ld1 { v9.8h }, [x13], #0x10\n"
+      "ld1 { v13.8h }, [x9], #0x10\n"
+      "ld1 { v17.8h }, [x27], #0x10\n"
+      "ld1 { v21.8h }, [x25], #0x10\n"
+      "ld1 { v25.8h }, [x23], #0x10\n"
+      "ld1 { v29.8h }, [x21], #0x10\n"
+      "tbz x16, #3, 258f\n"
+      "ld1 { v10.8h }, [x13], #0x10\n"
+      "ld1 { v14.8h }, [x9], #0x10\n"
+      "ld1 { v18.8h }, [x27], #0x10\n"
+      "ld1 { v22.8h }, [x25], #0x10\n"
+      "ld1 { v26.8h }, [x23], #0x10\n"
+      "ld1 { v30.8h }, [x21], #0x10\n"
+      "tbz x16, #2, 256f\n"
+      "ldr d11, [x13], #0x8\n"
+      "ldr d15, [x9], #0x8\n"
+      "ldr d19, [x27], #0x8\n"
+      "ldr d23, [x25], #0x8\n"
+      "ldr d27, [x23], #0x8\n"
+      "ldr d31, [x21], #0x8\n"
+      "tbz x16, #1, 255f\n"
+      "ld1 { v11.s }[2], [x13], #0x4\n"
+      "ld1 { v15.s }[2], [x9], #0x4\n"
+      "ld1 { v19.s }[2], [x27], #0x4\n"
+      "ld1 { v23.s }[2], [x25], #0x4\n"
+      "ld1 { v27.s }[2], [x23], #0x4\n"
+      "ld1 { v31.s }[2], [x21], #0x4\n"
+      "mov x19, #0x3c\n"
+      "tbz x16, #0, 270f\n"
+      "ld1 { v11.h }[6], [x13]\n"
+      "ld1 { v15.h }[6], [x9]\n"
+      "ld1 { v19.h }[6], [x27]\n"
+      "ld1 { v23.h }[6], [x25]\n"
+      "ld1 { v27.h }[6], [x23]\n"
+      "ld1 { v31.h }[6], [x21]\n"
+      "b 270f\n"
+      "255:"  // Height 6: Partial accumulate: partial_1_28
+      "mov x19, #0x38\n"
+      "tbz x16, #0, 270f\n"
+      "ld1 { v11.h }[4], [x13]\n"
+      "ld1 { v15.h }[4], [x9]\n"
+      "ld1 { v19.h }[4], [x27]\n"
+      "ld1 { v23.h }[4], [x25]\n"
+      "ld1 { v27.h }[4], [x23]\n"
+      "ld1 { v31.h }[4], [x21]\n"
+      "b 270f\n"
+      "256:"  // Height 6: Partial accumulate: partial_2_24
+      "tbz x16, #1, 257f\n"
+      "ldr s11, [x13], #0x4\n"
+      "ldr s15, [x9], #0x4\n"
+      "ldr s19, [x27], #0x4\n"
+      "ldr s23, [x25], #0x4\n"
+      "ldr s27, [x23], #0x4\n"
+      "ldr s31, [x21], #0x4\n"
+      "mov x19, #0x34\n"
+      "tbz x16, #0, 270f\n"
+      "ld1 { v11.h }[2], [x13]\n"
+      "ld1 { v15.h }[2], [x9]\n"
+      "ld1 { v19.h }[2], [x27]\n"
+      "ld1 { v23.h }[2], [x25]\n"
+      "ld1 { v27.h }[2], [x23]\n"
+      "ld1 { v31.h }[2], [x21]\n"
+      "b 270f\n"
+      "257:"  // Height 6: Partial accumulate: partial_1_24
+      "mov x19, #0x30\n"
+      "tbz x16, #0, 270f\n"
+      "ldr h11, [x13, #0x0]\n"
+      "ldr h15, [x9, #0x0]\n"
+      "ldr h19, [x27, #0x0]\n"
+      "ldr h23, [x25, #0x0]\n"
+      "ldr h27, [x23, #0x0]\n"
+      "ldr h31, [x21, #0x0]\n"
+      "b 270f\n"
+      "258:"  // Height 6: Partial accumulate: partial_4_16
+      "tbz x16, #2, 260f\n"
+      "ldr d10, [x13], #0x8\n"
+      "ldr d14, [x9], #0x8\n"
+      "ldr d18, [x27], #0x8\n"
+      "ldr d22, [x25], #0x8\n"
+      "ldr d26, [x23], #0x8\n"
+      "ldr d30, [x21], #0x8\n"
+      "tbz x16, #1, 259f\n"
+      "ld1 { v10.s }[2], [x13], #0x4\n"
+      "ld1 { v14.s }[2], [x9], #0x4\n"
+      "ld1 { v18.s }[2], [x27], #0x4\n"
+      "ld1 { v22.s }[2], [x25], #0x4\n"
+      "ld1 { v26.s }[2], [x23], #0x4\n"
+      "ld1 { v30.s }[2], [x21], #0x4\n"
+      "mov x19, #0x2c\n"
+      "tbz x16, #0, 270f\n"
+      "ld1 { v10.h }[6], [x13]\n"
+      "ld1 { v14.h }[6], [x9]\n"
+      "ld1 { v18.h }[6], [x27]\n"
+      "ld1 { v22.h }[6], [x25]\n"
+      "ld1 { v26.h }[6], [x23]\n"
+      "ld1 { v30.h }[6], [x21]\n"
+      "b 270f\n"
+      "259:"  // Height 6: Partial accumulate: partial_1_20
+      "mov x19, #0x28\n"
+      "tbz x16, #0, 270f\n"
+      "ld1 { v10.h }[4], [x13]\n"
+      "ld1 { v14.h }[4], [x9]\n"
+      "ld1 { v18.h }[4], [x27]\n"
+      "ld1 { v22.h }[4], [x25]\n"
+      "ld1 { v26.h }[4], [x23]\n"
+      "ld1 { v30.h }[4], [x21]\n"
+      "b 270f\n"
+      "260:"  // Height 6: Partial accumulate: partial_2_16
+      "tbz x16, #1, 261f\n"
+      "ldr s10, [x13], #0x4\n"
+      "ldr s14, [x9], #0x4\n"
+      "ldr s18, [x27], #0x4\n"
+      "ldr s22, [x25], #0x4\n"
+      "ldr s26, [x23], #0x4\n"
+      "ldr s30, [x21], #0x4\n"
+      "mov x19, #0x24\n"
+      "tbz x16, #0, 270f\n"
+      "ld1 { v10.h }[2], [x13]\n"
+      "ld1 { v14.h }[2], [x9]\n"
+      "ld1 { v18.h }[2], [x27]\n"
+      "ld1 { v22.h }[2], [x25]\n"
+      "ld1 { v26.h }[2], [x23]\n"
+      "ld1 { v30.h }[2], [x21]\n"
+      "b 270f\n"
+      "261:"  // Height 6: Partial accumulate: partial_1_16
+      "mov x19, #0x20\n"
+      "tbz x16, #0, 270f\n"
+      "ldr h10, [x13, #0x0]\n"
+      "ldr h14, [x9, #0x0]\n"
+      "ldr h18, [x27, #0x0]\n"
+      "ldr h22, [x25, #0x0]\n"
+      "ldr h26, [x23, #0x0]\n"
+      "ldr h30, [x21, #0x0]\n"
+      "b 270f\n"
+      "262:"  // Height 6: Partial accumulate: partial_8_0
+      "tbz x16, #3, 266f\n"
+      "ld1 { v8.8h }, [x13], #0x10\n"
+      "ld1 { v12.8h }, [x9], #0x10\n"
+      "ld1 { v16.8h }, [x27], #0x10\n"
+      "ld1 { v20.8h }, [x25], #0x10\n"
+      "ld1 { v24.8h }, [x23], #0x10\n"
+      "ld1 { v28.8h }, [x21], #0x10\n"
+      "tbz x16, #2, 264f\n"
+      "ldr d9, [x13], #0x8\n"
+      "ldr d13, [x9], #0x8\n"
+      "ldr d17, [x27], #0x8\n"
+      "ldr d21, [x25], #0x8\n"
+      "ldr d25, [x23], #0x8\n"
+      "ldr d29, [x21], #0x8\n"
+      "tbz x16, #1, 263f\n"
+      "ld1 { v9.s }[2], [x13], #0x4\n"
+      "ld1 { v13.s }[2], [x9], #0x4\n"
+      "ld1 { v17.s }[2], [x27], #0x4\n"
+      "ld1 { v21.s }[2], [x25], #0x4\n"
+      "ld1 { v25.s }[2], [x23], #0x4\n"
+      "ld1 { v29.s }[2], [x21], #0x4\n"
+      "mov x19, #0x1c\n"
+      "tbz x16, #0, 270f\n"
+      "ld1 { v9.h }[6], [x13]\n"
+      "ld1 { v13.h }[6], [x9]\n"
+      "ld1 { v17.h }[6], [x27]\n"
+      "ld1 { v21.h }[6], [x25]\n"
+      "ld1 { v25.h }[6], [x23]\n"
+      "ld1 { v29.h }[6], [x21]\n"
+      "b 270f\n"
+      "263:"  // Height 6: Partial accumulate: partial_1_12
+      "mov x19, #0x18\n"
+      "tbz x16, #0, 270f\n"
+      "ld1 { v9.h }[4], [x13]\n"
+      "ld1 { v13.h }[4], [x9]\n"
+      "ld1 { v17.h }[4], [x27]\n"
+      "ld1 { v21.h }[4], [x25]\n"
+      "ld1 { v25.h }[4], [x23]\n"
+      "ld1 { v29.h }[4], [x21]\n"
+      "b 270f\n"
+      "264:"  // Height 6: Partial accumulate: partial_2_8
+      "tbz x16, #1, 265f\n"
+      "ldr s9, [x13], #0x4\n"
+      "ldr s13, [x9], #0x4\n"
+      "ldr s17, [x27], #0x4\n"
+      "ldr s21, [x25], #0x4\n"
+      "ldr s25, [x23], #0x4\n"
+      "ldr s29, [x21], #0x4\n"
+      "mov x19, #0x14\n"
+      "tbz x16, #0, 270f\n"
+      "ld1 { v9.h }[2], [x13]\n"
+      "ld1 { v13.h }[2], [x9]\n"
+      "ld1 { v17.h }[2], [x27]\n"
+      "ld1 { v21.h }[2], [x25]\n"
+      "ld1 { v25.h }[2], [x23]\n"
+      "ld1 { v29.h }[2], [x21]\n"
+      "b 270f\n"
+      "265:"  // Height 6: Partial accumulate: partial_1_8
+      "mov x19, #0x10\n"
+      "tbz x16, #0, 270f\n"
+      "ldr h9, [x13, #0x0]\n"
+      "ldr h13, [x9, #0x0]\n"
+      "ldr h17, [x27, #0x0]\n"
+      "ldr h21, [x25, #0x0]\n"
+      "ldr h25, [x23, #0x0]\n"
+      "ldr h29, [x21, #0x0]\n"
+      "b 270f\n"
+      "266:"  // Height 6: Partial accumulate: partial_4_0
+      "tbz x16, #2, 268f\n"
+      "ldr d8, [x13], #0x8\n"
+      "ldr d12, [x9], #0x8\n"
+      "ldr d16, [x27], #0x8\n"
+      "ldr d20, [x25], #0x8\n"
+      "ldr d24, [x23], #0x8\n"
+      "ldr d28, [x21], #0x8\n"
+      "tbz x16, #1, 267f\n"
+      "ld1 { v8.s }[2], [x13], #0x4\n"
+      "ld1 { v12.s }[2], [x9], #0x4\n"
+      "ld1 { v16.s }[2], [x27], #0x4\n"
+      "ld1 { v20.s }[2], [x25], #0x4\n"
+      "ld1 { v24.s }[2], [x23], #0x4\n"
+      "ld1 { v28.s }[2], [x21], #0x4\n"
+      "mov x19, #0xc\n"
+      "tbz x16, #0, 270f\n"
+      "ld1 { v8.h }[6], [x13]\n"
+      "ld1 { v12.h }[6], [x9]\n"
+      "ld1 { v16.h }[6], [x27]\n"
+      "ld1 { v20.h }[6], [x25]\n"
+      "ld1 { v24.h }[6], [x23]\n"
+      "ld1 { v28.h }[6], [x21]\n"
+      "b 270f\n"
+      "267:"  // Height 6: Partial accumulate: partial_1_4
+      "mov x19, #0x8\n"
+      "tbz x16, #0, 270f\n"
+      "ld1 { v8.h }[4], [x13]\n"
+      "ld1 { v12.h }[4], [x9]\n"
+      "ld1 { v16.h }[4], [x27]\n"
+      "ld1 { v20.h }[4], [x25]\n"
+      "ld1 { v24.h }[4], [x23]\n"
+      "ld1 { v28.h }[4], [x21]\n"
+      "b 270f\n"
+      "268:"  // Height 6: Partial accumulate: partial_2_0
+      "tbz x16, #1, 269f\n"
+      "ldr s8, [x13], #0x4\n"
+      "ldr s12, [x9], #0x4\n"
+      "ldr s16, [x27], #0x4\n"
+      "ldr s20, [x25], #0x4\n"
+      "ldr s24, [x23], #0x4\n"
+      "ldr s28, [x21], #0x4\n"
+      "mov x19, #0x4\n"
+      "tbz x16, #0, 270f\n"
+      "ld1 { v8.h }[2], [x13]\n"
+      "ld1 { v12.h }[2], [x9]\n"
+      "ld1 { v16.h }[2], [x27]\n"
+      "ld1 { v20.h }[2], [x25]\n"
+      "ld1 { v24.h }[2], [x23]\n"
+      "ld1 { v28.h }[2], [x21]\n"
+      "b 270f\n"
+      "269:"  // Height 6: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr h8, [x13, #0x0]\n"
+      "ldr h12, [x9, #0x0]\n"
+      "ldr h16, [x27, #0x0]\n"
+      "ldr h20, [x25, #0x0]\n"
+      "ldr h24, [x23, #0x0]\n"
+      "ldr h28, [x21, #0x0]\n"
+      "270:"  // Height 6: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "sub x9, x9, x19\n"
+      "sub x27, x27, x19\n"
+      "sub x25, x25, x19\n"
+      "sub x23, x23, x19\n"
+      "sub x21, x21, x19\n"
+      "b 273f\n"
+      "271:"  // Height 6: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "ldr q12, [x9, #0x0]\n"
+      "ldr q13, [x9, #0x10]\n"
+      "ldr q14, [x9, #0x20]\n"
+      "ldr q15, [x9, #0x30]\n"
+      "ldr q16, [x27, #0x0]\n"
+      "ldr q17, [x27, #0x10]\n"
+      "ldr q18, [x27, #0x20]\n"
+      "ldr q19, [x27, #0x30]\n"
+      "ldr q20, [x25, #0x0]\n"
+      "ldr q21, [x25, #0x10]\n"
+      "ldr q22, [x25, #0x20]\n"
+      "ldr q23, [x25, #0x30]\n"
+      "ldr q24, [x23, #0x0]\n"
+      "ldr q25, [x23, #0x10]\n"
+      "ldr q26, [x23, #0x20]\n"
+      "ldr q27, [x23, #0x30]\n"
+      "ldr q28, [x21, #0x0]\n"
+      "ldr q29, [x21, #0x10]\n"
+      "ldr q30, [x21, #0x20]\n"
+      "ldr q31, [x21, #0x30]\n"
+      "b 273f\n"
+      "272:"  // Height 6: no accumulate
+      "movi v8.16b, #0x0\n"
+      "movi v9.16b, #0x0\n"
+      "movi v10.16b, #0x0\n"
+      "movi v11.16b, #0x0\n"
+      "movi v12.16b, #0x0\n"
+      "movi v13.16b, #0x0\n"
+      "movi v14.16b, #0x0\n"
+      "movi v15.16b, #0x0\n"
+      "movi v16.16b, #0x0\n"
+      "movi v17.16b, #0x0\n"
+      "movi v18.16b, #0x0\n"
+      "movi v19.16b, #0x0\n"
+      "movi v20.16b, #0x0\n"
+      "movi v21.16b, #0x0\n"
+      "movi v22.16b, #0x0\n"
+      "movi v23.16b, #0x0\n"
+      "movi v24.16b, #0x0\n"
+      "movi v25.16b, #0x0\n"
+      "movi v26.16b, #0x0\n"
+      "movi v27.16b, #0x0\n"
+      "movi v28.16b, #0x0\n"
+      "movi v29.16b, #0x0\n"
+      "movi v30.16b, #0x0\n"
+      "movi v31.16b, #0x0\n"
+      "273:"  // Height 6: setup done
+      "mov x12, #0x0\n"
+      "274:"  // Height 6: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 275f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x20, [x20, #0x28]\n"
+      "cbnz x12, 276f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #1\n"
+      "add x28, x28, x19, LSL #1\n"
+      "add x26, x26, x19, LSL #1\n"
+      "add x24, x24, x19, LSL #1\n"
+      "add x22, x22, x19, LSL #1\n"
+      "add x20, x20, x19, LSL #1\n"
+      "b 276f\n"
+      "275:"  // Height 6: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #1\n"
+      "add x26, x28, x19, LSL #1\n"
+      "add x24, x26, x19, LSL #1\n"
+      "add x22, x24, x19, LSL #1\n"
+      "add x20, x22, x19, LSL #1\n"
+      "276:"  // Height 6: input setup done
+      "cmp x11, #0x8\n"
+      "blt 279f\n"
+      "cmp x11, #0x10\n"
+      "blt 278f\n"
+      "277:"  // Height 6: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q4, [x22, #0x0]\n"
+      "ldr q5, [x20, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.8h, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.8h, v6.8h, v1.h[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla v16.8h, v6.8h, v2.h[0]\n"
+      "add x28, x28, #0x10\n"
+      "fmla v20.8h, v6.8h, v3.h[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      "fmla v24.8h, v6.8h, v4.h[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "add x24, x24, #0x10\n"
+      "fmla v28.8h, v6.8h, v5.h[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.8h, v7.8h, v0.h[0]\n"
+      "add x22, x22, #0x10\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "fmla v13.8h, v7.8h, v1.h[0]\n"
+      "add x20, x20, #0x10\n"
+      "fmla v17.8h, v7.8h, v2.h[0]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "sub x11, x11, #0x8\n"
+      "fmla v21.8h, v7.8h, v3.h[0]\n"
+      "cmp x11, #0x10\n"
+      "fmla v25.8h, v7.8h, v4.h[0]\n"
+      "fmla v29.8h, v7.8h, v5.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.8h, v6.8h, v0.h[0]\n"
+      "fmla v14.8h, v6.8h, v1.h[0]\n"
+      "fmla v18.8h, v6.8h, v2.h[0]\n"
+      "fmla v22.8h, v6.8h, v3.h[0]\n"
+      "fmla v26.8h, v6.8h, v4.h[0]\n"
+      "fmla v30.8h, v6.8h, v5.h[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      "fmla v11.8h, v7.8h, v0.h[0]\n"
+      "fmla v15.8h, v7.8h, v1.h[0]\n"
+      "fmla v19.8h, v7.8h, v2.h[0]\n"
+      "fmla v23.8h, v7.8h, v3.h[0]\n"
+      "fmla v27.8h, v7.8h, v4.h[0]\n"
+      "fmla v31.8h, v7.8h, v5.h[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      "fmla v8.8h, v6.8h, v0.h[1]\n"
+      "fmla v12.8h, v6.8h, v1.h[1]\n"
+      "fmla v16.8h, v6.8h, v2.h[1]\n"
+      "fmla v20.8h, v6.8h, v3.h[1]\n"
+      "fmla v24.8h, v6.8h, v4.h[1]\n"
+      "fmla v28.8h, v6.8h, v5.h[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      "fmla v9.8h, v7.8h, v0.h[1]\n"
+      "fmla v13.8h, v7.8h, v1.h[1]\n"
+      "fmla v17.8h, v7.8h, v2.h[1]\n"
+      "fmla v21.8h, v7.8h, v3.h[1]\n"
+      "fmla v25.8h, v7.8h, v4.h[1]\n"
+      "fmla v29.8h, v7.8h, v5.h[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      "fmla v10.8h, v6.8h, v0.h[1]\n"
+      "fmla v14.8h, v6.8h, v1.h[1]\n"
+      "fmla v18.8h, v6.8h, v2.h[1]\n"
+      "fmla v22.8h, v6.8h, v3.h[1]\n"
+      "fmla v26.8h, v6.8h, v4.h[1]\n"
+      "fmla v30.8h, v6.8h, v5.h[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      "fmla v11.8h, v7.8h, v0.h[1]\n"
+      "fmla v15.8h, v7.8h, v1.h[1]\n"
+      "fmla v19.8h, v7.8h, v2.h[1]\n"
+      "fmla v23.8h, v7.8h, v3.h[1]\n"
+      "fmla v27.8h, v7.8h, v4.h[1]\n"
+      "fmla v31.8h, v7.8h, v5.h[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      "fmla v8.8h, v6.8h, v0.h[2]\n"
+      "fmla v12.8h, v6.8h, v1.h[2]\n"
+      "fmla v16.8h, v6.8h, v2.h[2]\n"
+      "fmla v20.8h, v6.8h, v3.h[2]\n"
+      "fmla v24.8h, v6.8h, v4.h[2]\n"
+      "fmla v28.8h, v6.8h, v5.h[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      "fmla v9.8h, v7.8h, v0.h[2]\n"
+      "fmla v13.8h, v7.8h, v1.h[2]\n"
+      "fmla v17.8h, v7.8h, v2.h[2]\n"
+      "fmla v21.8h, v7.8h, v3.h[2]\n"
+      "fmla v25.8h, v7.8h, v4.h[2]\n"
+      "fmla v29.8h, v7.8h, v5.h[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      "fmla v10.8h, v6.8h, v0.h[2]\n"
+      "fmla v14.8h, v6.8h, v1.h[2]\n"
+      "fmla v18.8h, v6.8h, v2.h[2]\n"
+      "fmla v22.8h, v6.8h, v3.h[2]\n"
+      "fmla v26.8h, v6.8h, v4.h[2]\n"
+      "fmla v30.8h, v6.8h, v5.h[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      "fmla v11.8h, v7.8h, v0.h[2]\n"
+      "fmla v15.8h, v7.8h, v1.h[2]\n"
+      "fmla v19.8h, v7.8h, v2.h[2]\n"
+      "fmla v23.8h, v7.8h, v3.h[2]\n"
+      "fmla v27.8h, v7.8h, v4.h[2]\n"
+      "fmla v31.8h, v7.8h, v5.h[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      "fmla v8.8h, v6.8h, v0.h[3]\n"
+      "fmla v12.8h, v6.8h, v1.h[3]\n"
+      "fmla v16.8h, v6.8h, v2.h[3]\n"
+      "fmla v20.8h, v6.8h, v3.h[3]\n"
+      "fmla v24.8h, v6.8h, v4.h[3]\n"
+      "fmla v28.8h, v6.8h, v5.h[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      "fmla v9.8h, v7.8h, v0.h[3]\n"
+      "fmla v13.8h, v7.8h, v1.h[3]\n"
+      "fmla v17.8h, v7.8h, v2.h[3]\n"
+      "fmla v21.8h, v7.8h, v3.h[3]\n"
+      "fmla v25.8h, v7.8h, v4.h[3]\n"
+      "fmla v29.8h, v7.8h, v5.h[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "fmla v10.8h, v6.8h, v0.h[3]\n"
+      "fmla v14.8h, v6.8h, v1.h[3]\n"
+      "fmla v18.8h, v6.8h, v2.h[3]\n"
+      "fmla v22.8h, v6.8h, v3.h[3]\n"
+      "fmla v26.8h, v6.8h, v4.h[3]\n"
+      "fmla v30.8h, v6.8h, v5.h[3]\n"
+      "ldr q6, [x15, #0x100]\n"
+      "fmla v11.8h, v7.8h, v0.h[3]\n"
+      "fmla v15.8h, v7.8h, v1.h[3]\n"
+      "fmla v19.8h, v7.8h, v2.h[3]\n"
+      "fmla v23.8h, v7.8h, v3.h[3]\n"
+      "fmla v27.8h, v7.8h, v4.h[3]\n"
+      "fmla v31.8h, v7.8h, v5.h[3]\n"
+      "ldr q7, [x15, #0x110]\n"
+      "fmla v8.8h, v6.8h, v0.h[4]\n"
+      "fmla v12.8h, v6.8h, v1.h[4]\n"
+      "fmla v16.8h, v6.8h, v2.h[4]\n"
+      "fmla v20.8h, v6.8h, v3.h[4]\n"
+      "fmla v24.8h, v6.8h, v4.h[4]\n"
+      "fmla v28.8h, v6.8h, v5.h[4]\n"
+      "ldr q6, [x15, #0x120]\n"
+      "fmla v9.8h, v7.8h, v0.h[4]\n"
+      "fmla v13.8h, v7.8h, v1.h[4]\n"
+      "fmla v17.8h, v7.8h, v2.h[4]\n"
+      "fmla v21.8h, v7.8h, v3.h[4]\n"
+      "fmla v25.8h, v7.8h, v4.h[4]\n"
+      "fmla v29.8h, v7.8h, v5.h[4]\n"
+      "ldr q7, [x15, #0x130]\n"
+      "fmla v10.8h, v6.8h, v0.h[4]\n"
+      "fmla v14.8h, v6.8h, v1.h[4]\n"
+      "fmla v18.8h, v6.8h, v2.h[4]\n"
+      "fmla v22.8h, v6.8h, v3.h[4]\n"
+      "fmla v26.8h, v6.8h, v4.h[4]\n"
+      "fmla v30.8h, v6.8h, v5.h[4]\n"
+      "ldr q6, [x15, #0x140]\n"
+      "fmla v11.8h, v7.8h, v0.h[4]\n"
+      "fmla v15.8h, v7.8h, v1.h[4]\n"
+      "fmla v19.8h, v7.8h, v2.h[4]\n"
+      "fmla v23.8h, v7.8h, v3.h[4]\n"
+      "fmla v27.8h, v7.8h, v4.h[4]\n"
+      "fmla v31.8h, v7.8h, v5.h[4]\n"
+      "ldr q7, [x15, #0x150]\n"
+      "fmla v8.8h, v6.8h, v0.h[5]\n"
+      "fmla v12.8h, v6.8h, v1.h[5]\n"
+      "fmla v16.8h, v6.8h, v2.h[5]\n"
+      "fmla v20.8h, v6.8h, v3.h[5]\n"
+      "fmla v24.8h, v6.8h, v4.h[5]\n"
+      "fmla v28.8h, v6.8h, v5.h[5]\n"
+      "ldr q6, [x15, #0x160]\n"
+      "fmla v9.8h, v7.8h, v0.h[5]\n"
+      "fmla v13.8h, v7.8h, v1.h[5]\n"
+      "fmla v17.8h, v7.8h, v2.h[5]\n"
+      "fmla v21.8h, v7.8h, v3.h[5]\n"
+      "fmla v25.8h, v7.8h, v4.h[5]\n"
+      "fmla v29.8h, v7.8h, v5.h[5]\n"
+      "ldr q7, [x15, #0x170]\n"
+      "fmla v10.8h, v6.8h, v0.h[5]\n"
+      "fmla v14.8h, v6.8h, v1.h[5]\n"
+      "fmla v18.8h, v6.8h, v2.h[5]\n"
+      "fmla v22.8h, v6.8h, v3.h[5]\n"
+      "fmla v26.8h, v6.8h, v4.h[5]\n"
+      "fmla v30.8h, v6.8h, v5.h[5]\n"
+      "ldr q6, [x15, #0x180]\n"
+      "fmla v11.8h, v7.8h, v0.h[5]\n"
+      "fmla v15.8h, v7.8h, v1.h[5]\n"
+      "fmla v19.8h, v7.8h, v2.h[5]\n"
+      "fmla v23.8h, v7.8h, v3.h[5]\n"
+      "fmla v27.8h, v7.8h, v4.h[5]\n"
+      "fmla v31.8h, v7.8h, v5.h[5]\n"
+      "ldr q7, [x15, #0x190]\n"
+      "fmla v8.8h, v6.8h, v0.h[6]\n"
+      "fmla v12.8h, v6.8h, v1.h[6]\n"
+      "fmla v16.8h, v6.8h, v2.h[6]\n"
+      "fmla v20.8h, v6.8h, v3.h[6]\n"
+      "fmla v24.8h, v6.8h, v4.h[6]\n"
+      "fmla v28.8h, v6.8h, v5.h[6]\n"
+      "ldr q6, [x15, #0x1a0]\n"
+      "fmla v9.8h, v7.8h, v0.h[6]\n"
+      "fmla v13.8h, v7.8h, v1.h[6]\n"
+      "fmla v17.8h, v7.8h, v2.h[6]\n"
+      "fmla v21.8h, v7.8h, v3.h[6]\n"
+      "fmla v25.8h, v7.8h, v4.h[6]\n"
+      "fmla v29.8h, v7.8h, v5.h[6]\n"
+      "ldr q7, [x15, #0x1b0]\n"
+      "fmla v10.8h, v6.8h, v0.h[6]\n"
+      "fmla v14.8h, v6.8h, v1.h[6]\n"
+      "fmla v18.8h, v6.8h, v2.h[6]\n"
+      "fmla v22.8h, v6.8h, v3.h[6]\n"
+      "fmla v26.8h, v6.8h, v4.h[6]\n"
+      "fmla v30.8h, v6.8h, v5.h[6]\n"
+      "ldr q6, [x15, #0x1c0]\n"
+      "fmla v11.8h, v7.8h, v0.h[6]\n"
+      "fmla v15.8h, v7.8h, v1.h[6]\n"
+      "fmla v19.8h, v7.8h, v2.h[6]\n"
+      "fmla v23.8h, v7.8h, v3.h[6]\n"
+      "fmla v27.8h, v7.8h, v4.h[6]\n"
+      "fmla v31.8h, v7.8h, v5.h[6]\n"
+      "ldr q7, [x15, #0x1d0]\n"
+      "fmla v8.8h, v6.8h, v0.h[7]\n"
+      "fmla v12.8h, v6.8h, v1.h[7]\n"
+      "fmla v16.8h, v6.8h, v2.h[7]\n"
+      "fmla v20.8h, v6.8h, v3.h[7]\n"
+      "fmla v24.8h, v6.8h, v4.h[7]\n"
+      "fmla v28.8h, v6.8h, v5.h[7]\n"
+      "ldr q6, [x15, #0x1e0]\n"
+      "fmla v9.8h, v7.8h, v0.h[7]\n"
+      "fmla v13.8h, v7.8h, v1.h[7]\n"
+      "fmla v17.8h, v7.8h, v2.h[7]\n"
+      "fmla v21.8h, v7.8h, v3.h[7]\n"
+      "fmla v25.8h, v7.8h, v4.h[7]\n"
+      "fmla v29.8h, v7.8h, v5.h[7]\n"
+      "ldr q7, [x15, #0x1f0]\n"
+      "fmla v10.8h, v6.8h, v0.h[7]\n"
+      "add x15, x15, #0x200\n"
+      "fmla v14.8h, v6.8h, v1.h[7]\n"
+      "fmla v18.8h, v6.8h, v2.h[7]\n"
+      "fmla v22.8h, v6.8h, v3.h[7]\n"
+      "fmla v26.8h, v6.8h, v4.h[7]\n"
+      "fmla v30.8h, v6.8h, v5.h[7]\n"
+      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "fmla v15.8h, v7.8h, v1.h[7]\n"
+      "fmla v19.8h, v7.8h, v2.h[7]\n"
+      "fmla v23.8h, v7.8h, v3.h[7]\n"
+      "fmla v27.8h, v7.8h, v4.h[7]\n"
+      "fmla v31.8h, v7.8h, v5.h[7]\n"
+      "bge 277b\n"
+      "278:"  // Height 6: Multiply loop: Single iteration only
+      "sub x11, x11, #0x8\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q4, [x22, #0x0]\n"
+      "ldr q5, [x20, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.8h, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.8h, v6.8h, v1.h[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla v16.8h, v6.8h, v2.h[0]\n"
+      "add x28, x28, #0x10\n"
+      "fmla v20.8h, v6.8h, v3.h[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      "fmla v24.8h, v6.8h, v4.h[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "add x24, x24, #0x10\n"
+      "fmla v28.8h, v6.8h, v5.h[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.8h, v7.8h, v0.h[0]\n"
+      "add x22, x22, #0x10\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "fmla v13.8h, v7.8h, v1.h[0]\n"
+      "add x20, x20, #0x10\n"
+      "fmla v17.8h, v7.8h, v2.h[0]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "fmla v21.8h, v7.8h, v3.h[0]\n"
+      "fmla v25.8h, v7.8h, v4.h[0]\n"
+      "fmla v29.8h, v7.8h, v5.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.8h, v6.8h, v0.h[0]\n"
+      "fmla v14.8h, v6.8h, v1.h[0]\n"
+      "fmla v18.8h, v6.8h, v2.h[0]\n"
+      "fmla v22.8h, v6.8h, v3.h[0]\n"
+      "fmla v26.8h, v6.8h, v4.h[0]\n"
+      "fmla v30.8h, v6.8h, v5.h[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      "fmla v11.8h, v7.8h, v0.h[0]\n"
+      "fmla v15.8h, v7.8h, v1.h[0]\n"
+      "fmla v19.8h, v7.8h, v2.h[0]\n"
+      "fmla v23.8h, v7.8h, v3.h[0]\n"
+      "fmla v27.8h, v7.8h, v4.h[0]\n"
+      "fmla v31.8h, v7.8h, v5.h[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      "fmla v8.8h, v6.8h, v0.h[1]\n"
+      "fmla v12.8h, v6.8h, v1.h[1]\n"
+      "fmla v16.8h, v6.8h, v2.h[1]\n"
+      "fmla v20.8h, v6.8h, v3.h[1]\n"
+      "fmla v24.8h, v6.8h, v4.h[1]\n"
+      "fmla v28.8h, v6.8h, v5.h[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      "fmla v9.8h, v7.8h, v0.h[1]\n"
+      "fmla v13.8h, v7.8h, v1.h[1]\n"
+      "fmla v17.8h, v7.8h, v2.h[1]\n"
+      "fmla v21.8h, v7.8h, v3.h[1]\n"
+      "fmla v25.8h, v7.8h, v4.h[1]\n"
+      "fmla v29.8h, v7.8h, v5.h[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      "fmla v10.8h, v6.8h, v0.h[1]\n"
+      "fmla v14.8h, v6.8h, v1.h[1]\n"
+      "fmla v18.8h, v6.8h, v2.h[1]\n"
+      "fmla v22.8h, v6.8h, v3.h[1]\n"
+      "fmla v26.8h, v6.8h, v4.h[1]\n"
+      "fmla v30.8h, v6.8h, v5.h[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      "fmla v11.8h, v7.8h, v0.h[1]\n"
+      "fmla v15.8h, v7.8h, v1.h[1]\n"
+      "fmla v19.8h, v7.8h, v2.h[1]\n"
+      "fmla v23.8h, v7.8h, v3.h[1]\n"
+      "fmla v27.8h, v7.8h, v4.h[1]\n"
+      "fmla v31.8h, v7.8h, v5.h[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      "fmla v8.8h, v6.8h, v0.h[2]\n"
+      "fmla v12.8h, v6.8h, v1.h[2]\n"
+      "fmla v16.8h, v6.8h, v2.h[2]\n"
+      "fmla v20.8h, v6.8h, v3.h[2]\n"
+      "fmla v24.8h, v6.8h, v4.h[2]\n"
+      "fmla v28.8h, v6.8h, v5.h[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      "fmla v9.8h, v7.8h, v0.h[2]\n"
+      "fmla v13.8h, v7.8h, v1.h[2]\n"
+      "fmla v17.8h, v7.8h, v2.h[2]\n"
+      "fmla v21.8h, v7.8h, v3.h[2]\n"
+      "fmla v25.8h, v7.8h, v4.h[2]\n"
+      "fmla v29.8h, v7.8h, v5.h[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      "fmla v10.8h, v6.8h, v0.h[2]\n"
+      "fmla v14.8h, v6.8h, v1.h[2]\n"
+      "fmla v18.8h, v6.8h, v2.h[2]\n"
+      "fmla v22.8h, v6.8h, v3.h[2]\n"
+      "fmla v26.8h, v6.8h, v4.h[2]\n"
+      "fmla v30.8h, v6.8h, v5.h[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      "fmla v11.8h, v7.8h, v0.h[2]\n"
+      "fmla v15.8h, v7.8h, v1.h[2]\n"
+      "fmla v19.8h, v7.8h, v2.h[2]\n"
+      "fmla v23.8h, v7.8h, v3.h[2]\n"
+      "fmla v27.8h, v7.8h, v4.h[2]\n"
+      "fmla v31.8h, v7.8h, v5.h[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      "fmla v8.8h, v6.8h, v0.h[3]\n"
+      "fmla v12.8h, v6.8h, v1.h[3]\n"
+      "fmla v16.8h, v6.8h, v2.h[3]\n"
+      "fmla v20.8h, v6.8h, v3.h[3]\n"
+      "fmla v24.8h, v6.8h, v4.h[3]\n"
+      "fmla v28.8h, v6.8h, v5.h[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      "fmla v9.8h, v7.8h, v0.h[3]\n"
+      "fmla v13.8h, v7.8h, v1.h[3]\n"
+      "fmla v17.8h, v7.8h, v2.h[3]\n"
+      "fmla v21.8h, v7.8h, v3.h[3]\n"
+      "fmla v25.8h, v7.8h, v4.h[3]\n"
+      "fmla v29.8h, v7.8h, v5.h[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "fmla v10.8h, v6.8h, v0.h[3]\n"
+      "fmla v14.8h, v6.8h, v1.h[3]\n"
+      "fmla v18.8h, v6.8h, v2.h[3]\n"
+      "fmla v22.8h, v6.8h, v3.h[3]\n"
+      "fmla v26.8h, v6.8h, v4.h[3]\n"
+      "fmla v30.8h, v6.8h, v5.h[3]\n"
+      "ldr q6, [x15, #0x100]\n"
+      "fmla v11.8h, v7.8h, v0.h[3]\n"
+      "fmla v15.8h, v7.8h, v1.h[3]\n"
+      "fmla v19.8h, v7.8h, v2.h[3]\n"
+      "fmla v23.8h, v7.8h, v3.h[3]\n"
+      "fmla v27.8h, v7.8h, v4.h[3]\n"
+      "fmla v31.8h, v7.8h, v5.h[3]\n"
+      "ldr q7, [x15, #0x110]\n"
+      "fmla v8.8h, v6.8h, v0.h[4]\n"
+      "fmla v12.8h, v6.8h, v1.h[4]\n"
+      "fmla v16.8h, v6.8h, v2.h[4]\n"
+      "fmla v20.8h, v6.8h, v3.h[4]\n"
+      "fmla v24.8h, v6.8h, v4.h[4]\n"
+      "fmla v28.8h, v6.8h, v5.h[4]\n"
+      "ldr q6, [x15, #0x120]\n"
+      "fmla v9.8h, v7.8h, v0.h[4]\n"
+      "fmla v13.8h, v7.8h, v1.h[4]\n"
+      "fmla v17.8h, v7.8h, v2.h[4]\n"
+      "fmla v21.8h, v7.8h, v3.h[4]\n"
+      "fmla v25.8h, v7.8h, v4.h[4]\n"
+      "fmla v29.8h, v7.8h, v5.h[4]\n"
+      "ldr q7, [x15, #0x130]\n"
+      "fmla v10.8h, v6.8h, v0.h[4]\n"
+      "fmla v14.8h, v6.8h, v1.h[4]\n"
+      "fmla v18.8h, v6.8h, v2.h[4]\n"
+      "fmla v22.8h, v6.8h, v3.h[4]\n"
+      "fmla v26.8h, v6.8h, v4.h[4]\n"
+      "fmla v30.8h, v6.8h, v5.h[4]\n"
+      "ldr q6, [x15, #0x140]\n"
+      "fmla v11.8h, v7.8h, v0.h[4]\n"
+      "fmla v15.8h, v7.8h, v1.h[4]\n"
+      "fmla v19.8h, v7.8h, v2.h[4]\n"
+      "fmla v23.8h, v7.8h, v3.h[4]\n"
+      "fmla v27.8h, v7.8h, v4.h[4]\n"
+      "fmla v31.8h, v7.8h, v5.h[4]\n"
+      "ldr q7, [x15, #0x150]\n"
+      "fmla v8.8h, v6.8h, v0.h[5]\n"
+      "fmla v12.8h, v6.8h, v1.h[5]\n"
+      "fmla v16.8h, v6.8h, v2.h[5]\n"
+      "fmla v20.8h, v6.8h, v3.h[5]\n"
+      "fmla v24.8h, v6.8h, v4.h[5]\n"
+      "fmla v28.8h, v6.8h, v5.h[5]\n"
+      "ldr q6, [x15, #0x160]\n"
+      "fmla v9.8h, v7.8h, v0.h[5]\n"
+      "fmla v13.8h, v7.8h, v1.h[5]\n"
+      "fmla v17.8h, v7.8h, v2.h[5]\n"
+      "fmla v21.8h, v7.8h, v3.h[5]\n"
+      "fmla v25.8h, v7.8h, v4.h[5]\n"
+      "fmla v29.8h, v7.8h, v5.h[5]\n"
+      "ldr q7, [x15, #0x170]\n"
+      "fmla v10.8h, v6.8h, v0.h[5]\n"
+      "fmla v14.8h, v6.8h, v1.h[5]\n"
+      "fmla v18.8h, v6.8h, v2.h[5]\n"
+      "fmla v22.8h, v6.8h, v3.h[5]\n"
+      "fmla v26.8h, v6.8h, v4.h[5]\n"
+      "fmla v30.8h, v6.8h, v5.h[5]\n"
+      "ldr q6, [x15, #0x180]\n"
+      "fmla v11.8h, v7.8h, v0.h[5]\n"
+      "fmla v15.8h, v7.8h, v1.h[5]\n"
+      "fmla v19.8h, v7.8h, v2.h[5]\n"
+      "fmla v23.8h, v7.8h, v3.h[5]\n"
+      "fmla v27.8h, v7.8h, v4.h[5]\n"
+      "fmla v31.8h, v7.8h, v5.h[5]\n"
+      "ldr q7, [x15, #0x190]\n"
+      "fmla v8.8h, v6.8h, v0.h[6]\n"
+      "fmla v12.8h, v6.8h, v1.h[6]\n"
+      "fmla v16.8h, v6.8h, v2.h[6]\n"
+      "fmla v20.8h, v6.8h, v3.h[6]\n"
+      "fmla v24.8h, v6.8h, v4.h[6]\n"
+      "fmla v28.8h, v6.8h, v5.h[6]\n"
+      "ldr q6, [x15, #0x1a0]\n"
+      "fmla v9.8h, v7.8h, v0.h[6]\n"
+      "fmla v13.8h, v7.8h, v1.h[6]\n"
+      "fmla v17.8h, v7.8h, v2.h[6]\n"
+      "fmla v21.8h, v7.8h, v3.h[6]\n"
+      "fmla v25.8h, v7.8h, v4.h[6]\n"
+      "fmla v29.8h, v7.8h, v5.h[6]\n"
+      "ldr q7, [x15, #0x1b0]\n"
+      "fmla v10.8h, v6.8h, v0.h[6]\n"
+      "fmla v14.8h, v6.8h, v1.h[6]\n"
+      "fmla v18.8h, v6.8h, v2.h[6]\n"
+      "fmla v22.8h, v6.8h, v3.h[6]\n"
+      "fmla v26.8h, v6.8h, v4.h[6]\n"
+      "fmla v30.8h, v6.8h, v5.h[6]\n"
+      "ldr q6, [x15, #0x1c0]\n"
+      "fmla v11.8h, v7.8h, v0.h[6]\n"
+      "fmla v15.8h, v7.8h, v1.h[6]\n"
+      "fmla v19.8h, v7.8h, v2.h[6]\n"
+      "fmla v23.8h, v7.8h, v3.h[6]\n"
+      "fmla v27.8h, v7.8h, v4.h[6]\n"
+      "fmla v31.8h, v7.8h, v5.h[6]\n"
+      "ldr q7, [x15, #0x1d0]\n"
+      "fmla v8.8h, v6.8h, v0.h[7]\n"
+      "fmla v12.8h, v6.8h, v1.h[7]\n"
+      "fmla v16.8h, v6.8h, v2.h[7]\n"
+      "fmla v20.8h, v6.8h, v3.h[7]\n"
+      "fmla v24.8h, v6.8h, v4.h[7]\n"
+      "fmla v28.8h, v6.8h, v5.h[7]\n"
+      "ldr q6, [x15, #0x1e0]\n"
+      "fmla v9.8h, v7.8h, v0.h[7]\n"
+      "fmla v13.8h, v7.8h, v1.h[7]\n"
+      "fmla v17.8h, v7.8h, v2.h[7]\n"
+      "fmla v21.8h, v7.8h, v3.h[7]\n"
+      "fmla v25.8h, v7.8h, v4.h[7]\n"
+      "fmla v29.8h, v7.8h, v5.h[7]\n"
+      "ldr q7, [x15, #0x1f0]\n"
+      "fmla v10.8h, v6.8h, v0.h[7]\n"
+      "add x15, x15, #0x200\n"
+      "fmla v14.8h, v6.8h, v1.h[7]\n"
+      "fmla v18.8h, v6.8h, v2.h[7]\n"
+      "fmla v22.8h, v6.8h, v3.h[7]\n"
+      "fmla v26.8h, v6.8h, v4.h[7]\n"
+      "fmla v30.8h, v6.8h, v5.h[7]\n"
+      "fmla v11.8h, v7.8h, v0.h[7]\n"
+      "fmla v15.8h, v7.8h, v1.h[7]\n"
+      "fmla v19.8h, v7.8h, v2.h[7]\n"
+      "fmla v23.8h, v7.8h, v3.h[7]\n"
+      "fmla v27.8h, v7.8h, v4.h[7]\n"
+      "fmla v31.8h, v7.8h, v5.h[7]\n"
+      "279:"  // Height 6: Multiply loop: Main loop skip
+      "cbz x11, 281f\n"
+      "280:"  // Height 6: Multiply loop: Odd block loop
+      "ldr h0, [x10], #0x2\n"
+      "ldr h1, [x28], #0x2\n"
+      "ldr h2, [x26], #0x2\n"
+      "ldr h3, [x24], #0x2\n"
+      "ldr h4, [x22], #0x2\n"
+      "ldr h5, [x20], #0x2\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.8h, v6.8h, v0.h[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.8h, v6.8h, v1.h[0]\n"
+      "sub x11, x11, #0x1\n"
+      "fmla v16.8h, v6.8h, v2.h[0]\n"
+      "fmla v20.8h, v6.8h, v3.h[0]\n"
+      "fmla v24.8h, v6.8h, v4.h[0]\n"
+      "fmla v28.8h, v6.8h, v5.h[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.8h, v7.8h, v0.h[0]\n"
+      "fmla v13.8h, v7.8h, v1.h[0]\n"
+      "fmla v17.8h, v7.8h, v2.h[0]\n"
+      "fmla v21.8h, v7.8h, v3.h[0]\n"
+      "fmla v25.8h, v7.8h, v4.h[0]\n"
+      "fmla v29.8h, v7.8h, v5.h[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.8h, v6.8h, v0.h[0]\n"
+      "add x15, x15, #0x40\n"
+      "fmla v14.8h, v6.8h, v1.h[0]\n"
+      "fmla v18.8h, v6.8h, v2.h[0]\n"
+      "fmla v22.8h, v6.8h, v3.h[0]\n"
+      "fmla v26.8h, v6.8h, v4.h[0]\n"
+      "fmla v30.8h, v6.8h, v5.h[0]\n"
+      "fmla v11.8h, v7.8h, v0.h[0]\n"
+      "fmla v15.8h, v7.8h, v1.h[0]\n"
+      "fmla v19.8h, v7.8h, v2.h[0]\n"
+      "fmla v23.8h, v7.8h, v3.h[0]\n"
+      "fmla v27.8h, v7.8h, v4.h[0]\n"
+      "fmla v31.8h, v7.8h, v5.h[0]\n"
+      "cbnz x11, 280b\n"
+      "281:"  // Height 6: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 274b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "prfm pstl1keep, [x21, #0x0]\n"
+      "tbz %x[flags], #1, 282f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v1.8h }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v0.8h }, [x19]\n"
+      "fmin v8.8h, v8.8h, v0.8h\n"
+      "fmin v9.8h, v9.8h, v0.8h\n"
+      "fmin v10.8h, v10.8h, v0.8h\n"
+      "fmin v11.8h, v11.8h, v0.8h\n"
+      "fmax v8.8h, v8.8h, v1.8h\n"
+      "fmax v9.8h, v9.8h, v1.8h\n"
+      "fmax v10.8h, v10.8h, v1.8h\n"
+      "fmax v11.8h, v11.8h, v1.8h\n"
+      "fmin v12.8h, v12.8h, v0.8h\n"
+      "fmin v13.8h, v13.8h, v0.8h\n"
+      "fmin v14.8h, v14.8h, v0.8h\n"
+      "fmax v12.8h, v12.8h, v1.8h\n"
+      "fmax v13.8h, v13.8h, v1.8h\n"
+      "fmax v14.8h, v14.8h, v1.8h\n"
+      "fmin v15.8h, v15.8h, v0.8h\n"
+      "fmin v16.8h, v16.8h, v0.8h\n"
+      "fmin v17.8h, v17.8h, v0.8h\n"
+      "fmax v15.8h, v15.8h, v1.8h\n"
+      "fmax v16.8h, v16.8h, v1.8h\n"
+      "fmax v17.8h, v17.8h, v1.8h\n"
+      "fmin v18.8h, v18.8h, v0.8h\n"
+      "fmin v19.8h, v19.8h, v0.8h\n"
+      "fmin v20.8h, v20.8h, v0.8h\n"
+      "fmax v18.8h, v18.8h, v1.8h\n"
+      "fmax v19.8h, v19.8h, v1.8h\n"
+      "fmax v20.8h, v20.8h, v1.8h\n"
+      "fmin v21.8h, v21.8h, v0.8h\n"
+      "fmin v22.8h, v22.8h, v0.8h\n"
+      "fmin v23.8h, v23.8h, v0.8h\n"
+      "fmax v21.8h, v21.8h, v1.8h\n"
+      "fmax v22.8h, v22.8h, v1.8h\n"
+      "fmax v23.8h, v23.8h, v1.8h\n"
+      "fmin v24.8h, v24.8h, v0.8h\n"
+      "fmin v25.8h, v25.8h, v0.8h\n"
+      "fmin v26.8h, v26.8h, v0.8h\n"
+      "fmax v24.8h, v24.8h, v1.8h\n"
+      "fmax v25.8h, v25.8h, v1.8h\n"
+      "fmax v26.8h, v26.8h, v1.8h\n"
+      "fmin v27.8h, v27.8h, v0.8h\n"
+      "fmin v28.8h, v28.8h, v0.8h\n"
+      "fmin v29.8h, v29.8h, v0.8h\n"
+      "fmax v27.8h, v27.8h, v1.8h\n"
+      "fmax v28.8h, v28.8h, v1.8h\n"
+      "fmax v29.8h, v29.8h, v1.8h\n"
+      "fmin v30.8h, v30.8h, v0.8h\n"
+      "fmin v31.8h, v31.8h, v0.8h\n"
+      "fmax v30.8h, v30.8h, v1.8h\n"
+      "fmax v31.8h, v31.8h, v1.8h\n"
+      "282:"  // Height 6: No activation
+      "cmp x16, #0x20\n"
+      "bge 299f\n"
+      "tbz x16, #4, 290f\n"
+      "st1 { v8.8h }, [x13], #0x10\n"
+      "st1 { v9.8h }, [x13], #0x10\n"
+      "st1 { v12.8h }, [x9], #0x10\n"
+      "st1 { v13.8h }, [x9], #0x10\n"
+      "st1 { v16.8h }, [x27], #0x10\n"
+      "st1 { v17.8h }, [x27], #0x10\n"
+      "st1 { v20.8h }, [x25], #0x10\n"
+      "st1 { v21.8h }, [x25], #0x10\n"
+      "st1 { v24.8h }, [x23], #0x10\n"
+      "st1 { v25.8h }, [x23], #0x10\n"
+      "st1 { v28.8h }, [x21], #0x10\n"
+      "st1 { v29.8h }, [x21], #0x10\n"
+      "tbz x16, #3, 286f\n"
+      "st1 { v10.8h }, [x13], #0x10\n"
+      "st1 { v14.8h }, [x9], #0x10\n"
+      "st1 { v18.8h }, [x27], #0x10\n"
+      "st1 { v22.8h }, [x25], #0x10\n"
+      "st1 { v26.8h }, [x23], #0x10\n"
+      "st1 { v30.8h }, [x21], #0x10\n"
+      "tbz x16, #2, 284f\n"
+      "str d11, [x13], #0x8\n"
+      "str d15, [x9], #0x8\n"
+      "str d19, [x27], #0x8\n"
+      "str d23, [x25], #0x8\n"
+      "str d27, [x23], #0x8\n"
+      "str d31, [x21], #0x8\n"
+      "tbz x16, #1, 283f\n"
+      "st1 { v11.s }[2], [x13], #0x4\n"
+      "st1 { v15.s }[2], [x9], #0x4\n"
+      "st1 { v19.s }[2], [x27], #0x4\n"
+      "st1 { v23.s }[2], [x25], #0x4\n"
+      "st1 { v27.s }[2], [x23], #0x4\n"
+      "st1 { v31.s }[2], [x21], #0x4\n"
+      "tbz x16, #0, 298f\n"
+      "st1 { v11.h }[6], [x13]\n"
+      "st1 { v15.h }[6], [x9]\n"
+      "st1 { v19.h }[6], [x27]\n"
+      "st1 { v23.h }[6], [x25]\n"
+      "st1 { v27.h }[6], [x23]\n"
+      "st1 { v31.h }[6], [x21]\n"
+      "b 298f\n"
+      "283:"  // Height 6: Partial direct writeback: partial_1_28
+      "tbz x16, #0, 298f\n"
+      "st1 { v11.h }[4], [x13]\n"
+      "st1 { v15.h }[4], [x9]\n"
+      "st1 { v19.h }[4], [x27]\n"
+      "st1 { v23.h }[4], [x25]\n"
+      "st1 { v27.h }[4], [x23]\n"
+      "st1 { v31.h }[4], [x21]\n"
+      "b 298f\n"
+      "284:"  // Height 6: Partial direct writeback: partial_2_24
+      "tbz x16, #1, 285f\n"
+      "str s11, [x13], #0x4\n"
+      "str s15, [x9], #0x4\n"
+      "str s19, [x27], #0x4\n"
+      "str s23, [x25], #0x4\n"
+      "str s27, [x23], #0x4\n"
+      "str s31, [x21], #0x4\n"
+      "tbz x16, #0, 298f\n"
+      "st1 { v11.h }[2], [x13]\n"
+      "st1 { v15.h }[2], [x9]\n"
+      "st1 { v19.h }[2], [x27]\n"
+      "st1 { v23.h }[2], [x25]\n"
+      "st1 { v27.h }[2], [x23]\n"
+      "st1 { v31.h }[2], [x21]\n"
+      "b 298f\n"
+      "285:"  // Height 6: Partial direct writeback: partial_1_24
+      "tbz x16, #0, 298f\n"
+      "str h11, [x13, #0x0]\n"
+      "str h15, [x9, #0x0]\n"
+      "str h19, [x27, #0x0]\n"
+      "str h23, [x25, #0x0]\n"
+      "str h27, [x23, #0x0]\n"
+      "str h31, [x21, #0x0]\n"
+      "b 298f\n"
+      "286:"  // Height 6: Partial direct writeback: partial_4_16
+      "tbz x16, #2, 288f\n"
+      "str d10, [x13], #0x8\n"
+      "str d14, [x9], #0x8\n"
+      "str d18, [x27], #0x8\n"
+      "str d22, [x25], #0x8\n"
+      "str d26, [x23], #0x8\n"
+      "str d30, [x21], #0x8\n"
+      "tbz x16, #1, 287f\n"
+      "st1 { v10.s }[2], [x13], #0x4\n"
+      "st1 { v14.s }[2], [x9], #0x4\n"
+      "st1 { v18.s }[2], [x27], #0x4\n"
+      "st1 { v22.s }[2], [x25], #0x4\n"
+      "st1 { v26.s }[2], [x23], #0x4\n"
+      "st1 { v30.s }[2], [x21], #0x4\n"
+      "tbz x16, #0, 298f\n"
+      "st1 { v10.h }[6], [x13]\n"
+      "st1 { v14.h }[6], [x9]\n"
+      "st1 { v18.h }[6], [x27]\n"
+      "st1 { v22.h }[6], [x25]\n"
+      "st1 { v26.h }[6], [x23]\n"
+      "st1 { v30.h }[6], [x21]\n"
+      "b 298f\n"
+      "287:"  // Height 6: Partial direct writeback: partial_1_20
+      "tbz x16, #0, 298f\n"
+      "st1 { v10.h }[4], [x13]\n"
+      "st1 { v14.h }[4], [x9]\n"
+      "st1 { v18.h }[4], [x27]\n"
+      "st1 { v22.h }[4], [x25]\n"
+      "st1 { v26.h }[4], [x23]\n"
+      "st1 { v30.h }[4], [x21]\n"
+      "b 298f\n"
+      "288:"  // Height 6: Partial direct writeback: partial_2_16
+      "tbz x16, #1, 289f\n"
+      "str s10, [x13], #0x4\n"
+      "str s14, [x9], #0x4\n"
+      "str s18, [x27], #0x4\n"
+      "str s22, [x25], #0x4\n"
+      "str s26, [x23], #0x4\n"
+      "str s30, [x21], #0x4\n"
+      "tbz x16, #0, 298f\n"
+      "st1 { v10.h }[2], [x13]\n"
+      "st1 { v14.h }[2], [x9]\n"
+      "st1 { v18.h }[2], [x27]\n"
+      "st1 { v22.h }[2], [x25]\n"
+      "st1 { v26.h }[2], [x23]\n"
+      "st1 { v30.h }[2], [x21]\n"
+      "b 298f\n"
+      "289:"  // Height 6: Partial direct writeback: partial_1_16
+      "tbz x16, #0, 298f\n"
+      "str h10, [x13, #0x0]\n"
+      "str h14, [x9, #0x0]\n"
+      "str h18, [x27, #0x0]\n"
+      "str h22, [x25, #0x0]\n"
+      "str h26, [x23, #0x0]\n"
+      "str h30, [x21, #0x0]\n"
+      "b 298f\n"
+      "290:"  // Height 6: Partial direct writeback: partial_8_0
+      "tbz x16, #3, 294f\n"
+      "st1 { v8.8h }, [x13], #0x10\n"
+      "st1 { v12.8h }, [x9], #0x10\n"
+      "st1 { v16.8h }, [x27], #0x10\n"
+      "st1 { v20.8h }, [x25], #0x10\n"
+      "st1 { v24.8h }, [x23], #0x10\n"
+      "st1 { v28.8h }, [x21], #0x10\n"
+      "tbz x16, #2, 292f\n"
+      "str d9, [x13], #0x8\n"
+      "str d13, [x9], #0x8\n"
+      "str d17, [x27], #0x8\n"
+      "str d21, [x25], #0x8\n"
+      "str d25, [x23], #0x8\n"
+      "str d29, [x21], #0x8\n"
+      "tbz x16, #1, 291f\n"
+      "st1 { v9.s }[2], [x13], #0x4\n"
+      "st1 { v13.s }[2], [x9], #0x4\n"
+      "st1 { v17.s }[2], [x27], #0x4\n"
+      "st1 { v21.s }[2], [x25], #0x4\n"
+      "st1 { v25.s }[2], [x23], #0x4\n"
+      "st1 { v29.s }[2], [x21], #0x4\n"
+      "tbz x16, #0, 298f\n"
+      "st1 { v9.h }[6], [x13]\n"
+      "st1 { v13.h }[6], [x9]\n"
+      "st1 { v17.h }[6], [x27]\n"
+      "st1 { v21.h }[6], [x25]\n"
+      "st1 { v25.h }[6], [x23]\n"
+      "st1 { v29.h }[6], [x21]\n"
+      "b 298f\n"
+      "291:"  // Height 6: Partial direct writeback: partial_1_12
+      "tbz x16, #0, 298f\n"
+      "st1 { v9.h }[4], [x13]\n"
+      "st1 { v13.h }[4], [x9]\n"
+      "st1 { v17.h }[4], [x27]\n"
+      "st1 { v21.h }[4], [x25]\n"
+      "st1 { v25.h }[4], [x23]\n"
+      "st1 { v29.h }[4], [x21]\n"
+      "b 298f\n"
+      "292:"  // Height 6: Partial direct writeback: partial_2_8
+      "tbz x16, #1, 293f\n"
+      "str s9, [x13], #0x4\n"
+      "str s13, [x9], #0x4\n"
+      "str s17, [x27], #0x4\n"
+      "str s21, [x25], #0x4\n"
+      "str s25, [x23], #0x4\n"
+      "str s29, [x21], #0x4\n"
+      "tbz x16, #0, 298f\n"
+      "st1 { v9.h }[2], [x13]\n"
+      "st1 { v13.h }[2], [x9]\n"
+      "st1 { v17.h }[2], [x27]\n"
+      "st1 { v21.h }[2], [x25]\n"
+      "st1 { v25.h }[2], [x23]\n"
+      "st1 { v29.h }[2], [x21]\n"
+      "b 298f\n"
+      "293:"  // Height 6: Partial direct writeback: partial_1_8
+      "tbz x16, #0, 298f\n"
+      "str h9, [x13, #0x0]\n"
+      "str h13, [x9, #0x0]\n"
+      "str h17, [x27, #0x0]\n"
+      "str h21, [x25, #0x0]\n"
+      "str h25, [x23, #0x0]\n"
+      "str h29, [x21, #0x0]\n"
+      "b 298f\n"
+      "294:"  // Height 6: Partial direct writeback: partial_4_0
+      "tbz x16, #2, 296f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "str d16, [x27], #0x8\n"
+      "str d20, [x25], #0x8\n"
+      "str d24, [x23], #0x8\n"
+      "str d28, [x21], #0x8\n"
+      "tbz x16, #1, 295f\n"
+      "st1 { v8.s }[2], [x13], #0x4\n"
+      "st1 { v12.s }[2], [x9], #0x4\n"
+      "st1 { v16.s }[2], [x27], #0x4\n"
+      "st1 { v20.s }[2], [x25], #0x4\n"
+      "st1 { v24.s }[2], [x23], #0x4\n"
+      "st1 { v28.s }[2], [x21], #0x4\n"
+      "tbz x16, #0, 298f\n"
+      "st1 { v8.h }[6], [x13]\n"
+      "st1 { v12.h }[6], [x9]\n"
+      "st1 { v16.h }[6], [x27]\n"
+      "st1 { v20.h }[6], [x25]\n"
+      "st1 { v24.h }[6], [x23]\n"
+      "st1 { v28.h }[6], [x21]\n"
+      "b 298f\n"
+      "295:"  // Height 6: Partial direct writeback: partial_1_4
+      "tbz x16, #0, 298f\n"
+      "st1 { v8.h }[4], [x13]\n"
+      "st1 { v12.h }[4], [x9]\n"
+      "st1 { v16.h }[4], [x27]\n"
+      "st1 { v20.h }[4], [x25]\n"
+      "st1 { v24.h }[4], [x23]\n"
+      "st1 { v28.h }[4], [x21]\n"
+      "b 298f\n"
+      "296:"  // Height 6: Partial direct writeback: partial_2_0
+      "tbz x16, #1, 297f\n"
+      "str s8, [x13], #0x4\n"
+      "str s12, [x9], #0x4\n"
+      "str s16, [x27], #0x4\n"
+      "str s20, [x25], #0x4\n"
+      "str s24, [x23], #0x4\n"
+      "str s28, [x21], #0x4\n"
+      "tbz x16, #0, 298f\n"
+      "st1 { v8.h }[2], [x13]\n"
+      "st1 { v12.h }[2], [x9]\n"
+      "st1 { v16.h }[2], [x27]\n"
+      "st1 { v20.h }[2], [x25]\n"
+      "st1 { v24.h }[2], [x23]\n"
+      "st1 { v28.h }[2], [x21]\n"
+      "b 298f\n"
+      "297:"  // Height 6: Partial direct writeback: partial_1_0
+      "str h8, [x13, #0x0]\n"
+      "str h12, [x9, #0x0]\n"
+      "str h16, [x27, #0x0]\n"
+      "str h20, [x25, #0x0]\n"
+      "str h24, [x23, #0x0]\n"
+      "str h28, [x21, #0x0]\n"
+      "298:"  // Height 6: Partial direct writeback: Done
+      "b 300f\n"
+      "299:"  // Height 6: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "str q12, [x9, #0x0]\n"
+      "str q13, [x9, #0x10]\n"
+      "str q14, [x9, #0x20]\n"
+      "str q15, [x9, #0x30]\n"
+      "str q16, [x27, #0x0]\n"
+      "str q17, [x27, #0x10]\n"
+      "str q18, [x27, #0x20]\n"
+      "str q19, [x27, #0x30]\n"
+      "str q20, [x25, #0x0]\n"
+      "str q21, [x25, #0x10]\n"
+      "str q22, [x25, #0x20]\n"
+      "str q23, [x25, #0x30]\n"
+      "str q24, [x23, #0x0]\n"
+      "str q25, [x23, #0x10]\n"
+      "str q26, [x23, #0x20]\n"
+      "str q27, [x23, #0x30]\n"
+      "str q28, [x21, #0x0]\n"
+      "str q29, [x21, #0x10]\n"
+      "str q30, [x21, #0x20]\n"
+      "str q31, [x21, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "add x9, x9, #0x40\n"
+      "add x27, x27, #0x40\n"
+      "add x25, x25, #0x40\n"
+      "add x23, x23, #0x40\n"
+      "add x21, x21, #0x40\n"
+      "300:"  // Height 6: Writeback done
+      "subs x16, x16, #0x20\n"
+      "bgt 253b\n"
+      "subs %x[M], %x[M], #0x6\n"
+      "beq 302f\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "tbz %x[flags], #3, 301f\n"
+      "add x20, x20, #0x6\n"
+      "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "b 1b\n"
+      "301:"  // Update direct input
+      "mov x19, #0xc\n"
+      "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+      "b 1b\n"
+      "302:"  // Exit
+
+      : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+      : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+    );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp
deleted file mode 100644
index 94fcd1064e..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp
+++ /dev/null
@@ -1,2427 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void a64_hybrid_fp32_mla_16x4_a55(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
-    const int K_stride = K;
-    const long loops_count = ((K + 4) / 8) - 1;
-    K -= loops_count * 8;
-    const long regs_count = (K / 4) - 1;
-    K -= (regs_count + 1) * 4;
-    const long blocks_count = K / 1;
-    float nullbias[16];
-    if (!accumulate && !bias) {
-        memset(nullbias, 0, (16 * sizeof(float)));
-    }
-    float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
-    float maxval =   static_cast<float>(std::numeric_limits<float>::infinity());
-    const float * const minptr = &minval;
-    const float * const maxptr = &maxval;
-
-    switch(act.type)
-    {
-        default:
-        case Activation::Type::None:
-            break;
-        case Activation::Type::BoundedReLU:
-            maxval = static_cast<float>(act.param1);
-            /* fall through */
-        case Activation::Type::ReLU:
-            minval = 0.0f;
-            break;
-    }
-
-    int rows_to_compute;
-
-    for (int y=0; y<M; y+=rows_to_compute) {
-        const float * const a_ptr0_base = A + (y * lda);
-        const unsigned long ldab = lda * sizeof(float);
-
-        float *c_ptr0 = C + (y * ldc);
-
-        rows_to_compute = M-y;
-        if (rows_to_compute > 4) {
-            if (rows_to_compute % 4) {
-                rows_to_compute = 4 - 1;
-            } else {
-                rows_to_compute = 4;
-            }
-        }
-
-        for (int x0=0; x0<N; x0+=16ul) {
-            const long width = std::min((unsigned long)N-x0, 16ul);
-            long loops = loops_count;
-            long regs = regs_count;
-            long blocks = blocks_count;
-            const float *a_ptr0 = a_ptr0_base;
-            const float *b_ptr0 = B + (K_stride * x0);
-            const bool use_result_buffer = (width < 16);
-            float result_buffer[64];
-            const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(float);
-            float *c_ptr_real = c_ptr0;
-            if (use_result_buffer && accumulate) {
-                for(int cy=0; cy<std::min(M-y, 4); cy++) {
-                    for(unsigned int cx=0; cx<width; cx++) {
-                        result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
-                    }
-                }
-            }
-            if (use_result_buffer) {
-                c_ptr0 = result_buffer;
-            }
-            const float *biasptr = bias ? bias+x0 : nullbias;
-
-            switch(rows_to_compute) {
-                case 1:
-                    __asm __volatile (
-                        "temploadreg0 .req X0\n"
-                        "temploadreg1 .req X1\n"
-                        "temploadreg2 .req X2\n"
-                        "temploadreg3 .req X3\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ldr q16, [%[biasptr]]\n"
-                        "ldr q17, [%[biasptr], #0x10]\n"
-                        "ldr q18, [%[biasptr], #0x20]\n"
-                        "ldr q19, [%[biasptr], #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "ldr d0, [%[a_ptr0], #-0x10]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "fmla v16.4s, v12.4s, v4.s[1]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "fmla v17.4s, v13.4s, v4.s[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v18.4s, v14.4s, v4.s[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "fmla v19.4s, v15.4s, v4.s[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        "fmla v16.4s, v12.4s, v4.s[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "fmla v17.4s, v13.4s, v4.s[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v18.4s, v14.4s, v4.s[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "fmla v19.4s, v15.4s, v4.s[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "cbz %[regs], 4f\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "fmla v16.4s, v12.4s, v4.s[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "fmla v17.4s, v13.4s, v4.s[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v18.4s, v14.4s, v4.s[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "fmla v19.4s, v15.4s, v4.s[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "fmla v16.4s, v12.4s, v4.s[3]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v17.4s, v13.4s, v4.s[3]\n"
-                        "fmla v18.4s, v14.4s, v4.s[3]\n"
-                        "fmla v19.4s, v15.4s, v4.s[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "5:\n"
-                        "cbz %[blocks], 6f\n"
-                        "7:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "b.ne 7b\n"
-                        "6:\n"
-                        "ld1r {v14.4s}, [%[minptr]]\n"
-                        "ld1r {v15.4s}, [%[maxptr]]\n"
-                        "fmax v16.4s, v16.4s, v14.4s\n"
-                        "fmax v17.4s, v17.4s, v14.4s\n"
-                        "fmax v18.4s, v18.4s, v14.4s\n"
-                        "fmax v19.4s, v19.4s, v14.4s\n"
-                        "fmin v16.4s, v16.4s, v15.4s\n"
-                        "fmin v17.4s, v17.4s, v15.4s\n"
-                        "fmin v18.4s, v18.4s, v15.4s\n"
-                        "fmin v19.4s, v19.4s, v15.4s\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
-                    );
-                    break;
-                case 2:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "temploadreg0 .req X2\n"
-                        "temploadreg1 .req X3\n"
-                        "temploadreg2 .req X4\n"
-                        "temploadreg3 .req X5\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ldr q16, [%[biasptr]]\n"
-                        "ldr q17, [%[biasptr], #0x10]\n"
-                        "ldr q18, [%[biasptr], #0x20]\n"
-                        "ldr q19, [%[biasptr], #0x30]\n"
-                        "mov v20.16b, v16.16b\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mov v21.16b, v17.16b\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mov v22.16b, v18.16b\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "mov v23.16b, v19.16b\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        "fmla v20.4s, v12.4s, v1.s[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "fmla v21.4s, v13.4s, v1.s[1]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "fmla v22.4s, v14.4s, v1.s[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "fmla v23.4s, v15.4s, v1.s[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v20.4s, v12.4s, v1.s[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v21.4s, v13.4s, v1.s[3]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "fmla v22.4s, v14.4s, v1.s[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "fmla v23.4s, v15.4s, v1.s[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "fmla v20.4s, v8.4s, v5.s[0]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "fmla v21.4s, v9.4s, v5.s[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "fmla v22.4s, v10.4s, v5.s[0]\n"
-                        "ldr d0, [%[a_ptr0], #-0x10]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
-                        "fmla v23.4s, v11.4s, v5.s[0]\n"
-                        "ldr d1, [a_ptr1, #-0x10]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "fmla v16.4s, v12.4s, v4.s[1]\n"
-                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
-                        "fmla v20.4s, v12.4s, v5.s[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        "fmla v17.4s, v13.4s, v4.s[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "fmla v21.4s, v13.4s, v5.s[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v18.4s, v14.4s, v4.s[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "fmla v22.4s, v14.4s, v5.s[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "fmla v19.4s, v15.4s, v4.s[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "fmla v23.4s, v15.4s, v5.s[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "fmla v20.4s, v8.4s, v5.s[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "fmla v21.4s, v9.4s, v5.s[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "fmla v22.4s, v10.4s, v5.s[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v23.4s, v11.4s, v5.s[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        "fmla v16.4s, v12.4s, v4.s[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v20.4s, v12.4s, v5.s[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "fmla v17.4s, v13.4s, v4.s[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v21.4s, v13.4s, v5.s[3]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v18.4s, v14.4s, v4.s[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "fmla v22.4s, v14.4s, v5.s[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "fmla v19.4s, v15.4s, v4.s[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "fmla v23.4s, v15.4s, v5.s[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "cbz %[regs], 4f\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        "fmla v20.4s, v12.4s, v1.s[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "fmla v21.4s, v13.4s, v1.s[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "fmla v22.4s, v14.4s, v1.s[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "fmla v23.4s, v15.4s, v1.s[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v20.4s, v12.4s, v1.s[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v21.4s, v13.4s, v1.s[3]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "fmla v22.4s, v14.4s, v1.s[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "fmla v23.4s, v15.4s, v1.s[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "fmla v20.4s, v8.4s, v5.s[0]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "fmla v21.4s, v9.4s, v5.s[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "fmla v22.4s, v10.4s, v5.s[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "fmla v23.4s, v11.4s, v5.s[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "fmla v16.4s, v12.4s, v4.s[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "fmla v20.4s, v12.4s, v5.s[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "fmla v17.4s, v13.4s, v4.s[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "fmla v21.4s, v13.4s, v5.s[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v18.4s, v14.4s, v4.s[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "fmla v22.4s, v14.4s, v5.s[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "fmla v19.4s, v15.4s, v4.s[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "fmla v23.4s, v15.4s, v5.s[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "fmla v20.4s, v8.4s, v5.s[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "fmla v21.4s, v9.4s, v5.s[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "fmla v22.4s, v10.4s, v5.s[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "fmla v23.4s, v11.4s, v5.s[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v16.4s, v12.4s, v4.s[3]\n"
-                        "fmla v20.4s, v12.4s, v5.s[3]\n"
-                        "fmla v17.4s, v13.4s, v4.s[3]\n"
-                        "fmla v21.4s, v13.4s, v5.s[3]\n"
-                        "fmla v18.4s, v14.4s, v4.s[3]\n"
-                        "fmla v22.4s, v14.4s, v5.s[3]\n"
-                        "fmla v19.4s, v15.4s, v4.s[3]\n"
-                        "fmla v23.4s, v15.4s, v5.s[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "fmla v20.4s, v12.4s, v1.s[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "fmla v21.4s, v13.4s, v1.s[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "fmla v22.4s, v14.4s, v1.s[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "fmla v23.4s, v15.4s, v1.s[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "fmla v20.4s, v12.4s, v1.s[3]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "fmla v21.4s, v13.4s, v1.s[3]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "fmla v22.4s, v14.4s, v1.s[3]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "fmla v23.4s, v15.4s, v1.s[3]\n"
-                        "5:\n"
-                        "cbz %[blocks], 6f\n"
-                        "7:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr s1, [a_ptr1]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "b.ne 7b\n"
-                        "6:\n"
-                        "ld1r {v14.4s}, [%[minptr]]\n"
-                        "ld1r {v15.4s}, [%[maxptr]]\n"
-                        "fmax v16.4s, v16.4s, v14.4s\n"
-                        "fmax v17.4s, v17.4s, v14.4s\n"
-                        "fmax v18.4s, v18.4s, v14.4s\n"
-                        "fmax v19.4s, v19.4s, v14.4s\n"
-                        "fmin v16.4s, v16.4s, v15.4s\n"
-                        "fmin v17.4s, v17.4s, v15.4s\n"
-                        "fmin v18.4s, v18.4s, v15.4s\n"
-                        "fmin v19.4s, v19.4s, v15.4s\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "fmax v20.4s, v20.4s, v14.4s\n"
-                        "fmax v21.4s, v21.4s, v14.4s\n"
-                        "fmax v22.4s, v22.4s, v14.4s\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "fmax v23.4s, v23.4s, v14.4s\n"
-                        "fmin v20.4s, v20.4s, v15.4s\n"
-                        "fmin v21.4s, v21.4s, v15.4s\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "fmin v22.4s, v22.4s, v15.4s\n"
-                        "fmin v23.4s, v23.4s, v15.4s\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
-                    );
-                    break;
-                case 3:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "temploadreg0 .req X4\n"
-                        "temploadreg1 .req X5\n"
-                        "temploadreg2 .req X6\n"
-                        "temploadreg3 .req X7\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ldr q16, [%[biasptr]]\n"
-                        "ldr q17, [%[biasptr], #0x10]\n"
-                        "ldr q18, [%[biasptr], #0x20]\n"
-                        "ldr q19, [%[biasptr], #0x30]\n"
-                        "mov v20.16b, v16.16b\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mov v21.16b, v17.16b\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mov v22.16b, v18.16b\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mov v23.16b, v19.16b\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "mov v24.16b, v16.16b\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "mov v25.16b, v17.16b\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "mov v26.16b, v18.16b\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "mov v27.16b, v19.16b\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "fmla v20.4s, v12.4s, v1.s[1]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        "fmla v24.4s, v12.4s, v2.s[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "fmla v21.4s, v13.4s, v1.s[1]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        "fmla v25.4s, v13.4s, v2.s[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "fmla v22.4s, v14.4s, v1.s[1]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v26.4s, v14.4s, v2.s[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "fmla v23.4s, v15.4s, v1.s[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "fmla v27.4s, v15.4s, v2.s[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "fmla v24.4s, v8.4s, v2.s[2]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "fmla v25.4s, v9.4s, v2.s[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "fmla v26.4s, v10.4s, v2.s[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v27.4s, v11.4s, v2.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v20.4s, v12.4s, v1.s[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        "fmla v24.4s, v12.4s, v2.s[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "fmla v21.4s, v13.4s, v1.s[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v25.4s, v13.4s, v2.s[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v22.4s, v14.4s, v1.s[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "fmla v26.4s, v14.4s, v2.s[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "fmla v23.4s, v15.4s, v1.s[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "fmla v27.4s, v15.4s, v2.s[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "fmla v20.4s, v8.4s, v5.s[0]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "fmla v24.4s, v8.4s, v6.s[0]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "fmla v21.4s, v9.4s, v5.s[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "fmla v25.4s, v9.4s, v6.s[0]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "fmla v22.4s, v10.4s, v5.s[0]\n"
-                        "ldr d0, [%[a_ptr0], #-0x10]\n"
-                        "fmla v26.4s, v10.4s, v6.s[0]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
-                        "fmla v23.4s, v11.4s, v5.s[0]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "fmla v27.4s, v11.4s, v6.s[0]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "fmla v16.4s, v12.4s, v4.s[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "fmla v20.4s, v12.4s, v5.s[1]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        "fmla v24.4s, v12.4s, v6.s[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "fmla v17.4s, v13.4s, v4.s[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "fmla v21.4s, v13.4s, v5.s[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "fmla v25.4s, v13.4s, v6.s[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "fmla v18.4s, v14.4s, v4.s[1]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v22.4s, v14.4s, v5.s[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "fmla v26.4s, v14.4s, v6.s[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla v19.4s, v15.4s, v4.s[1]\n"
-                        "ldr d1, [a_ptr1, #-0x10]\n"
-                        "fmla v23.4s, v15.4s, v5.s[1]\n"
-                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
-                        "fmla v27.4s, v15.4s, v6.s[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "fmla v20.4s, v8.4s, v5.s[2]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        "fmla v24.4s, v8.4s, v6.s[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "ldr d2, [a_ptr2, #-0x10]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        "ldr temploadreg2, [a_ptr2, #-0x8]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "ins v2.d[1], temploadreg2\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "fmla v21.4s, v9.4s, v5.s[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "fmla v25.4s, v9.4s, v6.s[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v23.4s, v11.4s, v5.s[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "fmla v27.4s, v11.4s, v6.s[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "fmla v16.4s, v12.4s, v4.s[3]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "fmla v22.4s, v10.4s, v5.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v26.4s, v10.4s, v6.s[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v20.4s, v12.4s, v5.s[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        "fmla v24.4s, v12.4s, v6.s[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v17.4s, v13.4s, v4.s[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "fmla v21.4s, v13.4s, v5.s[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v25.4s, v13.4s, v6.s[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "fmla v18.4s, v14.4s, v4.s[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v22.4s, v14.4s, v5.s[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "fmla v26.4s, v14.4s, v6.s[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "fmla v19.4s, v15.4s, v4.s[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "fmla v23.4s, v15.4s, v5.s[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "fmla v27.4s, v15.4s, v6.s[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "cbz %[regs], 4f\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "fmla v20.4s, v12.4s, v1.s[1]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        "fmla v24.4s, v12.4s, v2.s[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "fmla v21.4s, v13.4s, v1.s[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "fmla v25.4s, v13.4s, v2.s[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "fmla v22.4s, v14.4s, v1.s[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "fmla v26.4s, v14.4s, v2.s[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "fmla v23.4s, v15.4s, v1.s[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "fmla v27.4s, v15.4s, v2.s[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "fmla v24.4s, v8.4s, v2.s[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "fmla v25.4s, v9.4s, v2.s[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "fmla v26.4s, v10.4s, v2.s[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v27.4s, v11.4s, v2.s[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v20.4s, v12.4s, v1.s[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "fmla v24.4s, v12.4s, v2.s[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "fmla v21.4s, v13.4s, v1.s[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v25.4s, v13.4s, v2.s[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "fmla v22.4s, v14.4s, v1.s[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "fmla v26.4s, v14.4s, v2.s[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "fmla v23.4s, v15.4s, v1.s[3]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "fmla v27.4s, v15.4s, v2.s[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "fmla v20.4s, v8.4s, v5.s[0]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v24.4s, v8.4s, v6.s[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "fmla v21.4s, v9.4s, v5.s[0]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "fmla v25.4s, v9.4s, v6.s[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "fmla v22.4s, v10.4s, v5.s[0]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "fmla v26.4s, v10.4s, v6.s[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "fmla v23.4s, v11.4s, v5.s[0]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "fmla v27.4s, v11.4s, v6.s[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "fmla v16.4s, v12.4s, v4.s[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "fmla v20.4s, v12.4s, v5.s[1]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "fmla v24.4s, v12.4s, v6.s[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "fmla v17.4s, v13.4s, v4.s[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "fmla v21.4s, v13.4s, v5.s[1]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v25.4s, v13.4s, v6.s[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "fmla v18.4s, v14.4s, v4.s[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "fmla v22.4s, v14.4s, v5.s[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "fmla v26.4s, v14.4s, v6.s[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "fmla v19.4s, v15.4s, v4.s[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "fmla v23.4s, v15.4s, v5.s[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "fmla v27.4s, v15.4s, v6.s[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "fmla v20.4s, v8.4s, v5.s[2]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v24.4s, v8.4s, v6.s[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "fmla v21.4s, v9.4s, v5.s[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "fmla v25.4s, v9.4s, v6.s[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "fmla v22.4s, v10.4s, v5.s[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "fmla v26.4s, v10.4s, v6.s[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v23.4s, v11.4s, v5.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "fmla v27.4s, v11.4s, v6.s[2]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "fmla v16.4s, v12.4s, v4.s[3]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "fmla v20.4s, v12.4s, v5.s[3]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "fmla v24.4s, v12.4s, v6.s[3]\n"
-                        "fmla v17.4s, v13.4s, v4.s[3]\n"
-                        "fmla v21.4s, v13.4s, v5.s[3]\n"
-                        "fmla v25.4s, v13.4s, v6.s[3]\n"
-                        "fmla v18.4s, v14.4s, v4.s[3]\n"
-                        "fmla v22.4s, v14.4s, v5.s[3]\n"
-                        "fmla v26.4s, v14.4s, v6.s[3]\n"
-                        "fmla v19.4s, v15.4s, v4.s[3]\n"
-                        "fmla v23.4s, v15.4s, v5.s[3]\n"
-                        "fmla v27.4s, v15.4s, v6.s[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v20.4s, v12.4s, v1.s[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "fmla v24.4s, v12.4s, v2.s[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "fmla v21.4s, v13.4s, v1.s[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "fmla v25.4s, v13.4s, v2.s[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "fmla v22.4s, v14.4s, v1.s[1]\n"
-                        "fmla v26.4s, v14.4s, v2.s[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "fmla v23.4s, v15.4s, v1.s[1]\n"
-                        "fmla v27.4s, v15.4s, v2.s[1]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "fmla v24.4s, v8.4s, v2.s[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "fmla v25.4s, v9.4s, v2.s[2]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "fmla v26.4s, v10.4s, v2.s[2]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "fmla v27.4s, v11.4s, v2.s[2]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "fmla v20.4s, v12.4s, v1.s[3]\n"
-                        "fmla v24.4s, v12.4s, v2.s[3]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "fmla v21.4s, v13.4s, v1.s[3]\n"
-                        "fmla v25.4s, v13.4s, v2.s[3]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "fmla v22.4s, v14.4s, v1.s[3]\n"
-                        "fmla v26.4s, v14.4s, v2.s[3]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "fmla v23.4s, v15.4s, v1.s[3]\n"
-                        "fmla v27.4s, v15.4s, v2.s[3]\n"
-                        "5:\n"
-                        "cbz %[blocks], 6f\n"
-                        "7:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr s1, [a_ptr1]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr s2, [a_ptr2]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "add a_ptr2, a_ptr2, #0x4\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "b.ne 7b\n"
-                        "6:\n"
-                        "ld1r {v14.4s}, [%[minptr]]\n"
-                        "ld1r {v15.4s}, [%[maxptr]]\n"
-                        "fmax v16.4s, v16.4s, v14.4s\n"
-                        "fmax v17.4s, v17.4s, v14.4s\n"
-                        "fmax v18.4s, v18.4s, v14.4s\n"
-                        "fmax v19.4s, v19.4s, v14.4s\n"
-                        "fmin v16.4s, v16.4s, v15.4s\n"
-                        "fmin v17.4s, v17.4s, v15.4s\n"
-                        "fmin v18.4s, v18.4s, v15.4s\n"
-                        "fmin v19.4s, v19.4s, v15.4s\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "fmax v20.4s, v20.4s, v14.4s\n"
-                        "fmax v21.4s, v21.4s, v14.4s\n"
-                        "fmax v22.4s, v22.4s, v14.4s\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "fmax v23.4s, v23.4s, v14.4s\n"
-                        "fmin v20.4s, v20.4s, v15.4s\n"
-                        "fmin v21.4s, v21.4s, v15.4s\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "fmin v22.4s, v22.4s, v15.4s\n"
-                        "fmin v23.4s, v23.4s, v15.4s\n"
-                        "fmax v24.4s, v24.4s, v14.4s\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "fmax v25.4s, v25.4s, v14.4s\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "fmax v26.4s, v26.4s, v14.4s\n"
-                        "str q20, [c_ptr1]\n"
-                        "fmin v24.4s, v24.4s, v15.4s\n"
-                        "fmin v25.4s, v25.4s, v15.4s\n"
-                        "fmax v27.4s, v27.4s, v14.4s\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "fmin v26.4s, v26.4s, v15.4s\n"
-                        "fmin v27.4s, v27.4s, v15.4s\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "str q24, [c_ptr2]\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
-                    );
-                    break;
-                default:
-                case 4:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "temploadreg0 .req X6\n"
-                        "temploadreg1 .req X7\n"
-                        "temploadreg2 .req X8\n"
-                        "temploadreg3 .req X9\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ldr q16, [%[biasptr]]\n"
-                        "ldr q17, [%[biasptr], #0x10]\n"
-                        "ldr q18, [%[biasptr], #0x20]\n"
-                        "ldr q19, [%[biasptr], #0x30]\n"
-                        "mov v20.16b, v16.16b\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mov v21.16b, v17.16b\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mov v22.16b, v18.16b\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mov v23.16b, v19.16b\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "mov v24.16b, v16.16b\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "mov v25.16b, v17.16b\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "mov v26.16b, v18.16b\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "mov v27.16b, v19.16b\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "mov v28.16b, v16.16b\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "mov v29.16b, v17.16b\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "mov v30.16b, v18.16b\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "mov v31.16b, v19.16b\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "ldr q28, [c_ptr3]\n"
-                        "ldr q29, [c_ptr3, #0x10]\n"
-                        "ldr q30, [c_ptr3, #0x20]\n"
-                        "ldr q31, [c_ptr3, #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        "fmla v28.4s, v8.4s, v3.s[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        "fmla v29.4s, v9.4s, v3.s[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr d7, [a_ptr3]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
-                        "fmla v30.4s, v10.4s, v3.s[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "fmla v31.4s, v11.4s, v3.s[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "fmla v20.4s, v12.4s, v1.s[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "fmla v24.4s, v12.4s, v2.s[1]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        "fmla v28.4s, v12.4s, v3.s[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "fmla v21.4s, v13.4s, v1.s[1]\n"
-                        "ins v7.d[1], temploadreg3\n"
-                        "fmla v25.4s, v13.4s, v2.s[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "fmla v29.4s, v13.4s, v3.s[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "fmla v22.4s, v14.4s, v1.s[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "fmla v26.4s, v14.4s, v2.s[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "fmla v30.4s, v14.4s, v3.s[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "fmla v23.4s, v15.4s, v1.s[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "fmla v27.4s, v15.4s, v2.s[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v31.4s, v15.4s, v3.s[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "fmla v24.4s, v8.4s, v2.s[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "fmla v28.4s, v8.4s, v3.s[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "fmla v25.4s, v9.4s, v2.s[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v29.4s, v9.4s, v3.s[2]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v26.4s, v10.4s, v2.s[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v30.4s, v10.4s, v3.s[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "fmla v27.4s, v11.4s, v2.s[2]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v31.4s, v11.4s, v3.s[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v20.4s, v12.4s, v1.s[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "fmla v24.4s, v12.4s, v2.s[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "fmla v28.4s, v12.4s, v3.s[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "fmla v21.4s, v13.4s, v1.s[3]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "fmla v25.4s, v13.4s, v2.s[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "fmla v29.4s, v13.4s, v3.s[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v22.4s, v14.4s, v1.s[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "fmla v26.4s, v14.4s, v2.s[3]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "fmla v30.4s, v14.4s, v3.s[3]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "fmla v23.4s, v15.4s, v1.s[3]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "fmla v27.4s, v15.4s, v2.s[3]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "fmla v31.4s, v15.4s, v3.s[3]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "fmla v20.4s, v8.4s, v5.s[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla v24.4s, v8.4s, v6.s[0]\n"
-                        "ldr d0, [%[a_ptr0], #-0x10]\n"
-                        "fmla v28.4s, v8.4s, v7.s[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v21.4s, v9.4s, v5.s[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "fmla v25.4s, v9.4s, v6.s[0]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        "fmla v29.4s, v9.4s, v7.s[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "fmla v22.4s, v10.4s, v5.s[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla v26.4s, v10.4s, v6.s[0]\n"
-                        "ldr d1, [a_ptr1, #-0x10]\n"
-                        "fmla v30.4s, v10.4s, v7.s[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "fmla v23.4s, v11.4s, v5.s[0]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "fmla v27.4s, v11.4s, v6.s[0]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        "fmla v31.4s, v11.4s, v7.s[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "fmla v16.4s, v12.4s, v4.s[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "fmla v20.4s, v12.4s, v5.s[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "fmla v24.4s, v12.4s, v6.s[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "fmla v28.4s, v12.4s, v7.s[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v13.4s, v4.s[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "fmla v21.4s, v13.4s, v5.s[1]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "fmla v25.4s, v13.4s, v6.s[1]\n"
-                        "ldr d2, [a_ptr2, #-0x10]\n"
-                        "fmla v29.4s, v13.4s, v7.s[1]\n"
-                        "ldr temploadreg2, [a_ptr2, #-0x8]\n"
-                        "fmla v18.4s, v14.4s, v4.s[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "fmla v22.4s, v14.4s, v5.s[1]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "fmla v26.4s, v14.4s, v6.s[1]\n"
-                        "ins v2.d[1], temploadreg2\n"
-                        "fmla v30.4s, v14.4s, v7.s[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "fmla v19.4s, v15.4s, v4.s[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "fmla v23.4s, v15.4s, v5.s[1]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "fmla v27.4s, v15.4s, v6.s[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v31.4s, v15.4s, v7.s[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "fmla v20.4s, v8.4s, v5.s[2]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        "fmla v24.4s, v8.4s, v6.s[2]\n"
-                        "ldr d3, [a_ptr3, #-0x10]\n"
-                        "fmla v28.4s, v8.4s, v7.s[2]\n"
-                        "ldr temploadreg3, [a_ptr3, #-0x8]\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "fmla v21.4s, v9.4s, v5.s[2]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "fmla v25.4s, v9.4s, v6.s[2]\n"
-                        "ins v3.d[1], temploadreg3\n"
-                        "fmla v29.4s, v9.4s, v7.s[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        "fmla v22.4s, v10.4s, v5.s[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "fmla v26.4s, v10.4s, v6.s[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "fmla v30.4s, v10.4s, v7.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v23.4s, v11.4s, v5.s[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        "fmla v27.4s, v11.4s, v6.s[2]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v31.4s, v11.4s, v7.s[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "fmla v16.4s, v12.4s, v4.s[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v20.4s, v12.4s, v5.s[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "fmla v24.4s, v12.4s, v6.s[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v28.4s, v12.4s, v7.s[3]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v17.4s, v13.4s, v4.s[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "fmla v21.4s, v13.4s, v5.s[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "fmla v25.4s, v13.4s, v6.s[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "fmla v29.4s, v13.4s, v7.s[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "fmla v18.4s, v14.4s, v4.s[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "fmla v22.4s, v14.4s, v5.s[3]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "fmla v26.4s, v14.4s, v6.s[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "fmla v30.4s, v14.4s, v7.s[3]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "fmla v19.4s, v15.4s, v4.s[3]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v23.4s, v15.4s, v5.s[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "fmla v27.4s, v15.4s, v6.s[3]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "fmla v31.4s, v15.4s, v7.s[3]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "prfm PSTL1KEEP, [c_ptr3]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "cbz %[regs], 4f\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        "fmla v28.4s, v8.4s, v3.s[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "ldr d7, [a_ptr3]\n"
-                        "fmla v29.4s, v9.4s, v3.s[0]\n"
-                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "fmla v30.4s, v10.4s, v3.s[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "fmla v31.4s, v11.4s, v3.s[0]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "fmla v20.4s, v12.4s, v1.s[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "fmla v24.4s, v12.4s, v2.s[1]\n"
-                        "ins v7.d[1], temploadreg3\n"
-                        "fmla v28.4s, v12.4s, v3.s[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "fmla v21.4s, v13.4s, v1.s[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "fmla v25.4s, v13.4s, v2.s[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "fmla v29.4s, v13.4s, v3.s[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "fmla v22.4s, v14.4s, v1.s[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "fmla v26.4s, v14.4s, v2.s[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v30.4s, v14.4s, v3.s[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "fmla v23.4s, v15.4s, v1.s[1]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "fmla v27.4s, v15.4s, v2.s[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "fmla v31.4s, v15.4s, v3.s[1]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "fmla v24.4s, v8.4s, v2.s[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "fmla v28.4s, v8.4s, v3.s[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v25.4s, v9.4s, v2.s[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        "fmla v29.4s, v9.4s, v3.s[2]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "fmla v26.4s, v10.4s, v2.s[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "fmla v30.4s, v10.4s, v3.s[2]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "fmla v27.4s, v11.4s, v2.s[2]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "fmla v31.4s, v11.4s, v3.s[2]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "fmla v20.4s, v12.4s, v1.s[3]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v24.4s, v12.4s, v2.s[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "fmla v28.4s, v12.4s, v3.s[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "fmla v21.4s, v13.4s, v1.s[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "fmla v25.4s, v13.4s, v2.s[3]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "fmla v29.4s, v13.4s, v3.s[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "fmla v22.4s, v14.4s, v1.s[3]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "fmla v26.4s, v14.4s, v2.s[3]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "fmla v30.4s, v14.4s, v3.s[3]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "fmla v23.4s, v15.4s, v1.s[3]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "fmla v27.4s, v15.4s, v2.s[3]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "fmla v31.4s, v15.4s, v3.s[3]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "fmla v20.4s, v8.4s, v5.s[0]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "fmla v24.4s, v8.4s, v6.s[0]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v28.4s, v8.4s, v7.s[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "fmla v21.4s, v9.4s, v5.s[0]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "fmla v25.4s, v9.4s, v6.s[0]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "fmla v29.4s, v9.4s, v7.s[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "fmla v22.4s, v10.4s, v5.s[0]\n"
-                        "fmla v26.4s, v10.4s, v6.s[0]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "fmla v30.4s, v10.4s, v7.s[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "fmla v23.4s, v11.4s, v5.s[0]\n"
-                        "fmla v27.4s, v11.4s, v6.s[0]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v31.4s, v11.4s, v7.s[0]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v4.s[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "fmla v20.4s, v12.4s, v5.s[1]\n"
-                        "fmla v24.4s, v12.4s, v6.s[1]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "fmla v28.4s, v12.4s, v7.s[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v13.4s, v4.s[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "fmla v21.4s, v13.4s, v5.s[1]\n"
-                        "fmla v25.4s, v13.4s, v6.s[1]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "fmla v29.4s, v13.4s, v7.s[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v14.4s, v4.s[1]\n"
-                        "fmla v22.4s, v14.4s, v5.s[1]\n"
-                        "fmla v26.4s, v14.4s, v6.s[1]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "fmla v30.4s, v14.4s, v7.s[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v15.4s, v4.s[1]\n"
-                        "fmla v23.4s, v15.4s, v5.s[1]\n"
-                        "fmla v27.4s, v15.4s, v6.s[1]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "fmla v31.4s, v15.4s, v7.s[1]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "fmla v20.4s, v8.4s, v5.s[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v24.4s, v8.4s, v6.s[2]\n"
-                        "fmla v28.4s, v8.4s, v7.s[2]\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "fmla v21.4s, v9.4s, v5.s[2]\n"
-                        "fmla v25.4s, v9.4s, v6.s[2]\n"
-                        "fmla v29.4s, v9.4s, v7.s[2]\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "fmla v22.4s, v10.4s, v5.s[2]\n"
-                        "fmla v26.4s, v10.4s, v6.s[2]\n"
-                        "fmla v30.4s, v10.4s, v7.s[2]\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "fmla v23.4s, v11.4s, v5.s[2]\n"
-                        "fmla v27.4s, v11.4s, v6.s[2]\n"
-                        "fmla v31.4s, v11.4s, v7.s[2]\n"
-                        "fmla v16.4s, v12.4s, v4.s[3]\n"
-                        "fmla v20.4s, v12.4s, v5.s[3]\n"
-                        "fmla v24.4s, v12.4s, v6.s[3]\n"
-                        "fmla v28.4s, v12.4s, v7.s[3]\n"
-                        "fmla v17.4s, v13.4s, v4.s[3]\n"
-                        "fmla v21.4s, v13.4s, v5.s[3]\n"
-                        "fmla v25.4s, v13.4s, v6.s[3]\n"
-                        "fmla v29.4s, v13.4s, v7.s[3]\n"
-                        "fmla v18.4s, v14.4s, v4.s[3]\n"
-                        "fmla v22.4s, v14.4s, v5.s[3]\n"
-                        "fmla v26.4s, v14.4s, v6.s[3]\n"
-                        "fmla v30.4s, v14.4s, v7.s[3]\n"
-                        "fmla v19.4s, v15.4s, v4.s[3]\n"
-                        "fmla v23.4s, v15.4s, v5.s[3]\n"
-                        "fmla v27.4s, v15.4s, v6.s[3]\n"
-                        "fmla v31.4s, v15.4s, v7.s[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "fmla v28.4s, v8.4s, v3.s[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "fmla v29.4s, v9.4s, v3.s[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "fmla v30.4s, v10.4s, v3.s[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "fmla v31.4s, v11.4s, v3.s[0]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "fmla v20.4s, v12.4s, v1.s[1]\n"
-                        "fmla v24.4s, v12.4s, v2.s[1]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "fmla v28.4s, v12.4s, v3.s[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "fmla v21.4s, v13.4s, v1.s[1]\n"
-                        "fmla v25.4s, v13.4s, v2.s[1]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "fmla v29.4s, v13.4s, v3.s[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "fmla v22.4s, v14.4s, v1.s[1]\n"
-                        "fmla v26.4s, v14.4s, v2.s[1]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "fmla v30.4s, v14.4s, v3.s[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "fmla v23.4s, v15.4s, v1.s[1]\n"
-                        "fmla v27.4s, v15.4s, v2.s[1]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "fmla v31.4s, v15.4s, v3.s[1]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "fmla v24.4s, v8.4s, v2.s[2]\n"
-                        "fmla v28.4s, v8.4s, v3.s[2]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "fmla v25.4s, v9.4s, v2.s[2]\n"
-                        "fmla v29.4s, v9.4s, v3.s[2]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "fmla v26.4s, v10.4s, v2.s[2]\n"
-                        "fmla v30.4s, v10.4s, v3.s[2]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "fmla v27.4s, v11.4s, v2.s[2]\n"
-                        "fmla v31.4s, v11.4s, v3.s[2]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "fmla v20.4s, v12.4s, v1.s[3]\n"
-                        "fmla v24.4s, v12.4s, v2.s[3]\n"
-                        "fmla v28.4s, v12.4s, v3.s[3]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "fmla v21.4s, v13.4s, v1.s[3]\n"
-                        "fmla v25.4s, v13.4s, v2.s[3]\n"
-                        "fmla v29.4s, v13.4s, v3.s[3]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "fmla v22.4s, v14.4s, v1.s[3]\n"
-                        "fmla v26.4s, v14.4s, v2.s[3]\n"
-                        "fmla v30.4s, v14.4s, v3.s[3]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "fmla v23.4s, v15.4s, v1.s[3]\n"
-                        "fmla v27.4s, v15.4s, v2.s[3]\n"
-                        "fmla v31.4s, v15.4s, v3.s[3]\n"
-                        "5:\n"
-                        "cbz %[blocks], 6f\n"
-                        "7:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr s1, [a_ptr1]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr s2, [a_ptr2]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "add a_ptr2, a_ptr2, #0x4\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "ldr s3, [a_ptr3]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "add a_ptr3, a_ptr3, #0x4\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "fmla v28.4s, v8.4s, v3.s[0]\n"
-                        "fmla v29.4s, v9.4s, v3.s[0]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "fmla v30.4s, v10.4s, v3.s[0]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "fmla v31.4s, v11.4s, v3.s[0]\n"
-                        "b.ne 7b\n"
-                        "6:\n"
-                        "ld1r {v14.4s}, [%[minptr]]\n"
-                        "ld1r {v15.4s}, [%[maxptr]]\n"
-                        "fmax v16.4s, v16.4s, v14.4s\n"
-                        "fmax v17.4s, v17.4s, v14.4s\n"
-                        "fmax v18.4s, v18.4s, v14.4s\n"
-                        "fmax v19.4s, v19.4s, v14.4s\n"
-                        "fmin v16.4s, v16.4s, v15.4s\n"
-                        "fmin v17.4s, v17.4s, v15.4s\n"
-                        "fmin v18.4s, v18.4s, v15.4s\n"
-                        "fmin v19.4s, v19.4s, v15.4s\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "fmax v20.4s, v20.4s, v14.4s\n"
-                        "fmax v21.4s, v21.4s, v14.4s\n"
-                        "fmax v22.4s, v22.4s, v14.4s\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "fmax v23.4s, v23.4s, v14.4s\n"
-                        "fmin v20.4s, v20.4s, v15.4s\n"
-                        "fmin v21.4s, v21.4s, v15.4s\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "fmin v22.4s, v22.4s, v15.4s\n"
-                        "fmin v23.4s, v23.4s, v15.4s\n"
-                        "fmax v24.4s, v24.4s, v14.4s\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "fmax v25.4s, v25.4s, v14.4s\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "fmax v26.4s, v26.4s, v14.4s\n"
-                        "str q20, [c_ptr1]\n"
-                        "fmin v24.4s, v24.4s, v15.4s\n"
-                        "fmin v25.4s, v25.4s, v15.4s\n"
-                        "fmax v27.4s, v27.4s, v14.4s\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "fmin v26.4s, v26.4s, v15.4s\n"
-                        "fmax v28.4s, v28.4s, v14.4s\n"
-                        "fmax v29.4s, v29.4s, v14.4s\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "fmin v27.4s, v27.4s, v15.4s\n"
-                        "fmax v30.4s, v30.4s, v14.4s\n"
-                        "fmin v28.4s, v28.4s, v15.4s\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "fmin v29.4s, v29.4s, v15.4s\n"
-                        "fmax v31.4s, v31.4s, v14.4s\n"
-                        "fmin v30.4s, v30.4s, v15.4s\n"
-                        "str q24, [c_ptr2]\n"
-                        "fmin v31.4s, v31.4s, v15.4s\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        "str q28, [c_ptr3]\n"
-                        "str q29, [c_ptr3, #0x10]\n"
-                        "str q30, [c_ptr3, #0x20]\n"
-                        "str q31, [c_ptr3, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
-                    );
-                    break;
-            }
-            if (use_result_buffer) {
-                for(int cy=0; cy<std::min(M-y, 4); cy++) {
-                    for(unsigned int cx=0; cx<width; cx++) {
-                        c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
-                    }
-                }
-            }
-        }
-    }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp
deleted file mode 100644
index 016bef4b9d..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp
+++ /dev/null
@@ -1,1802 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void a64_hybrid_fp32_mla_16x4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
-    const int K_stride = K;
-    const long loops_count = ((K + 4) / 8) - 1;
-    K -= loops_count * 8;
-    const long regs_count = (K / 4) - 1;
-    K -= (regs_count + 1) * 4;
-    const long blocks_count = K / 1;
-    float nullbias[16];
-    if (!accumulate && !bias) {
-        memset(nullbias, 0, (16 * sizeof(float)));
-    }
-    float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
-    float maxval =   static_cast<float>(std::numeric_limits<float>::infinity());
-    const float * const minptr = &minval;
-    const float * const maxptr = &maxval;
-
-    switch(act.type)
-    {
-        default:
-        case Activation::Type::None:
-            break;
-        case Activation::Type::BoundedReLU:
-            maxval = static_cast<float>(act.param1);
-            /* fall through */
-        case Activation::Type::ReLU:
-            minval = 0.0f;
-            break;
-    }
-
-    int rows_to_compute;
-
-    for (int y=0; y<M; y+=rows_to_compute) {
-        const float * const a_ptr0_base = A + (y * lda);
-        const unsigned long ldab = lda * sizeof(float);
-
-        float *c_ptr0 = C + (y * ldc);
-
-        rows_to_compute = M-y;
-        if (rows_to_compute > 4) {
-            if (rows_to_compute % 4) {
-                rows_to_compute = 4 - 1;
-            } else {
-                rows_to_compute = 4;
-            }
-        }
-
-        for (int x0=0; x0<N; x0+=16ul) {
-            const long width = std::min((unsigned long)N-x0, 16ul);
-            long loops = loops_count;
-            long regs = regs_count;
-            long blocks = blocks_count;
-            const float *a_ptr0 = a_ptr0_base;
-            const float *b_ptr0 = B + (K_stride * x0);
-            const bool use_result_buffer = (width < 16);
-            float result_buffer[64];
-            const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(float);
-            float *c_ptr_real = c_ptr0;
-            if (use_result_buffer && accumulate) {
-                for(int cy=0; cy<std::min(M-y, 4); cy++) {
-                    for(unsigned int cx=0; cx<width; cx++) {
-                        result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
-                    }
-                }
-            }
-            if (use_result_buffer) {
-                c_ptr0 = result_buffer;
-            }
-            const float *biasptr = bias ? bias+x0 : nullbias;
-
-            switch(rows_to_compute) {
-                case 1:
-                    __asm __volatile (
-                        "cbnz %[accumulate], 1f\n"
-                        "ldr q16, [%[biasptr]]\n"
-                        "ldr q17, [%[biasptr], #0x10]\n"
-                        "ldr q18, [%[biasptr], #0x20]\n"
-                        "ldr q19, [%[biasptr], #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "fmla v16.4s, v12.4s, v4.s[1]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        "fmla v17.4s, v13.4s, v4.s[1]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v18.4s, v14.4s, v4.s[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v19.4s, v15.4s, v4.s[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v12.4s, v4.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v17.4s, v13.4s, v4.s[3]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v18.4s, v14.4s, v4.s[3]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v19.4s, v15.4s, v4.s[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "cbz %[regs], 4f\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v16.4s, v12.4s, v4.s[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v17.4s, v13.4s, v4.s[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v18.4s, v14.4s, v4.s[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "fmla v19.4s, v15.4s, v4.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "fmla v16.4s, v12.4s, v4.s[3]\n"
-                        "fmla v17.4s, v13.4s, v4.s[3]\n"
-                        "fmla v18.4s, v14.4s, v4.s[3]\n"
-                        "fmla v19.4s, v15.4s, v4.s[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "5:\n"
-                        "cbz %[blocks], 6f\n"
-                        "7:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "b.ne 7b\n"
-                        "6:\n"
-                        "ld1r {v14.4s}, [%[minptr]]\n"
-                        "ld1r {v15.4s}, [%[maxptr]]\n"
-                        "fmax v16.4s, v16.4s, v14.4s\n"
-                        "fmax v17.4s, v17.4s, v14.4s\n"
-                        "fmax v18.4s, v18.4s, v14.4s\n"
-                        "fmax v19.4s, v19.4s, v14.4s\n"
-                        "fmin v16.4s, v16.4s, v15.4s\n"
-                        "fmin v17.4s, v17.4s, v15.4s\n"
-                        "fmin v18.4s, v18.4s, v15.4s\n"
-                        "fmin v19.4s, v19.4s, v15.4s\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
-                    );
-                    break;
-                case 2:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ldr q16, [%[biasptr]]\n"
-                        "ldr q17, [%[biasptr], #0x10]\n"
-                        "ldr q18, [%[biasptr], #0x20]\n"
-                        "ldr q19, [%[biasptr], #0x30]\n"
-                        "mov v20.16b, v16.16b\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mov v21.16b, v17.16b\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mov v22.16b, v18.16b\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "mov v23.16b, v19.16b\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "fmla v20.4s, v12.4s, v1.s[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla v21.4s, v13.4s, v1.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla v22.4s, v14.4s, v1.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "fmla v23.4s, v15.4s, v1.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "fmla v20.4s, v12.4s, v1.s[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "fmla v21.4s, v13.4s, v1.s[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "fmla v22.4s, v14.4s, v1.s[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        "fmla v23.4s, v15.4s, v1.s[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        "fmla v20.4s, v8.4s, v5.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "fmla v21.4s, v9.4s, v5.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "fmla v22.4s, v10.4s, v5.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "fmla v23.4s, v11.4s, v5.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v4.s[1]\n"
-                        "fmla v20.4s, v12.4s, v5.s[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v13.4s, v4.s[1]\n"
-                        "fmla v21.4s, v13.4s, v5.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v14.4s, v4.s[1]\n"
-                        "fmla v22.4s, v14.4s, v5.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v15.4s, v4.s[1]\n"
-                        "fmla v23.4s, v15.4s, v5.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v20.4s, v8.4s, v5.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "fmla v21.4s, v9.4s, v5.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "fmla v22.4s, v10.4s, v5.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "fmla v23.4s, v11.4s, v5.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v16.4s, v12.4s, v4.s[3]\n"
-                        "fmla v20.4s, v12.4s, v5.s[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        "fmla v17.4s, v13.4s, v4.s[3]\n"
-                        "fmla v21.4s, v13.4s, v5.s[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        "fmla v18.4s, v14.4s, v4.s[3]\n"
-                        "fmla v22.4s, v14.4s, v5.s[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        "fmla v19.4s, v15.4s, v4.s[3]\n"
-                        "fmla v23.4s, v15.4s, v5.s[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "cbz %[regs], 4f\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "fmla v20.4s, v12.4s, v1.s[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "fmla v21.4s, v13.4s, v1.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "fmla v22.4s, v14.4s, v1.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "fmla v23.4s, v15.4s, v1.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "fmla v20.4s, v12.4s, v1.s[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "fmla v21.4s, v13.4s, v1.s[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "fmla v22.4s, v14.4s, v1.s[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "fmla v23.4s, v15.4s, v1.s[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "fmla v20.4s, v8.4s, v5.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "fmla v21.4s, v9.4s, v5.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "fmla v22.4s, v10.4s, v5.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "fmla v23.4s, v11.4s, v5.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v4.s[1]\n"
-                        "fmla v20.4s, v12.4s, v5.s[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v13.4s, v4.s[1]\n"
-                        "fmla v21.4s, v13.4s, v5.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v14.4s, v4.s[1]\n"
-                        "fmla v22.4s, v14.4s, v5.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v15.4s, v4.s[1]\n"
-                        "fmla v23.4s, v15.4s, v5.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "fmla v20.4s, v8.4s, v5.s[2]\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "fmla v21.4s, v9.4s, v5.s[2]\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "fmla v22.4s, v10.4s, v5.s[2]\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "fmla v23.4s, v11.4s, v5.s[2]\n"
-                        "fmla v16.4s, v12.4s, v4.s[3]\n"
-                        "fmla v20.4s, v12.4s, v5.s[3]\n"
-                        "fmla v17.4s, v13.4s, v4.s[3]\n"
-                        "fmla v21.4s, v13.4s, v5.s[3]\n"
-                        "fmla v18.4s, v14.4s, v4.s[3]\n"
-                        "fmla v22.4s, v14.4s, v5.s[3]\n"
-                        "fmla v19.4s, v15.4s, v4.s[3]\n"
-                        "fmla v23.4s, v15.4s, v5.s[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "fmla v20.4s, v12.4s, v1.s[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "fmla v21.4s, v13.4s, v1.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "fmla v22.4s, v14.4s, v1.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "fmla v23.4s, v15.4s, v1.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "fmla v20.4s, v12.4s, v1.s[3]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "fmla v21.4s, v13.4s, v1.s[3]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "fmla v22.4s, v14.4s, v1.s[3]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "fmla v23.4s, v15.4s, v1.s[3]\n"
-                        "5:\n"
-                        "cbz %[blocks], 6f\n"
-                        "7:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr s1, [a_ptr1]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "b.ne 7b\n"
-                        "6:\n"
-                        "ld1r {v14.4s}, [%[minptr]]\n"
-                        "ld1r {v15.4s}, [%[maxptr]]\n"
-                        "fmax v16.4s, v16.4s, v14.4s\n"
-                        "fmax v17.4s, v17.4s, v14.4s\n"
-                        "fmax v18.4s, v18.4s, v14.4s\n"
-                        "fmax v19.4s, v19.4s, v14.4s\n"
-                        "fmin v16.4s, v16.4s, v15.4s\n"
-                        "fmin v17.4s, v17.4s, v15.4s\n"
-                        "fmin v18.4s, v18.4s, v15.4s\n"
-                        "fmin v19.4s, v19.4s, v15.4s\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "fmax v20.4s, v20.4s, v14.4s\n"
-                        "fmax v21.4s, v21.4s, v14.4s\n"
-                        "fmax v22.4s, v22.4s, v14.4s\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "fmax v23.4s, v23.4s, v14.4s\n"
-                        "fmin v20.4s, v20.4s, v15.4s\n"
-                        "fmin v21.4s, v21.4s, v15.4s\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "fmin v22.4s, v22.4s, v15.4s\n"
-                        "fmin v23.4s, v23.4s, v15.4s\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
-                    );
-                    break;
-                case 3:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ldr q16, [%[biasptr]]\n"
-                        "ldr q17, [%[biasptr], #0x10]\n"
-                        "ldr q18, [%[biasptr], #0x20]\n"
-                        "ldr q19, [%[biasptr], #0x30]\n"
-                        "mov v20.16b, v16.16b\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mov v21.16b, v17.16b\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mov v22.16b, v18.16b\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mov v23.16b, v19.16b\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "mov v24.16b, v16.16b\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "mov v25.16b, v17.16b\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "mov v26.16b, v18.16b\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "mov v27.16b, v19.16b\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "fmla v20.4s, v12.4s, v1.s[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "fmla v24.4s, v12.4s, v2.s[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        "fmla v21.4s, v13.4s, v1.s[1]\n"
-                        "fmla v25.4s, v13.4s, v2.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "fmla v22.4s, v14.4s, v1.s[1]\n"
-                        "fmla v26.4s, v14.4s, v2.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "fmla v23.4s, v15.4s, v1.s[1]\n"
-                        "fmla v27.4s, v15.4s, v2.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "fmla v24.4s, v8.4s, v2.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "fmla v25.4s, v9.4s, v2.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "fmla v26.4s, v10.4s, v2.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "fmla v27.4s, v11.4s, v2.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "fmla v20.4s, v12.4s, v1.s[3]\n"
-                        "fmla v24.4s, v12.4s, v2.s[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "fmla v21.4s, v13.4s, v1.s[3]\n"
-                        "fmla v25.4s, v13.4s, v2.s[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "fmla v22.4s, v14.4s, v1.s[3]\n"
-                        "fmla v26.4s, v14.4s, v2.s[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        "fmla v23.4s, v15.4s, v1.s[3]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        "fmla v27.4s, v15.4s, v2.s[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "ldr q2, [a_ptr2, #-0x10]\n"
-                        "fmla v20.4s, v8.4s, v5.s[0]\n"
-                        "fmla v24.4s, v8.4s, v6.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "fmla v21.4s, v9.4s, v5.s[0]\n"
-                        "fmla v25.4s, v9.4s, v6.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "fmla v22.4s, v10.4s, v5.s[0]\n"
-                        "fmla v26.4s, v10.4s, v6.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "fmla v23.4s, v11.4s, v5.s[0]\n"
-                        "fmla v27.4s, v11.4s, v6.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v4.s[1]\n"
-                        "fmla v20.4s, v12.4s, v5.s[1]\n"
-                        "fmla v24.4s, v12.4s, v6.s[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v13.4s, v4.s[1]\n"
-                        "fmla v21.4s, v13.4s, v5.s[1]\n"
-                        "fmla v25.4s, v13.4s, v6.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v14.4s, v4.s[1]\n"
-                        "fmla v22.4s, v14.4s, v5.s[1]\n"
-                        "fmla v26.4s, v14.4s, v6.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v15.4s, v4.s[1]\n"
-                        "fmla v23.4s, v15.4s, v5.s[1]\n"
-                        "fmla v27.4s, v15.4s, v6.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v20.4s, v8.4s, v5.s[2]\n"
-                        "fmla v24.4s, v8.4s, v6.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "fmla v21.4s, v9.4s, v5.s[2]\n"
-                        "fmla v25.4s, v9.4s, v6.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "fmla v22.4s, v10.4s, v5.s[2]\n"
-                        "fmla v26.4s, v10.4s, v6.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "fmla v23.4s, v11.4s, v5.s[2]\n"
-                        "fmla v27.4s, v11.4s, v6.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v16.4s, v12.4s, v4.s[3]\n"
-                        "fmla v20.4s, v12.4s, v5.s[3]\n"
-                        "fmla v24.4s, v12.4s, v6.s[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        "fmla v17.4s, v13.4s, v4.s[3]\n"
-                        "fmla v21.4s, v13.4s, v5.s[3]\n"
-                        "fmla v25.4s, v13.4s, v6.s[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        "fmla v18.4s, v14.4s, v4.s[3]\n"
-                        "fmla v22.4s, v14.4s, v5.s[3]\n"
-                        "fmla v26.4s, v14.4s, v6.s[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        "fmla v19.4s, v15.4s, v4.s[3]\n"
-                        "fmla v23.4s, v15.4s, v5.s[3]\n"
-                        "fmla v27.4s, v15.4s, v6.s[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "cbz %[regs], 4f\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "fmla v20.4s, v12.4s, v1.s[1]\n"
-                        "fmla v24.4s, v12.4s, v2.s[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "fmla v21.4s, v13.4s, v1.s[1]\n"
-                        "fmla v25.4s, v13.4s, v2.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "fmla v22.4s, v14.4s, v1.s[1]\n"
-                        "fmla v26.4s, v14.4s, v2.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "fmla v23.4s, v15.4s, v1.s[1]\n"
-                        "fmla v27.4s, v15.4s, v2.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "fmla v24.4s, v8.4s, v2.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "fmla v25.4s, v9.4s, v2.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "fmla v26.4s, v10.4s, v2.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "fmla v27.4s, v11.4s, v2.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "fmla v20.4s, v12.4s, v1.s[3]\n"
-                        "fmla v24.4s, v12.4s, v2.s[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "fmla v21.4s, v13.4s, v1.s[3]\n"
-                        "fmla v25.4s, v13.4s, v2.s[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "fmla v22.4s, v14.4s, v1.s[3]\n"
-                        "fmla v26.4s, v14.4s, v2.s[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "fmla v23.4s, v15.4s, v1.s[3]\n"
-                        "fmla v27.4s, v15.4s, v2.s[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "fmla v20.4s, v8.4s, v5.s[0]\n"
-                        "fmla v24.4s, v8.4s, v6.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "fmla v21.4s, v9.4s, v5.s[0]\n"
-                        "fmla v25.4s, v9.4s, v6.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "fmla v22.4s, v10.4s, v5.s[0]\n"
-                        "fmla v26.4s, v10.4s, v6.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "fmla v23.4s, v11.4s, v5.s[0]\n"
-                        "fmla v27.4s, v11.4s, v6.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v4.s[1]\n"
-                        "fmla v20.4s, v12.4s, v5.s[1]\n"
-                        "fmla v24.4s, v12.4s, v6.s[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v13.4s, v4.s[1]\n"
-                        "fmla v21.4s, v13.4s, v5.s[1]\n"
-                        "fmla v25.4s, v13.4s, v6.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v14.4s, v4.s[1]\n"
-                        "fmla v22.4s, v14.4s, v5.s[1]\n"
-                        "fmla v26.4s, v14.4s, v6.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v15.4s, v4.s[1]\n"
-                        "fmla v23.4s, v15.4s, v5.s[1]\n"
-                        "fmla v27.4s, v15.4s, v6.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "fmla v20.4s, v8.4s, v5.s[2]\n"
-                        "fmla v24.4s, v8.4s, v6.s[2]\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "fmla v21.4s, v9.4s, v5.s[2]\n"
-                        "fmla v25.4s, v9.4s, v6.s[2]\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "fmla v22.4s, v10.4s, v5.s[2]\n"
-                        "fmla v26.4s, v10.4s, v6.s[2]\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "fmla v23.4s, v11.4s, v5.s[2]\n"
-                        "fmla v27.4s, v11.4s, v6.s[2]\n"
-                        "fmla v16.4s, v12.4s, v4.s[3]\n"
-                        "fmla v20.4s, v12.4s, v5.s[3]\n"
-                        "fmla v24.4s, v12.4s, v6.s[3]\n"
-                        "fmla v17.4s, v13.4s, v4.s[3]\n"
-                        "fmla v21.4s, v13.4s, v5.s[3]\n"
-                        "fmla v25.4s, v13.4s, v6.s[3]\n"
-                        "fmla v18.4s, v14.4s, v4.s[3]\n"
-                        "fmla v22.4s, v14.4s, v5.s[3]\n"
-                        "fmla v26.4s, v14.4s, v6.s[3]\n"
-                        "fmla v19.4s, v15.4s, v4.s[3]\n"
-                        "fmla v23.4s, v15.4s, v5.s[3]\n"
-                        "fmla v27.4s, v15.4s, v6.s[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "fmla v20.4s, v12.4s, v1.s[1]\n"
-                        "fmla v24.4s, v12.4s, v2.s[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "fmla v21.4s, v13.4s, v1.s[1]\n"
-                        "fmla v25.4s, v13.4s, v2.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "fmla v22.4s, v14.4s, v1.s[1]\n"
-                        "fmla v26.4s, v14.4s, v2.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "fmla v23.4s, v15.4s, v1.s[1]\n"
-                        "fmla v27.4s, v15.4s, v2.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "fmla v24.4s, v8.4s, v2.s[2]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "fmla v25.4s, v9.4s, v2.s[2]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "fmla v26.4s, v10.4s, v2.s[2]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "fmla v27.4s, v11.4s, v2.s[2]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "fmla v20.4s, v12.4s, v1.s[3]\n"
-                        "fmla v24.4s, v12.4s, v2.s[3]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "fmla v21.4s, v13.4s, v1.s[3]\n"
-                        "fmla v25.4s, v13.4s, v2.s[3]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "fmla v22.4s, v14.4s, v1.s[3]\n"
-                        "fmla v26.4s, v14.4s, v2.s[3]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "fmla v23.4s, v15.4s, v1.s[3]\n"
-                        "fmla v27.4s, v15.4s, v2.s[3]\n"
-                        "5:\n"
-                        "cbz %[blocks], 6f\n"
-                        "7:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr s1, [a_ptr1]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr s2, [a_ptr2]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "add a_ptr2, a_ptr2, #0x4\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "b.ne 7b\n"
-                        "6:\n"
-                        "ld1r {v14.4s}, [%[minptr]]\n"
-                        "ld1r {v15.4s}, [%[maxptr]]\n"
-                        "fmax v16.4s, v16.4s, v14.4s\n"
-                        "fmax v17.4s, v17.4s, v14.4s\n"
-                        "fmax v18.4s, v18.4s, v14.4s\n"
-                        "fmax v19.4s, v19.4s, v14.4s\n"
-                        "fmin v16.4s, v16.4s, v15.4s\n"
-                        "fmin v17.4s, v17.4s, v15.4s\n"
-                        "fmin v18.4s, v18.4s, v15.4s\n"
-                        "fmin v19.4s, v19.4s, v15.4s\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "fmax v20.4s, v20.4s, v14.4s\n"
-                        "fmax v21.4s, v21.4s, v14.4s\n"
-                        "fmax v22.4s, v22.4s, v14.4s\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "fmax v23.4s, v23.4s, v14.4s\n"
-                        "fmin v20.4s, v20.4s, v15.4s\n"
-                        "fmin v21.4s, v21.4s, v15.4s\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "fmin v22.4s, v22.4s, v15.4s\n"
-                        "fmin v23.4s, v23.4s, v15.4s\n"
-                        "fmax v24.4s, v24.4s, v14.4s\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "fmax v25.4s, v25.4s, v14.4s\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "fmax v26.4s, v26.4s, v14.4s\n"
-                        "str q20, [c_ptr1]\n"
-                        "fmin v24.4s, v24.4s, v15.4s\n"
-                        "fmin v25.4s, v25.4s, v15.4s\n"
-                        "fmax v27.4s, v27.4s, v14.4s\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "fmin v26.4s, v26.4s, v15.4s\n"
-                        "fmin v27.4s, v27.4s, v15.4s\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "str q24, [c_ptr2]\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
-                    );
-                    break;
-                default:
-                case 4:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ldr q16, [%[biasptr]]\n"
-                        "ldr q17, [%[biasptr], #0x10]\n"
-                        "ldr q18, [%[biasptr], #0x20]\n"
-                        "ldr q19, [%[biasptr], #0x30]\n"
-                        "mov v20.16b, v16.16b\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mov v21.16b, v17.16b\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mov v22.16b, v18.16b\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mov v23.16b, v19.16b\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "mov v24.16b, v16.16b\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "mov v25.16b, v17.16b\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "mov v26.16b, v18.16b\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "mov v27.16b, v19.16b\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "mov v28.16b, v16.16b\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "mov v29.16b, v17.16b\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "mov v30.16b, v18.16b\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "mov v31.16b, v19.16b\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "ldr q28, [c_ptr3]\n"
-                        "ldr q29, [c_ptr3, #0x10]\n"
-                        "ldr q30, [c_ptr3, #0x20]\n"
-                        "ldr q31, [c_ptr3, #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        "fmla v28.4s, v8.4s, v3.s[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr q7, [a_ptr3]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla v29.4s, v9.4s, v3.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla v30.4s, v10.4s, v3.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "fmla v31.4s, v11.4s, v3.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        "fmla v20.4s, v12.4s, v1.s[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
-                        "fmla v24.4s, v12.4s, v2.s[1]\n"
-                        "fmla v28.4s, v12.4s, v3.s[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "fmla v21.4s, v13.4s, v1.s[1]\n"
-                        "fmla v25.4s, v13.4s, v2.s[1]\n"
-                        "fmla v29.4s, v13.4s, v3.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "fmla v22.4s, v14.4s, v1.s[1]\n"
-                        "fmla v26.4s, v14.4s, v2.s[1]\n"
-                        "fmla v30.4s, v14.4s, v3.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "fmla v23.4s, v15.4s, v1.s[1]\n"
-                        "fmla v27.4s, v15.4s, v2.s[1]\n"
-                        "fmla v31.4s, v15.4s, v3.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "fmla v24.4s, v8.4s, v2.s[2]\n"
-                        "fmla v28.4s, v8.4s, v3.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "fmla v25.4s, v9.4s, v2.s[2]\n"
-                        "fmla v29.4s, v9.4s, v3.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "fmla v26.4s, v10.4s, v2.s[2]\n"
-                        "fmla v30.4s, v10.4s, v3.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "fmla v27.4s, v11.4s, v2.s[2]\n"
-                        "fmla v31.4s, v11.4s, v3.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "fmla v20.4s, v12.4s, v1.s[3]\n"
-                        "fmla v24.4s, v12.4s, v2.s[3]\n"
-                        "fmla v28.4s, v12.4s, v3.s[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "fmla v21.4s, v13.4s, v1.s[3]\n"
-                        "fmla v25.4s, v13.4s, v2.s[3]\n"
-                        "fmla v29.4s, v13.4s, v3.s[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "fmla v22.4s, v14.4s, v1.s[3]\n"
-                        "fmla v26.4s, v14.4s, v2.s[3]\n"
-                        "fmla v30.4s, v14.4s, v3.s[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        "fmla v23.4s, v15.4s, v1.s[3]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        "fmla v27.4s, v15.4s, v2.s[3]\n"
-                        "ldr q2, [a_ptr2, #-0x10]\n"
-                        "fmla v31.4s, v15.4s, v3.s[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "ldr q3, [a_ptr3, #-0x10]\n"
-                        "fmla v20.4s, v8.4s, v5.s[0]\n"
-                        "fmla v24.4s, v8.4s, v6.s[0]\n"
-                        "fmla v28.4s, v8.4s, v7.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "fmla v21.4s, v9.4s, v5.s[0]\n"
-                        "fmla v25.4s, v9.4s, v6.s[0]\n"
-                        "fmla v29.4s, v9.4s, v7.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "fmla v22.4s, v10.4s, v5.s[0]\n"
-                        "fmla v26.4s, v10.4s, v6.s[0]\n"
-                        "fmla v30.4s, v10.4s, v7.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "fmla v23.4s, v11.4s, v5.s[0]\n"
-                        "fmla v27.4s, v11.4s, v6.s[0]\n"
-                        "fmla v31.4s, v11.4s, v7.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v4.s[1]\n"
-                        "fmla v20.4s, v12.4s, v5.s[1]\n"
-                        "fmla v24.4s, v12.4s, v6.s[1]\n"
-                        "fmla v28.4s, v12.4s, v7.s[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v13.4s, v4.s[1]\n"
-                        "fmla v21.4s, v13.4s, v5.s[1]\n"
-                        "fmla v25.4s, v13.4s, v6.s[1]\n"
-                        "fmla v29.4s, v13.4s, v7.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v14.4s, v4.s[1]\n"
-                        "fmla v22.4s, v14.4s, v5.s[1]\n"
-                        "fmla v26.4s, v14.4s, v6.s[1]\n"
-                        "fmla v30.4s, v14.4s, v7.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v15.4s, v4.s[1]\n"
-                        "fmla v23.4s, v15.4s, v5.s[1]\n"
-                        "fmla v27.4s, v15.4s, v6.s[1]\n"
-                        "fmla v31.4s, v15.4s, v7.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v20.4s, v8.4s, v5.s[2]\n"
-                        "fmla v24.4s, v8.4s, v6.s[2]\n"
-                        "fmla v28.4s, v8.4s, v7.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "fmla v21.4s, v9.4s, v5.s[2]\n"
-                        "fmla v25.4s, v9.4s, v6.s[2]\n"
-                        "fmla v29.4s, v9.4s, v7.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "fmla v22.4s, v10.4s, v5.s[2]\n"
-                        "fmla v26.4s, v10.4s, v6.s[2]\n"
-                        "fmla v30.4s, v10.4s, v7.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "fmla v23.4s, v11.4s, v5.s[2]\n"
-                        "fmla v27.4s, v11.4s, v6.s[2]\n"
-                        "fmla v31.4s, v11.4s, v7.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v16.4s, v12.4s, v4.s[3]\n"
-                        "fmla v20.4s, v12.4s, v5.s[3]\n"
-                        "fmla v24.4s, v12.4s, v6.s[3]\n"
-                        "fmla v28.4s, v12.4s, v7.s[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        "fmla v17.4s, v13.4s, v4.s[3]\n"
-                        "fmla v21.4s, v13.4s, v5.s[3]\n"
-                        "fmla v25.4s, v13.4s, v6.s[3]\n"
-                        "fmla v29.4s, v13.4s, v7.s[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        "fmla v18.4s, v14.4s, v4.s[3]\n"
-                        "fmla v22.4s, v14.4s, v5.s[3]\n"
-                        "fmla v26.4s, v14.4s, v6.s[3]\n"
-                        "fmla v30.4s, v14.4s, v7.s[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        "fmla v19.4s, v15.4s, v4.s[3]\n"
-                        "fmla v23.4s, v15.4s, v5.s[3]\n"
-                        "fmla v27.4s, v15.4s, v6.s[3]\n"
-                        "fmla v31.4s, v15.4s, v7.s[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "prfm PSTL1KEEP, [c_ptr3]\n"
-                        "cbz %[regs], 4f\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        "fmla v28.4s, v8.4s, v3.s[0]\n"
-                        "ldr q7, [a_ptr3]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "fmla v29.4s, v9.4s, v3.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "fmla v30.4s, v10.4s, v3.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "fmla v31.4s, v11.4s, v3.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "fmla v20.4s, v12.4s, v1.s[1]\n"
-                        "fmla v24.4s, v12.4s, v2.s[1]\n"
-                        "fmla v28.4s, v12.4s, v3.s[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "fmla v21.4s, v13.4s, v1.s[1]\n"
-                        "fmla v25.4s, v13.4s, v2.s[1]\n"
-                        "fmla v29.4s, v13.4s, v3.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "fmla v22.4s, v14.4s, v1.s[1]\n"
-                        "fmla v26.4s, v14.4s, v2.s[1]\n"
-                        "fmla v30.4s, v14.4s, v3.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "fmla v23.4s, v15.4s, v1.s[1]\n"
-                        "fmla v27.4s, v15.4s, v2.s[1]\n"
-                        "fmla v31.4s, v15.4s, v3.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "fmla v24.4s, v8.4s, v2.s[2]\n"
-                        "fmla v28.4s, v8.4s, v3.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "fmla v25.4s, v9.4s, v2.s[2]\n"
-                        "fmla v29.4s, v9.4s, v3.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "fmla v26.4s, v10.4s, v2.s[2]\n"
-                        "fmla v30.4s, v10.4s, v3.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "fmla v27.4s, v11.4s, v2.s[2]\n"
-                        "fmla v31.4s, v11.4s, v3.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "fmla v20.4s, v12.4s, v1.s[3]\n"
-                        "fmla v24.4s, v12.4s, v2.s[3]\n"
-                        "fmla v28.4s, v12.4s, v3.s[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "fmla v21.4s, v13.4s, v1.s[3]\n"
-                        "fmla v25.4s, v13.4s, v2.s[3]\n"
-                        "fmla v29.4s, v13.4s, v3.s[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "fmla v22.4s, v14.4s, v1.s[3]\n"
-                        "fmla v26.4s, v14.4s, v2.s[3]\n"
-                        "fmla v30.4s, v14.4s, v3.s[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "fmla v23.4s, v15.4s, v1.s[3]\n"
-                        "fmla v27.4s, v15.4s, v2.s[3]\n"
-                        "fmla v31.4s, v15.4s, v3.s[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "fmla v20.4s, v8.4s, v5.s[0]\n"
-                        "fmla v24.4s, v8.4s, v6.s[0]\n"
-                        "fmla v28.4s, v8.4s, v7.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "fmla v21.4s, v9.4s, v5.s[0]\n"
-                        "fmla v25.4s, v9.4s, v6.s[0]\n"
-                        "fmla v29.4s, v9.4s, v7.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "fmla v22.4s, v10.4s, v5.s[0]\n"
-                        "fmla v26.4s, v10.4s, v6.s[0]\n"
-                        "fmla v30.4s, v10.4s, v7.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "fmla v23.4s, v11.4s, v5.s[0]\n"
-                        "fmla v27.4s, v11.4s, v6.s[0]\n"
-                        "fmla v31.4s, v11.4s, v7.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v4.s[1]\n"
-                        "fmla v20.4s, v12.4s, v5.s[1]\n"
-                        "fmla v24.4s, v12.4s, v6.s[1]\n"
-                        "fmla v28.4s, v12.4s, v7.s[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v13.4s, v4.s[1]\n"
-                        "fmla v21.4s, v13.4s, v5.s[1]\n"
-                        "fmla v25.4s, v13.4s, v6.s[1]\n"
-                        "fmla v29.4s, v13.4s, v7.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v14.4s, v4.s[1]\n"
-                        "fmla v22.4s, v14.4s, v5.s[1]\n"
-                        "fmla v26.4s, v14.4s, v6.s[1]\n"
-                        "fmla v30.4s, v14.4s, v7.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v15.4s, v4.s[1]\n"
-                        "fmla v23.4s, v15.4s, v5.s[1]\n"
-                        "fmla v27.4s, v15.4s, v6.s[1]\n"
-                        "fmla v31.4s, v15.4s, v7.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "fmla v20.4s, v8.4s, v5.s[2]\n"
-                        "fmla v24.4s, v8.4s, v6.s[2]\n"
-                        "fmla v28.4s, v8.4s, v7.s[2]\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "fmla v21.4s, v9.4s, v5.s[2]\n"
-                        "fmla v25.4s, v9.4s, v6.s[2]\n"
-                        "fmla v29.4s, v9.4s, v7.s[2]\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "fmla v22.4s, v10.4s, v5.s[2]\n"
-                        "fmla v26.4s, v10.4s, v6.s[2]\n"
-                        "fmla v30.4s, v10.4s, v7.s[2]\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "fmla v23.4s, v11.4s, v5.s[2]\n"
-                        "fmla v27.4s, v11.4s, v6.s[2]\n"
-                        "fmla v31.4s, v11.4s, v7.s[2]\n"
-                        "fmla v16.4s, v12.4s, v4.s[3]\n"
-                        "fmla v20.4s, v12.4s, v5.s[3]\n"
-                        "fmla v24.4s, v12.4s, v6.s[3]\n"
-                        "fmla v28.4s, v12.4s, v7.s[3]\n"
-                        "fmla v17.4s, v13.4s, v4.s[3]\n"
-                        "fmla v21.4s, v13.4s, v5.s[3]\n"
-                        "fmla v25.4s, v13.4s, v6.s[3]\n"
-                        "fmla v29.4s, v13.4s, v7.s[3]\n"
-                        "fmla v18.4s, v14.4s, v4.s[3]\n"
-                        "fmla v22.4s, v14.4s, v5.s[3]\n"
-                        "fmla v26.4s, v14.4s, v6.s[3]\n"
-                        "fmla v30.4s, v14.4s, v7.s[3]\n"
-                        "fmla v19.4s, v15.4s, v4.s[3]\n"
-                        "fmla v23.4s, v15.4s, v5.s[3]\n"
-                        "fmla v27.4s, v15.4s, v6.s[3]\n"
-                        "fmla v31.4s, v15.4s, v7.s[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "fmla v28.4s, v8.4s, v3.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "fmla v29.4s, v9.4s, v3.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "fmla v30.4s, v10.4s, v3.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "fmla v31.4s, v11.4s, v3.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "fmla v20.4s, v12.4s, v1.s[1]\n"
-                        "fmla v24.4s, v12.4s, v2.s[1]\n"
-                        "fmla v28.4s, v12.4s, v3.s[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "fmla v21.4s, v13.4s, v1.s[1]\n"
-                        "fmla v25.4s, v13.4s, v2.s[1]\n"
-                        "fmla v29.4s, v13.4s, v3.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "fmla v22.4s, v14.4s, v1.s[1]\n"
-                        "fmla v26.4s, v14.4s, v2.s[1]\n"
-                        "fmla v30.4s, v14.4s, v3.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "fmla v23.4s, v15.4s, v1.s[1]\n"
-                        "fmla v27.4s, v15.4s, v2.s[1]\n"
-                        "fmla v31.4s, v15.4s, v3.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "fmla v24.4s, v8.4s, v2.s[2]\n"
-                        "fmla v28.4s, v8.4s, v3.s[2]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "fmla v25.4s, v9.4s, v2.s[2]\n"
-                        "fmla v29.4s, v9.4s, v3.s[2]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "fmla v26.4s, v10.4s, v2.s[2]\n"
-                        "fmla v30.4s, v10.4s, v3.s[2]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "fmla v27.4s, v11.4s, v2.s[2]\n"
-                        "fmla v31.4s, v11.4s, v3.s[2]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "fmla v20.4s, v12.4s, v1.s[3]\n"
-                        "fmla v24.4s, v12.4s, v2.s[3]\n"
-                        "fmla v28.4s, v12.4s, v3.s[3]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "fmla v21.4s, v13.4s, v1.s[3]\n"
-                        "fmla v25.4s, v13.4s, v2.s[3]\n"
-                        "fmla v29.4s, v13.4s, v3.s[3]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "fmla v22.4s, v14.4s, v1.s[3]\n"
-                        "fmla v26.4s, v14.4s, v2.s[3]\n"
-                        "fmla v30.4s, v14.4s, v3.s[3]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "fmla v23.4s, v15.4s, v1.s[3]\n"
-                        "fmla v27.4s, v15.4s, v2.s[3]\n"
-                        "fmla v31.4s, v15.4s, v3.s[3]\n"
-                        "5:\n"
-                        "cbz %[blocks], 6f\n"
-                        "7:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr s1, [a_ptr1]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr s2, [a_ptr2]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "add a_ptr2, a_ptr2, #0x4\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "ldr s3, [a_ptr3]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "add a_ptr3, a_ptr3, #0x4\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "fmla v28.4s, v8.4s, v3.s[0]\n"
-                        "fmla v29.4s, v9.4s, v3.s[0]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "fmla v30.4s, v10.4s, v3.s[0]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "fmla v31.4s, v11.4s, v3.s[0]\n"
-                        "b.ne 7b\n"
-                        "6:\n"
-                        "ld1r {v14.4s}, [%[minptr]]\n"
-                        "ld1r {v15.4s}, [%[maxptr]]\n"
-                        "fmax v16.4s, v16.4s, v14.4s\n"
-                        "fmax v17.4s, v17.4s, v14.4s\n"
-                        "fmax v18.4s, v18.4s, v14.4s\n"
-                        "fmax v19.4s, v19.4s, v14.4s\n"
-                        "fmin v16.4s, v16.4s, v15.4s\n"
-                        "fmin v17.4s, v17.4s, v15.4s\n"
-                        "fmin v18.4s, v18.4s, v15.4s\n"
-                        "fmin v19.4s, v19.4s, v15.4s\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "fmax v20.4s, v20.4s, v14.4s\n"
-                        "fmax v21.4s, v21.4s, v14.4s\n"
-                        "fmax v22.4s, v22.4s, v14.4s\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "fmax v23.4s, v23.4s, v14.4s\n"
-                        "fmin v20.4s, v20.4s, v15.4s\n"
-                        "fmin v21.4s, v21.4s, v15.4s\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "fmin v22.4s, v22.4s, v15.4s\n"
-                        "fmin v23.4s, v23.4s, v15.4s\n"
-                        "fmax v24.4s, v24.4s, v14.4s\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "fmax v25.4s, v25.4s, v14.4s\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "fmax v26.4s, v26.4s, v14.4s\n"
-                        "str q20, [c_ptr1]\n"
-                        "fmin v24.4s, v24.4s, v15.4s\n"
-                        "fmin v25.4s, v25.4s, v15.4s\n"
-                        "fmax v27.4s, v27.4s, v14.4s\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "fmin v26.4s, v26.4s, v15.4s\n"
-                        "fmax v28.4s, v28.4s, v14.4s\n"
-                        "fmax v29.4s, v29.4s, v14.4s\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "fmin v27.4s, v27.4s, v15.4s\n"
-                        "fmax v30.4s, v30.4s, v14.4s\n"
-                        "fmin v28.4s, v28.4s, v15.4s\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "fmin v29.4s, v29.4s, v15.4s\n"
-                        "fmax v31.4s, v31.4s, v14.4s\n"
-                        "fmin v30.4s, v30.4s, v15.4s\n"
-                        "str q24, [c_ptr2]\n"
-                        "fmin v31.4s, v31.4s, v15.4s\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        "str q28, [c_ptr3]\n"
-                        "str q29, [c_ptr3, #0x10]\n"
-                        "str q30, [c_ptr3, #0x20]\n"
-                        "str q31, [c_ptr3, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
-                    );
-                    break;
-            }
-            if (use_result_buffer) {
-                for(int cy=0; cy<std::min(M-y, 4); cy++) {
-                    for(unsigned int cx=0; cx<width; cx++) {
-                        c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
-                    }
-                }
-            }
-        }
-    }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp
deleted file mode 100644
index 3f1df76a6a..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp
+++ /dev/null
@@ -1,1810 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void a64_hybrid_fp32_mla_16x4_x1(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
-    const int K_stride = K;
-    const long loops_count = ((K + 4) / 8) - 1;
-    K -= loops_count * 8;
-    const long regs_count = (K / 4) - 1;
-    K -= (regs_count + 1) * 4;
-    const long blocks_count = K / 1;
-    float nullbias[16];
-    if (!accumulate && !bias) {
-        memset(nullbias, 0, (16 * sizeof(float)));
-    }
-    float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
-    float maxval =   static_cast<float>(std::numeric_limits<float>::infinity());
-    const float * const minptr = &minval;
-    const float * const maxptr = &maxval;
-
-    switch(act.type)
-    {
-        default:
-        case Activation::Type::None:
-            break;
-        case Activation::Type::BoundedReLU:
-            maxval = static_cast<float>(act.param1);
-            /* fall through */
-        case Activation::Type::ReLU:
-            minval = 0.0f;
-            break;
-    }
-
-    int rows_to_compute;
-
-    for (int y=0; y<M; y+=rows_to_compute) {
-        const float * const a_ptr0_base = A + (y * lda);
-        const unsigned long ldab = lda * sizeof(float);
-
-        float *c_ptr0 = C + (y * ldc);
-
-        rows_to_compute = M-y;
-        if (rows_to_compute > 4) {
-            if (rows_to_compute % 4) {
-                rows_to_compute = 4 - 1;
-            } else {
-                rows_to_compute = 4;
-            }
-        }
-
-        for (int x0=0; x0<N; x0+=16ul) {
-            const long width = std::min((unsigned long)N-x0, 16ul);
-            long loops = loops_count;
-            long regs = regs_count;
-            long blocks = blocks_count;
-            const float *a_ptr0 = a_ptr0_base;
-            const float *b_ptr0 = B + (K_stride * x0);
-            const bool use_result_buffer = (width < 16);
-            float result_buffer[64];
-            const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(float);
-            float *c_ptr_real = c_ptr0;
-            if (use_result_buffer && accumulate) {
-                for(int cy=0; cy<std::min(M-y, 4); cy++) {
-                    for(unsigned int cx=0; cx<width; cx++) {
-                        result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
-                    }
-                }
-            }
-            if (use_result_buffer) {
-                c_ptr0 = result_buffer;
-            }
-            const float *biasptr = bias ? bias+x0 : nullbias;
-
-            switch(rows_to_compute) {
-                case 1:
-                    __asm __volatile (
-                        "cbnz %[accumulate], 1f\n"
-                        "ldr q16, [%[biasptr]]\n"
-                        "ldr q17, [%[biasptr], #0x10]\n"
-                        "ldr q18, [%[biasptr], #0x20]\n"
-                        "ldr q19, [%[biasptr], #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #-0x10]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v16.4s, v8.4s, v0.s[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v17.4s, v9.4s, v0.s[1]\n"
-                        "ldr q8, [%[b_ptr0], #0x40]\n"
-                        "ldr q9, [%[b_ptr0], #0x50]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "fmla v18.4s, v10.4s, v0.s[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v11.4s, v0.s[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla v16.4s, v8.4s, v0.s[3]\n"
-                        "ldr q8, [%[b_ptr0], #-0x40]\n"
-                        "fmla v17.4s, v9.4s, v0.s[3]\n"
-                        "ldr q9, [%[b_ptr0], #-0x30]\n"
-                        "fmla v18.4s, v10.4s, v0.s[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[3]\n"
-                        "ldr q11, [%[b_ptr0], #-0x10]\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v4.s[1]\n"
-                        "ldr q8, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v9.4s, v4.s[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v10.4s, v4.s[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v11.4s, v4.s[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v16.4s, v8.4s, v4.s[3]\n"
-                        "ldr q8, [%[b_ptr0], #-0x40]\n"
-                        "fmla v17.4s, v9.4s, v4.s[3]\n"
-                        "ldr q9, [%[b_ptr0], #-0x30]\n"
-                        "fmla v18.4s, v10.4s, v4.s[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q11, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "cbz %[regs], 4f\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "fmla v16.4s, v8.4s, v0.s[1]\n"
-                        "ldr q8, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v9.4s, v0.s[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v10.4s, v0.s[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v11.4s, v0.s[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v16.4s, v8.4s, v0.s[3]\n"
-                        "ldr q8, [%[b_ptr0], #-0x40]\n"
-                        "fmla v17.4s, v9.4s, v0.s[3]\n"
-                        "ldr q9, [%[b_ptr0], #-0x30]\n"
-                        "fmla v18.4s, v10.4s, v0.s[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[3]\n"
-                        "ldr q11, [%[b_ptr0], #-0x10]\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v4.s[1]\n"
-                        "ldr q8, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v9.4s, v4.s[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v10.4s, v4.s[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v11.4s, v4.s[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "add %[b_ptr0], %[b_ptr0], #-0x40\n"
-                        "fmla v16.4s, v8.4s, v4.s[3]\n"
-                        "fmla v17.4s, v9.4s, v4.s[3]\n"
-                        "fmla v18.4s, v10.4s, v4.s[3]\n"
-                        "fmla v19.4s, v11.4s, v4.s[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v0.s[1]\n"
-                        "ldr q8, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v9.4s, v0.s[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v10.4s, v0.s[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v11.4s, v0.s[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "add %[b_ptr0], %[b_ptr0], #-0x40\n"
-                        "fmla v16.4s, v8.4s, v0.s[3]\n"
-                        "fmla v17.4s, v9.4s, v0.s[3]\n"
-                        "fmla v18.4s, v10.4s, v0.s[3]\n"
-                        "fmla v19.4s, v11.4s, v0.s[3]\n"
-                        "5:\n"
-                        "cbz %[blocks], 6f\n"
-                        "7:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "b.ne 7b\n"
-                        "6:\n"
-                        "ld1r {v14.4s}, [%[minptr]]\n"
-                        "ld1r {v15.4s}, [%[maxptr]]\n"
-                        "fmax v16.4s, v16.4s, v14.4s\n"
-                        "fmax v17.4s, v17.4s, v14.4s\n"
-                        "fmax v18.4s, v18.4s, v14.4s\n"
-                        "fmax v19.4s, v19.4s, v14.4s\n"
-                        "fmin v16.4s, v16.4s, v15.4s\n"
-                        "fmin v17.4s, v17.4s, v15.4s\n"
-                        "fmin v18.4s, v18.4s, v15.4s\n"
-                        "fmin v19.4s, v19.4s, v15.4s\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
-                    );
-                    break;
-                case 2:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ldr q16, [%[biasptr]]\n"
-                        "ldr q17, [%[biasptr], #0x10]\n"
-                        "ldr q18, [%[biasptr], #0x20]\n"
-                        "ldr q19, [%[biasptr], #0x30]\n"
-                        "mov v20.16b, v16.16b\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mov v21.16b, v17.16b\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mov v22.16b, v18.16b\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "mov v23.16b, v19.16b\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #-0x10]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v0.s[1]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "fmla v20.4s, v8.4s, v1.s[1]\n"
-                        "ldr q8, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v9.4s, v0.s[1]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla v21.4s, v9.4s, v1.s[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v10.4s, v0.s[1]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla v22.4s, v10.4s, v1.s[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v11.4s, v0.s[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "fmla v23.4s, v11.4s, v1.s[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v16.4s, v8.4s, v0.s[3]\n"
-                        "fmla v20.4s, v8.4s, v1.s[3]\n"
-                        "ldr q8, [%[b_ptr0], #-0x40]\n"
-                        "fmla v17.4s, v9.4s, v0.s[3]\n"
-                        "fmla v21.4s, v9.4s, v1.s[3]\n"
-                        "ldr q9, [%[b_ptr0], #-0x30]\n"
-                        "fmla v18.4s, v10.4s, v0.s[3]\n"
-                        "fmla v22.4s, v10.4s, v1.s[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        "fmla v23.4s, v11.4s, v1.s[3]\n"
-                        "ldr q11, [%[b_ptr0], #-0x10]\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        "fmla v20.4s, v8.4s, v5.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "fmla v21.4s, v9.4s, v5.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "fmla v22.4s, v10.4s, v5.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "fmla v23.4s, v11.4s, v5.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v4.s[1]\n"
-                        "fmla v20.4s, v8.4s, v5.s[1]\n"
-                        "ldr q8, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v9.4s, v4.s[1]\n"
-                        "fmla v21.4s, v9.4s, v5.s[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v10.4s, v4.s[1]\n"
-                        "fmla v22.4s, v10.4s, v5.s[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v11.4s, v4.s[1]\n"
-                        "fmla v23.4s, v11.4s, v5.s[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v20.4s, v8.4s, v5.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "fmla v21.4s, v9.4s, v5.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "fmla v22.4s, v10.4s, v5.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "fmla v23.4s, v11.4s, v5.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v16.4s, v8.4s, v4.s[3]\n"
-                        "fmla v20.4s, v8.4s, v5.s[3]\n"
-                        "ldr q8, [%[b_ptr0], #-0x40]\n"
-                        "fmla v17.4s, v9.4s, v4.s[3]\n"
-                        "fmla v21.4s, v9.4s, v5.s[3]\n"
-                        "ldr q9, [%[b_ptr0], #-0x30]\n"
-                        "fmla v18.4s, v10.4s, v4.s[3]\n"
-                        "fmla v22.4s, v10.4s, v5.s[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[3]\n"
-                        "fmla v23.4s, v11.4s, v5.s[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q11, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "cbz %[regs], 4f\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v0.s[1]\n"
-                        "fmla v20.4s, v8.4s, v1.s[1]\n"
-                        "ldr q8, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v9.4s, v0.s[1]\n"
-                        "fmla v21.4s, v9.4s, v1.s[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v10.4s, v0.s[1]\n"
-                        "fmla v22.4s, v10.4s, v1.s[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v11.4s, v0.s[1]\n"
-                        "fmla v23.4s, v11.4s, v1.s[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v16.4s, v8.4s, v0.s[3]\n"
-                        "fmla v20.4s, v8.4s, v1.s[3]\n"
-                        "ldr q8, [%[b_ptr0], #-0x40]\n"
-                        "fmla v17.4s, v9.4s, v0.s[3]\n"
-                        "fmla v21.4s, v9.4s, v1.s[3]\n"
-                        "ldr q9, [%[b_ptr0], #-0x30]\n"
-                        "fmla v18.4s, v10.4s, v0.s[3]\n"
-                        "fmla v22.4s, v10.4s, v1.s[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[3]\n"
-                        "fmla v23.4s, v11.4s, v1.s[3]\n"
-                        "ldr q11, [%[b_ptr0], #-0x10]\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "fmla v20.4s, v8.4s, v5.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "fmla v21.4s, v9.4s, v5.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "fmla v22.4s, v10.4s, v5.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "fmla v23.4s, v11.4s, v5.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v4.s[1]\n"
-                        "fmla v20.4s, v8.4s, v5.s[1]\n"
-                        "ldr q8, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v9.4s, v4.s[1]\n"
-                        "fmla v21.4s, v9.4s, v5.s[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v10.4s, v4.s[1]\n"
-                        "fmla v22.4s, v10.4s, v5.s[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v11.4s, v4.s[1]\n"
-                        "fmla v23.4s, v11.4s, v5.s[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v20.4s, v8.4s, v5.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "fmla v21.4s, v9.4s, v5.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "fmla v22.4s, v10.4s, v5.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "fmla v23.4s, v11.4s, v5.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v16.4s, v8.4s, v4.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #-0x40\n"
-                        "fmla v20.4s, v8.4s, v5.s[3]\n"
-                        "fmla v17.4s, v9.4s, v4.s[3]\n"
-                        "fmla v21.4s, v9.4s, v5.s[3]\n"
-                        "fmla v18.4s, v10.4s, v4.s[3]\n"
-                        "fmla v22.4s, v10.4s, v5.s[3]\n"
-                        "fmla v19.4s, v11.4s, v4.s[3]\n"
-                        "fmla v23.4s, v11.4s, v5.s[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v0.s[1]\n"
-                        "fmla v20.4s, v8.4s, v1.s[1]\n"
-                        "ldr q8, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v9.4s, v0.s[1]\n"
-                        "fmla v21.4s, v9.4s, v1.s[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v10.4s, v0.s[1]\n"
-                        "fmla v22.4s, v10.4s, v1.s[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v11.4s, v0.s[1]\n"
-                        "fmla v23.4s, v11.4s, v1.s[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v16.4s, v8.4s, v0.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #-0x40\n"
-                        "fmla v20.4s, v8.4s, v1.s[3]\n"
-                        "fmla v17.4s, v9.4s, v0.s[3]\n"
-                        "fmla v21.4s, v9.4s, v1.s[3]\n"
-                        "fmla v18.4s, v10.4s, v0.s[3]\n"
-                        "fmla v22.4s, v10.4s, v1.s[3]\n"
-                        "fmla v19.4s, v11.4s, v0.s[3]\n"
-                        "fmla v23.4s, v11.4s, v1.s[3]\n"
-                        "5:\n"
-                        "cbz %[blocks], 6f\n"
-                        "7:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr s1, [a_ptr1]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "b.ne 7b\n"
-                        "6:\n"
-                        "ld1r {v14.4s}, [%[minptr]]\n"
-                        "ld1r {v15.4s}, [%[maxptr]]\n"
-                        "fmax v16.4s, v16.4s, v14.4s\n"
-                        "fmax v17.4s, v17.4s, v14.4s\n"
-                        "fmax v18.4s, v18.4s, v14.4s\n"
-                        "fmax v19.4s, v19.4s, v14.4s\n"
-                        "fmin v16.4s, v16.4s, v15.4s\n"
-                        "fmin v17.4s, v17.4s, v15.4s\n"
-                        "fmin v18.4s, v18.4s, v15.4s\n"
-                        "fmin v19.4s, v19.4s, v15.4s\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "fmax v20.4s, v20.4s, v14.4s\n"
-                        "fmax v21.4s, v21.4s, v14.4s\n"
-                        "fmax v22.4s, v22.4s, v14.4s\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "fmax v23.4s, v23.4s, v14.4s\n"
-                        "fmin v20.4s, v20.4s, v15.4s\n"
-                        "fmin v21.4s, v21.4s, v15.4s\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "fmin v22.4s, v22.4s, v15.4s\n"
-                        "fmin v23.4s, v23.4s, v15.4s\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
-                    );
-                    break;
-                case 3:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ldr q16, [%[biasptr]]\n"
-                        "ldr q17, [%[biasptr], #0x10]\n"
-                        "ldr q18, [%[biasptr], #0x20]\n"
-                        "ldr q19, [%[biasptr], #0x30]\n"
-                        "mov v20.16b, v16.16b\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mov v21.16b, v17.16b\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mov v22.16b, v18.16b\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mov v23.16b, v19.16b\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "mov v24.16b, v16.16b\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "mov v25.16b, v17.16b\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "mov v26.16b, v18.16b\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "mov v27.16b, v19.16b\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #-0x10]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v0.s[1]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "fmla v20.4s, v8.4s, v1.s[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "fmla v24.4s, v8.4s, v2.s[1]\n"
-                        "ldr q8, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v9.4s, v0.s[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        "fmla v21.4s, v9.4s, v1.s[1]\n"
-                        "fmla v25.4s, v9.4s, v2.s[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v10.4s, v0.s[1]\n"
-                        "fmla v22.4s, v10.4s, v1.s[1]\n"
-                        "fmla v26.4s, v10.4s, v2.s[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v11.4s, v0.s[1]\n"
-                        "fmla v23.4s, v11.4s, v1.s[1]\n"
-                        "fmla v27.4s, v11.4s, v2.s[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "fmla v24.4s, v8.4s, v2.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "fmla v25.4s, v9.4s, v2.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "fmla v26.4s, v10.4s, v2.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "fmla v27.4s, v11.4s, v2.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v16.4s, v8.4s, v0.s[3]\n"
-                        "fmla v20.4s, v8.4s, v1.s[3]\n"
-                        "fmla v24.4s, v8.4s, v2.s[3]\n"
-                        "ldr q8, [%[b_ptr0], #-0x40]\n"
-                        "fmla v17.4s, v9.4s, v0.s[3]\n"
-                        "fmla v21.4s, v9.4s, v1.s[3]\n"
-                        "fmla v25.4s, v9.4s, v2.s[3]\n"
-                        "ldr q9, [%[b_ptr0], #-0x30]\n"
-                        "fmla v18.4s, v10.4s, v0.s[3]\n"
-                        "fmla v22.4s, v10.4s, v1.s[3]\n"
-                        "fmla v26.4s, v10.4s, v2.s[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        "fmla v23.4s, v11.4s, v1.s[3]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        "fmla v27.4s, v11.4s, v2.s[3]\n"
-                        "ldr q11, [%[b_ptr0], #-0x10]\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "ldr q2, [a_ptr2, #-0x10]\n"
-                        "fmla v20.4s, v8.4s, v5.s[0]\n"
-                        "fmla v24.4s, v8.4s, v6.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "fmla v21.4s, v9.4s, v5.s[0]\n"
-                        "fmla v25.4s, v9.4s, v6.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "fmla v22.4s, v10.4s, v5.s[0]\n"
-                        "fmla v26.4s, v10.4s, v6.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "fmla v23.4s, v11.4s, v5.s[0]\n"
-                        "fmla v27.4s, v11.4s, v6.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v4.s[1]\n"
-                        "fmla v20.4s, v8.4s, v5.s[1]\n"
-                        "fmla v24.4s, v8.4s, v6.s[1]\n"
-                        "ldr q8, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v9.4s, v4.s[1]\n"
-                        "fmla v21.4s, v9.4s, v5.s[1]\n"
-                        "fmla v25.4s, v9.4s, v6.s[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v10.4s, v4.s[1]\n"
-                        "fmla v22.4s, v10.4s, v5.s[1]\n"
-                        "fmla v26.4s, v10.4s, v6.s[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v11.4s, v4.s[1]\n"
-                        "fmla v23.4s, v11.4s, v5.s[1]\n"
-                        "fmla v27.4s, v11.4s, v6.s[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v20.4s, v8.4s, v5.s[2]\n"
-                        "fmla v24.4s, v8.4s, v6.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "fmla v21.4s, v9.4s, v5.s[2]\n"
-                        "fmla v25.4s, v9.4s, v6.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "fmla v22.4s, v10.4s, v5.s[2]\n"
-                        "fmla v26.4s, v10.4s, v6.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "fmla v23.4s, v11.4s, v5.s[2]\n"
-                        "fmla v27.4s, v11.4s, v6.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v16.4s, v8.4s, v4.s[3]\n"
-                        "fmla v20.4s, v8.4s, v5.s[3]\n"
-                        "fmla v24.4s, v8.4s, v6.s[3]\n"
-                        "ldr q8, [%[b_ptr0], #-0x40]\n"
-                        "fmla v17.4s, v9.4s, v4.s[3]\n"
-                        "fmla v21.4s, v9.4s, v5.s[3]\n"
-                        "fmla v25.4s, v9.4s, v6.s[3]\n"
-                        "ldr q9, [%[b_ptr0], #-0x30]\n"
-                        "fmla v18.4s, v10.4s, v4.s[3]\n"
-                        "fmla v22.4s, v10.4s, v5.s[3]\n"
-                        "fmla v26.4s, v10.4s, v6.s[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[3]\n"
-                        "fmla v23.4s, v11.4s, v5.s[3]\n"
-                        "fmla v27.4s, v11.4s, v6.s[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q11, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "cbz %[regs], 4f\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v0.s[1]\n"
-                        "fmla v20.4s, v8.4s, v1.s[1]\n"
-                        "fmla v24.4s, v8.4s, v2.s[1]\n"
-                        "ldr q8, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v9.4s, v0.s[1]\n"
-                        "fmla v21.4s, v9.4s, v1.s[1]\n"
-                        "fmla v25.4s, v9.4s, v2.s[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v10.4s, v0.s[1]\n"
-                        "fmla v22.4s, v10.4s, v1.s[1]\n"
-                        "fmla v26.4s, v10.4s, v2.s[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v11.4s, v0.s[1]\n"
-                        "fmla v23.4s, v11.4s, v1.s[1]\n"
-                        "fmla v27.4s, v11.4s, v2.s[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "fmla v24.4s, v8.4s, v2.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "fmla v25.4s, v9.4s, v2.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "fmla v26.4s, v10.4s, v2.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "fmla v27.4s, v11.4s, v2.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v16.4s, v8.4s, v0.s[3]\n"
-                        "fmla v20.4s, v8.4s, v1.s[3]\n"
-                        "fmla v24.4s, v8.4s, v2.s[3]\n"
-                        "ldr q8, [%[b_ptr0], #-0x40]\n"
-                        "fmla v17.4s, v9.4s, v0.s[3]\n"
-                        "fmla v21.4s, v9.4s, v1.s[3]\n"
-                        "fmla v25.4s, v9.4s, v2.s[3]\n"
-                        "ldr q9, [%[b_ptr0], #-0x30]\n"
-                        "fmla v18.4s, v10.4s, v0.s[3]\n"
-                        "fmla v22.4s, v10.4s, v1.s[3]\n"
-                        "fmla v26.4s, v10.4s, v2.s[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[3]\n"
-                        "fmla v23.4s, v11.4s, v1.s[3]\n"
-                        "fmla v27.4s, v11.4s, v2.s[3]\n"
-                        "ldr q11, [%[b_ptr0], #-0x10]\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "fmla v20.4s, v8.4s, v5.s[0]\n"
-                        "fmla v24.4s, v8.4s, v6.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "fmla v21.4s, v9.4s, v5.s[0]\n"
-                        "fmla v25.4s, v9.4s, v6.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "fmla v22.4s, v10.4s, v5.s[0]\n"
-                        "fmla v26.4s, v10.4s, v6.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "fmla v23.4s, v11.4s, v5.s[0]\n"
-                        "fmla v27.4s, v11.4s, v6.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v4.s[1]\n"
-                        "fmla v20.4s, v8.4s, v5.s[1]\n"
-                        "fmla v24.4s, v8.4s, v6.s[1]\n"
-                        "ldr q8, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v9.4s, v4.s[1]\n"
-                        "fmla v21.4s, v9.4s, v5.s[1]\n"
-                        "fmla v25.4s, v9.4s, v6.s[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v10.4s, v4.s[1]\n"
-                        "fmla v22.4s, v10.4s, v5.s[1]\n"
-                        "fmla v26.4s, v10.4s, v6.s[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v11.4s, v4.s[1]\n"
-                        "fmla v23.4s, v11.4s, v5.s[1]\n"
-                        "fmla v27.4s, v11.4s, v6.s[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v20.4s, v8.4s, v5.s[2]\n"
-                        "fmla v24.4s, v8.4s, v6.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "fmla v21.4s, v9.4s, v5.s[2]\n"
-                        "fmla v25.4s, v9.4s, v6.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "fmla v22.4s, v10.4s, v5.s[2]\n"
-                        "fmla v26.4s, v10.4s, v6.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "fmla v23.4s, v11.4s, v5.s[2]\n"
-                        "fmla v27.4s, v11.4s, v6.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v16.4s, v8.4s, v4.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #-0x40\n"
-                        "fmla v20.4s, v8.4s, v5.s[3]\n"
-                        "fmla v24.4s, v8.4s, v6.s[3]\n"
-                        "fmla v17.4s, v9.4s, v4.s[3]\n"
-                        "fmla v21.4s, v9.4s, v5.s[3]\n"
-                        "fmla v25.4s, v9.4s, v6.s[3]\n"
-                        "fmla v18.4s, v10.4s, v4.s[3]\n"
-                        "fmla v22.4s, v10.4s, v5.s[3]\n"
-                        "fmla v26.4s, v10.4s, v6.s[3]\n"
-                        "fmla v19.4s, v11.4s, v4.s[3]\n"
-                        "fmla v23.4s, v11.4s, v5.s[3]\n"
-                        "fmla v27.4s, v11.4s, v6.s[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v0.s[1]\n"
-                        "fmla v20.4s, v8.4s, v1.s[1]\n"
-                        "fmla v24.4s, v8.4s, v2.s[1]\n"
-                        "ldr q8, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v9.4s, v0.s[1]\n"
-                        "fmla v21.4s, v9.4s, v1.s[1]\n"
-                        "fmla v25.4s, v9.4s, v2.s[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v10.4s, v0.s[1]\n"
-                        "fmla v22.4s, v10.4s, v1.s[1]\n"
-                        "fmla v26.4s, v10.4s, v2.s[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v11.4s, v0.s[1]\n"
-                        "fmla v23.4s, v11.4s, v1.s[1]\n"
-                        "fmla v27.4s, v11.4s, v2.s[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "fmla v24.4s, v8.4s, v2.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "fmla v25.4s, v9.4s, v2.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "fmla v26.4s, v10.4s, v2.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "fmla v27.4s, v11.4s, v2.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v16.4s, v8.4s, v0.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #-0x40\n"
-                        "fmla v20.4s, v8.4s, v1.s[3]\n"
-                        "fmla v24.4s, v8.4s, v2.s[3]\n"
-                        "fmla v17.4s, v9.4s, v0.s[3]\n"
-                        "fmla v21.4s, v9.4s, v1.s[3]\n"
-                        "fmla v25.4s, v9.4s, v2.s[3]\n"
-                        "fmla v18.4s, v10.4s, v0.s[3]\n"
-                        "fmla v22.4s, v10.4s, v1.s[3]\n"
-                        "fmla v26.4s, v10.4s, v2.s[3]\n"
-                        "fmla v19.4s, v11.4s, v0.s[3]\n"
-                        "fmla v23.4s, v11.4s, v1.s[3]\n"
-                        "fmla v27.4s, v11.4s, v2.s[3]\n"
-                        "5:\n"
-                        "cbz %[blocks], 6f\n"
-                        "7:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr s1, [a_ptr1]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr s2, [a_ptr2]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "add a_ptr2, a_ptr2, #0x4\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "b.ne 7b\n"
-                        "6:\n"
-                        "ld1r {v14.4s}, [%[minptr]]\n"
-                        "ld1r {v15.4s}, [%[maxptr]]\n"
-                        "fmax v16.4s, v16.4s, v14.4s\n"
-                        "fmax v17.4s, v17.4s, v14.4s\n"
-                        "fmax v18.4s, v18.4s, v14.4s\n"
-                        "fmax v19.4s, v19.4s, v14.4s\n"
-                        "fmin v16.4s, v16.4s, v15.4s\n"
-                        "fmin v17.4s, v17.4s, v15.4s\n"
-                        "fmin v18.4s, v18.4s, v15.4s\n"
-                        "fmin v19.4s, v19.4s, v15.4s\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "fmax v20.4s, v20.4s, v14.4s\n"
-                        "fmax v21.4s, v21.4s, v14.4s\n"
-                        "fmax v22.4s, v22.4s, v14.4s\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "fmax v23.4s, v23.4s, v14.4s\n"
-                        "fmin v20.4s, v20.4s, v15.4s\n"
-                        "fmin v21.4s, v21.4s, v15.4s\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "fmin v22.4s, v22.4s, v15.4s\n"
-                        "fmin v23.4s, v23.4s, v15.4s\n"
-                        "fmax v24.4s, v24.4s, v14.4s\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "fmax v25.4s, v25.4s, v14.4s\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "fmax v26.4s, v26.4s, v14.4s\n"
-                        "str q20, [c_ptr1]\n"
-                        "fmin v24.4s, v24.4s, v15.4s\n"
-                        "fmin v25.4s, v25.4s, v15.4s\n"
-                        "fmax v27.4s, v27.4s, v14.4s\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "fmin v26.4s, v26.4s, v15.4s\n"
-                        "fmin v27.4s, v27.4s, v15.4s\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "str q24, [c_ptr2]\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
-                    );
-                    break;
-                default:
-                case 4:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ldr q16, [%[biasptr]]\n"
-                        "ldr q17, [%[biasptr], #0x10]\n"
-                        "ldr q18, [%[biasptr], #0x20]\n"
-                        "ldr q19, [%[biasptr], #0x30]\n"
-                        "mov v20.16b, v16.16b\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mov v21.16b, v17.16b\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mov v22.16b, v18.16b\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mov v23.16b, v19.16b\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "mov v24.16b, v16.16b\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "mov v25.16b, v17.16b\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "mov v26.16b, v18.16b\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "mov v27.16b, v19.16b\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "mov v28.16b, v16.16b\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "mov v29.16b, v17.16b\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "mov v30.16b, v18.16b\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "mov v31.16b, v19.16b\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "ldr q28, [c_ptr3]\n"
-                        "ldr q29, [c_ptr3, #0x10]\n"
-                        "ldr q30, [c_ptr3, #0x20]\n"
-                        "ldr q31, [c_ptr3, #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #-0x10]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        "fmla v28.4s, v8.4s, v3.s[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr q7, [a_ptr3]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla v29.4s, v9.4s, v3.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla v30.4s, v10.4s, v3.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "fmla v31.4s, v11.4s, v3.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v0.s[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        "fmla v20.4s, v8.4s, v1.s[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
-                        "fmla v24.4s, v8.4s, v2.s[1]\n"
-                        "fmla v28.4s, v8.4s, v3.s[1]\n"
-                        "ldr q8, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v9.4s, v0.s[1]\n"
-                        "fmla v21.4s, v9.4s, v1.s[1]\n"
-                        "fmla v25.4s, v9.4s, v2.s[1]\n"
-                        "fmla v29.4s, v9.4s, v3.s[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v10.4s, v0.s[1]\n"
-                        "fmla v22.4s, v10.4s, v1.s[1]\n"
-                        "fmla v26.4s, v10.4s, v2.s[1]\n"
-                        "fmla v30.4s, v10.4s, v3.s[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v11.4s, v0.s[1]\n"
-                        "fmla v23.4s, v11.4s, v1.s[1]\n"
-                        "fmla v27.4s, v11.4s, v2.s[1]\n"
-                        "fmla v31.4s, v11.4s, v3.s[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "fmla v24.4s, v8.4s, v2.s[2]\n"
-                        "fmla v28.4s, v8.4s, v3.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "fmla v25.4s, v9.4s, v2.s[2]\n"
-                        "fmla v29.4s, v9.4s, v3.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "fmla v26.4s, v10.4s, v2.s[2]\n"
-                        "fmla v30.4s, v10.4s, v3.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "fmla v27.4s, v11.4s, v2.s[2]\n"
-                        "fmla v31.4s, v11.4s, v3.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v16.4s, v8.4s, v0.s[3]\n"
-                        "fmla v20.4s, v8.4s, v1.s[3]\n"
-                        "fmla v24.4s, v8.4s, v2.s[3]\n"
-                        "fmla v28.4s, v8.4s, v3.s[3]\n"
-                        "ldr q8, [%[b_ptr0], #-0x40]\n"
-                        "fmla v17.4s, v9.4s, v0.s[3]\n"
-                        "fmla v21.4s, v9.4s, v1.s[3]\n"
-                        "fmla v25.4s, v9.4s, v2.s[3]\n"
-                        "fmla v29.4s, v9.4s, v3.s[3]\n"
-                        "ldr q9, [%[b_ptr0], #-0x30]\n"
-                        "fmla v18.4s, v10.4s, v0.s[3]\n"
-                        "fmla v22.4s, v10.4s, v1.s[3]\n"
-                        "fmla v26.4s, v10.4s, v2.s[3]\n"
-                        "fmla v30.4s, v10.4s, v3.s[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        "fmla v23.4s, v11.4s, v1.s[3]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        "fmla v27.4s, v11.4s, v2.s[3]\n"
-                        "ldr q2, [a_ptr2, #-0x10]\n"
-                        "fmla v31.4s, v11.4s, v3.s[3]\n"
-                        "ldr q11, [%[b_ptr0], #-0x10]\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "ldr q3, [a_ptr3, #-0x10]\n"
-                        "fmla v20.4s, v8.4s, v5.s[0]\n"
-                        "fmla v24.4s, v8.4s, v6.s[0]\n"
-                        "fmla v28.4s, v8.4s, v7.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "fmla v21.4s, v9.4s, v5.s[0]\n"
-                        "fmla v25.4s, v9.4s, v6.s[0]\n"
-                        "fmla v29.4s, v9.4s, v7.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "fmla v22.4s, v10.4s, v5.s[0]\n"
-                        "fmla v26.4s, v10.4s, v6.s[0]\n"
-                        "fmla v30.4s, v10.4s, v7.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "fmla v23.4s, v11.4s, v5.s[0]\n"
-                        "fmla v27.4s, v11.4s, v6.s[0]\n"
-                        "fmla v31.4s, v11.4s, v7.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v4.s[1]\n"
-                        "fmla v20.4s, v8.4s, v5.s[1]\n"
-                        "fmla v24.4s, v8.4s, v6.s[1]\n"
-                        "fmla v28.4s, v8.4s, v7.s[1]\n"
-                        "ldr q8, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v9.4s, v4.s[1]\n"
-                        "fmla v21.4s, v9.4s, v5.s[1]\n"
-                        "fmla v25.4s, v9.4s, v6.s[1]\n"
-                        "fmla v29.4s, v9.4s, v7.s[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v10.4s, v4.s[1]\n"
-                        "fmla v22.4s, v10.4s, v5.s[1]\n"
-                        "fmla v26.4s, v10.4s, v6.s[1]\n"
-                        "fmla v30.4s, v10.4s, v7.s[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v11.4s, v4.s[1]\n"
-                        "fmla v23.4s, v11.4s, v5.s[1]\n"
-                        "fmla v27.4s, v11.4s, v6.s[1]\n"
-                        "fmla v31.4s, v11.4s, v7.s[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v20.4s, v8.4s, v5.s[2]\n"
-                        "fmla v24.4s, v8.4s, v6.s[2]\n"
-                        "fmla v28.4s, v8.4s, v7.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "fmla v21.4s, v9.4s, v5.s[2]\n"
-                        "fmla v25.4s, v9.4s, v6.s[2]\n"
-                        "fmla v29.4s, v9.4s, v7.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "fmla v22.4s, v10.4s, v5.s[2]\n"
-                        "fmla v26.4s, v10.4s, v6.s[2]\n"
-                        "fmla v30.4s, v10.4s, v7.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "fmla v23.4s, v11.4s, v5.s[2]\n"
-                        "fmla v27.4s, v11.4s, v6.s[2]\n"
-                        "fmla v31.4s, v11.4s, v7.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v16.4s, v8.4s, v4.s[3]\n"
-                        "fmla v20.4s, v8.4s, v5.s[3]\n"
-                        "fmla v24.4s, v8.4s, v6.s[3]\n"
-                        "fmla v28.4s, v8.4s, v7.s[3]\n"
-                        "ldr q8, [%[b_ptr0], #-0x40]\n"
-                        "fmla v17.4s, v9.4s, v4.s[3]\n"
-                        "fmla v21.4s, v9.4s, v5.s[3]\n"
-                        "fmla v25.4s, v9.4s, v6.s[3]\n"
-                        "fmla v29.4s, v9.4s, v7.s[3]\n"
-                        "ldr q9, [%[b_ptr0], #-0x30]\n"
-                        "fmla v18.4s, v10.4s, v4.s[3]\n"
-                        "fmla v22.4s, v10.4s, v5.s[3]\n"
-                        "fmla v26.4s, v10.4s, v6.s[3]\n"
-                        "fmla v30.4s, v10.4s, v7.s[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[3]\n"
-                        "fmla v23.4s, v11.4s, v5.s[3]\n"
-                        "fmla v27.4s, v11.4s, v6.s[3]\n"
-                        "fmla v31.4s, v11.4s, v7.s[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q11, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "prfm PSTL1KEEP, [c_ptr3]\n"
-                        "cbz %[regs], 4f\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        "fmla v28.4s, v8.4s, v3.s[0]\n"
-                        "ldr q7, [a_ptr3]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "fmla v29.4s, v9.4s, v3.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "fmla v30.4s, v10.4s, v3.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "fmla v31.4s, v11.4s, v3.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v0.s[1]\n"
-                        "fmla v20.4s, v8.4s, v1.s[1]\n"
-                        "fmla v24.4s, v8.4s, v2.s[1]\n"
-                        "fmla v28.4s, v8.4s, v3.s[1]\n"
-                        "ldr q8, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v9.4s, v0.s[1]\n"
-                        "fmla v21.4s, v9.4s, v1.s[1]\n"
-                        "fmla v25.4s, v9.4s, v2.s[1]\n"
-                        "fmla v29.4s, v9.4s, v3.s[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v10.4s, v0.s[1]\n"
-                        "fmla v22.4s, v10.4s, v1.s[1]\n"
-                        "fmla v26.4s, v10.4s, v2.s[1]\n"
-                        "fmla v30.4s, v10.4s, v3.s[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v11.4s, v0.s[1]\n"
-                        "fmla v23.4s, v11.4s, v1.s[1]\n"
-                        "fmla v27.4s, v11.4s, v2.s[1]\n"
-                        "fmla v31.4s, v11.4s, v3.s[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "fmla v24.4s, v8.4s, v2.s[2]\n"
-                        "fmla v28.4s, v8.4s, v3.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "fmla v25.4s, v9.4s, v2.s[2]\n"
-                        "fmla v29.4s, v9.4s, v3.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "fmla v26.4s, v10.4s, v2.s[2]\n"
-                        "fmla v30.4s, v10.4s, v3.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "fmla v27.4s, v11.4s, v2.s[2]\n"
-                        "fmla v31.4s, v11.4s, v3.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v16.4s, v8.4s, v0.s[3]\n"
-                        "fmla v20.4s, v8.4s, v1.s[3]\n"
-                        "fmla v24.4s, v8.4s, v2.s[3]\n"
-                        "fmla v28.4s, v8.4s, v3.s[3]\n"
-                        "ldr q8, [%[b_ptr0], #-0x40]\n"
-                        "fmla v17.4s, v9.4s, v0.s[3]\n"
-                        "fmla v21.4s, v9.4s, v1.s[3]\n"
-                        "fmla v25.4s, v9.4s, v2.s[3]\n"
-                        "fmla v29.4s, v9.4s, v3.s[3]\n"
-                        "ldr q9, [%[b_ptr0], #-0x30]\n"
-                        "fmla v18.4s, v10.4s, v0.s[3]\n"
-                        "fmla v22.4s, v10.4s, v1.s[3]\n"
-                        "fmla v26.4s, v10.4s, v2.s[3]\n"
-                        "fmla v30.4s, v10.4s, v3.s[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[3]\n"
-                        "fmla v23.4s, v11.4s, v1.s[3]\n"
-                        "fmla v27.4s, v11.4s, v2.s[3]\n"
-                        "fmla v31.4s, v11.4s, v3.s[3]\n"
-                        "ldr q11, [%[b_ptr0], #-0x10]\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "fmla v20.4s, v8.4s, v5.s[0]\n"
-                        "fmla v24.4s, v8.4s, v6.s[0]\n"
-                        "fmla v28.4s, v8.4s, v7.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "fmla v21.4s, v9.4s, v5.s[0]\n"
-                        "fmla v25.4s, v9.4s, v6.s[0]\n"
-                        "fmla v29.4s, v9.4s, v7.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "fmla v22.4s, v10.4s, v5.s[0]\n"
-                        "fmla v26.4s, v10.4s, v6.s[0]\n"
-                        "fmla v30.4s, v10.4s, v7.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "fmla v23.4s, v11.4s, v5.s[0]\n"
-                        "fmla v27.4s, v11.4s, v6.s[0]\n"
-                        "fmla v31.4s, v11.4s, v7.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v4.s[1]\n"
-                        "fmla v20.4s, v8.4s, v5.s[1]\n"
-                        "fmla v24.4s, v8.4s, v6.s[1]\n"
-                        "fmla v28.4s, v8.4s, v7.s[1]\n"
-                        "ldr q8, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v9.4s, v4.s[1]\n"
-                        "fmla v21.4s, v9.4s, v5.s[1]\n"
-                        "fmla v25.4s, v9.4s, v6.s[1]\n"
-                        "fmla v29.4s, v9.4s, v7.s[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v10.4s, v4.s[1]\n"
-                        "fmla v22.4s, v10.4s, v5.s[1]\n"
-                        "fmla v26.4s, v10.4s, v6.s[1]\n"
-                        "fmla v30.4s, v10.4s, v7.s[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v11.4s, v4.s[1]\n"
-                        "fmla v23.4s, v11.4s, v5.s[1]\n"
-                        "fmla v27.4s, v11.4s, v6.s[1]\n"
-                        "fmla v31.4s, v11.4s, v7.s[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v20.4s, v8.4s, v5.s[2]\n"
-                        "fmla v24.4s, v8.4s, v6.s[2]\n"
-                        "fmla v28.4s, v8.4s, v7.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "fmla v21.4s, v9.4s, v5.s[2]\n"
-                        "fmla v25.4s, v9.4s, v6.s[2]\n"
-                        "fmla v29.4s, v9.4s, v7.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "fmla v22.4s, v10.4s, v5.s[2]\n"
-                        "fmla v26.4s, v10.4s, v6.s[2]\n"
-                        "fmla v30.4s, v10.4s, v7.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "fmla v23.4s, v11.4s, v5.s[2]\n"
-                        "fmla v27.4s, v11.4s, v6.s[2]\n"
-                        "fmla v31.4s, v11.4s, v7.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v16.4s, v8.4s, v4.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #-0x40\n"
-                        "fmla v20.4s, v8.4s, v5.s[3]\n"
-                        "fmla v24.4s, v8.4s, v6.s[3]\n"
-                        "fmla v28.4s, v8.4s, v7.s[3]\n"
-                        "fmla v17.4s, v9.4s, v4.s[3]\n"
-                        "fmla v21.4s, v9.4s, v5.s[3]\n"
-                        "fmla v25.4s, v9.4s, v6.s[3]\n"
-                        "fmla v29.4s, v9.4s, v7.s[3]\n"
-                        "fmla v18.4s, v10.4s, v4.s[3]\n"
-                        "fmla v22.4s, v10.4s, v5.s[3]\n"
-                        "fmla v26.4s, v10.4s, v6.s[3]\n"
-                        "fmla v30.4s, v10.4s, v7.s[3]\n"
-                        "fmla v19.4s, v11.4s, v4.s[3]\n"
-                        "fmla v23.4s, v11.4s, v5.s[3]\n"
-                        "fmla v27.4s, v11.4s, v6.s[3]\n"
-                        "fmla v31.4s, v11.4s, v7.s[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "fmla v28.4s, v8.4s, v3.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "fmla v29.4s, v9.4s, v3.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "fmla v30.4s, v10.4s, v3.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "fmla v31.4s, v11.4s, v3.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v0.s[1]\n"
-                        "fmla v20.4s, v8.4s, v1.s[1]\n"
-                        "fmla v24.4s, v8.4s, v2.s[1]\n"
-                        "fmla v28.4s, v8.4s, v3.s[1]\n"
-                        "ldr q8, [%[b_ptr0], #0x40]\n"
-                        "fmla v17.4s, v9.4s, v0.s[1]\n"
-                        "fmla v21.4s, v9.4s, v1.s[1]\n"
-                        "fmla v25.4s, v9.4s, v2.s[1]\n"
-                        "fmla v29.4s, v9.4s, v3.s[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x50]\n"
-                        "fmla v18.4s, v10.4s, v0.s[1]\n"
-                        "fmla v22.4s, v10.4s, v1.s[1]\n"
-                        "fmla v26.4s, v10.4s, v2.s[1]\n"
-                        "fmla v30.4s, v10.4s, v3.s[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x60]\n"
-                        "fmla v19.4s, v11.4s, v0.s[1]\n"
-                        "fmla v23.4s, v11.4s, v1.s[1]\n"
-                        "fmla v27.4s, v11.4s, v2.s[1]\n"
-                        "fmla v31.4s, v11.4s, v3.s[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x70]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "fmla v24.4s, v8.4s, v2.s[2]\n"
-                        "fmla v28.4s, v8.4s, v3.s[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "fmla v25.4s, v9.4s, v2.s[2]\n"
-                        "fmla v29.4s, v9.4s, v3.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "fmla v26.4s, v10.4s, v2.s[2]\n"
-                        "fmla v30.4s, v10.4s, v3.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "fmla v27.4s, v11.4s, v2.s[2]\n"
-                        "fmla v31.4s, v11.4s, v3.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "fmla v16.4s, v8.4s, v0.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #-0x40\n"
-                        "fmla v20.4s, v8.4s, v1.s[3]\n"
-                        "fmla v24.4s, v8.4s, v2.s[3]\n"
-                        "fmla v28.4s, v8.4s, v3.s[3]\n"
-                        "fmla v17.4s, v9.4s, v0.s[3]\n"
-                        "fmla v21.4s, v9.4s, v1.s[3]\n"
-                        "fmla v25.4s, v9.4s, v2.s[3]\n"
-                        "fmla v29.4s, v9.4s, v3.s[3]\n"
-                        "fmla v18.4s, v10.4s, v0.s[3]\n"
-                        "fmla v22.4s, v10.4s, v1.s[3]\n"
-                        "fmla v26.4s, v10.4s, v2.s[3]\n"
-                        "fmla v30.4s, v10.4s, v3.s[3]\n"
-                        "fmla v19.4s, v11.4s, v0.s[3]\n"
-                        "fmla v23.4s, v11.4s, v1.s[3]\n"
-                        "fmla v27.4s, v11.4s, v2.s[3]\n"
-                        "fmla v31.4s, v11.4s, v3.s[3]\n"
-                        "5:\n"
-                        "cbz %[blocks], 6f\n"
-                        "7:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr s1, [a_ptr1]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr s2, [a_ptr2]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "add a_ptr2, a_ptr2, #0x4\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "ldr s3, [a_ptr3]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "add a_ptr3, a_ptr3, #0x4\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "fmla v28.4s, v8.4s, v3.s[0]\n"
-                        "fmla v29.4s, v9.4s, v3.s[0]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "fmla v30.4s, v10.4s, v3.s[0]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "fmla v31.4s, v11.4s, v3.s[0]\n"
-                        "b.ne 7b\n"
-                        "6:\n"
-                        "ld1r {v14.4s}, [%[minptr]]\n"
-                        "ld1r {v15.4s}, [%[maxptr]]\n"
-                        "fmax v16.4s, v16.4s, v14.4s\n"
-                        "fmax v17.4s, v17.4s, v14.4s\n"
-                        "fmax v18.4s, v18.4s, v14.4s\n"
-                        "fmax v19.4s, v19.4s, v14.4s\n"
-                        "fmin v16.4s, v16.4s, v15.4s\n"
-                        "fmin v17.4s, v17.4s, v15.4s\n"
-                        "fmin v18.4s, v18.4s, v15.4s\n"
-                        "fmin v19.4s, v19.4s, v15.4s\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "fmax v20.4s, v20.4s, v14.4s\n"
-                        "fmax v21.4s, v21.4s, v14.4s\n"
-                        "fmax v22.4s, v22.4s, v14.4s\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "fmax v23.4s, v23.4s, v14.4s\n"
-                        "fmin v20.4s, v20.4s, v15.4s\n"
-                        "fmin v21.4s, v21.4s, v15.4s\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "fmin v22.4s, v22.4s, v15.4s\n"
-                        "fmin v23.4s, v23.4s, v15.4s\n"
-                        "fmax v24.4s, v24.4s, v14.4s\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "fmax v25.4s, v25.4s, v14.4s\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "fmax v26.4s, v26.4s, v14.4s\n"
-                        "str q20, [c_ptr1]\n"
-                        "fmin v24.4s, v24.4s, v15.4s\n"
-                        "fmin v25.4s, v25.4s, v15.4s\n"
-                        "fmax v27.4s, v27.4s, v14.4s\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "fmin v26.4s, v26.4s, v15.4s\n"
-                        "fmax v28.4s, v28.4s, v14.4s\n"
-                        "fmax v29.4s, v29.4s, v14.4s\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "fmin v27.4s, v27.4s, v15.4s\n"
-                        "fmax v30.4s, v30.4s, v14.4s\n"
-                        "fmin v28.4s, v28.4s, v15.4s\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "fmin v29.4s, v29.4s, v15.4s\n"
-                        "fmax v31.4s, v31.4s, v14.4s\n"
-                        "fmin v30.4s, v30.4s, v15.4s\n"
-                        "str q24, [c_ptr2]\n"
-                        "fmin v31.4s, v31.4s, v15.4s\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        "str q28, [c_ptr3]\n"
-                        "str q29, [c_ptr3, #0x10]\n"
-                        "str q30, [c_ptr3, #0x20]\n"
-                        "str q31, [c_ptr3, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
-                    );
-                    break;
-            }
-            if (use_result_buffer) {
-                for(int cy=0; cy<std::min(M-y, 4); cy++) {
-                    for(unsigned int cx=0; cx<width; cx++) {
-                        c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
-                    }
-                }
-            }
-        }
-    }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp
deleted file mode 100644
index 7442d258ec..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp
+++ /dev/null
@@ -1,1934 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void a64_hybrid_fp32_mla_4x8(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
-    const int K_stride = K;
-    const long loops_count = ((K + 4) / 8) - 1;
-    K -= loops_count * 8;
-    const long regs_count = (K / 4) - 1;
-    K -= (regs_count + 1) * 4;
-    const long blocks_count = K / 1;
-    float nullbias[4];
-    if (!accumulate && !bias) {
-        memset(nullbias, 0, (4 * sizeof(float)));
-    }
-    float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
-    float maxval =   static_cast<float>(std::numeric_limits<float>::infinity());
-    const float * const minptr = &minval;
-    const float * const maxptr = &maxval;
-
-    switch(act.type)
-    {
-        default:
-        case Activation::Type::None:
-            break;
-        case Activation::Type::BoundedReLU:
-            maxval = static_cast<float>(act.param1);
-            /* fall through */
-        case Activation::Type::ReLU:
-            minval = 0.0f;
-            break;
-    }
-
-    int rows_to_compute;
-
-    for (int y=0; y<M; y+=rows_to_compute) {
-        const float * const a_ptr0_base = A + (y * lda);
-        const unsigned long ldab = lda * sizeof(float);
-
-        float *c_ptr0 = C + (y * ldc);
-
-        rows_to_compute = M-y;
-        if (rows_to_compute > 8) {
-            if (rows_to_compute % 8) {
-                rows_to_compute = 8 - 1;
-            } else {
-                rows_to_compute = 8;
-            }
-        }
-
-        for (int x0=0; x0<N; x0+=4ul) {
-            const long width = std::min((unsigned long)N-x0, 4ul);
-            long loops = loops_count;
-            long regs = regs_count;
-            long blocks = blocks_count;
-            const float *a_ptr0 = a_ptr0_base;
-            const float *b_ptr0 = B + (K_stride * x0);
-            const bool use_result_buffer = (width < 4);
-            float result_buffer[32];
-            const unsigned long ldcb = (use_result_buffer ? 4 : ldc) * sizeof(float);
-            float *c_ptr_real = c_ptr0;
-            if (use_result_buffer && accumulate) {
-                for(int cy=0; cy<std::min(M-y, 8); cy++) {
-                    for(unsigned int cx=0; cx<width; cx++) {
-                        result_buffer[cy * 4 + cx] = c_ptr_real[cy * ldc + cx];
-                    }
-                }
-            }
-            if (use_result_buffer) {
-                c_ptr0 = result_buffer;
-            }
-            const float *biasptr = bias ? bias+x0 : nullbias;
-
-            switch(rows_to_compute) {
-                case 1:
-                    __asm __volatile (
-                        "ldr q24, [%[biasptr]]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "ldr q17, [%[b_ptr0], #0x10]\n"
-                        "ldr q18, [%[b_ptr0], #0x20]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "ldr q19, [%[b_ptr0], #-0x10]\n"
-                        "ldr q8, [%[a_ptr0]]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "fmla v24.4s, v17.4s, v0.s[1]\n"
-                        "ldr q17, [%[b_ptr0], #0x10]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla v24.4s, v18.4s, v0.s[2]\n"
-                        "ldr q18, [%[b_ptr0], #0x20]\n"
-                        "fmla v24.4s, v19.4s, v0.s[3]\n"
-                        "ldr q19, [%[b_ptr0], #0x30]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        "fmla v24.4s, v16.4s, v8.s[0]\n"
-                        "ldr q16, [%[b_ptr0], #0x40]\n"
-                        "fmla v24.4s, v17.4s, v8.s[1]\n"
-                        "ldr q17, [%[b_ptr0], #0x50]\n"
-                        "fmla v24.4s, v18.4s, v8.s[2]\n"
-                        "ldr q18, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "fmla v24.4s, v19.4s, v8.s[3]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "ldr q19, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "cbz %[regs], 3f\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "ldr q8, [%[a_ptr0]]\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "fmla v24.4s, v17.4s, v0.s[1]\n"
-                        "ldr q17, [%[b_ptr0], #0x10]\n"
-                        "fmla v24.4s, v18.4s, v0.s[2]\n"
-                        "ldr q18, [%[b_ptr0], #0x20]\n"
-                        "fmla v24.4s, v19.4s, v0.s[3]\n"
-                        "ldr q19, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "fmla v24.4s, v16.4s, v8.s[0]\n"
-                        "fmla v24.4s, v17.4s, v8.s[1]\n"
-                        "fmla v24.4s, v18.4s, v8.s[2]\n"
-                        "fmla v24.4s, v19.4s, v8.s[3]\n"
-                        "b 4f\n"
-                        "3:\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "fmla v24.4s, v17.4s, v0.s[1]\n"
-                        "fmla v24.4s, v18.4s, v0.s[2]\n"
-                        "fmla v24.4s, v19.4s, v0.s[3]\n"
-                        "4:\n"
-                        "cbz %[blocks], 5f\n"
-                        "6:\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x10\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "b.ne 6b\n"
-                        "5:\n"
-                        "ld1r {v22.4s}, [%[minptr]]\n"
-                        "ld1r {v23.4s}, [%[maxptr]]\n"
-                        "fmax v24.4s, v24.4s, v22.4s\n"
-                        "fmin v24.4s, v24.4s, v23.4s\n"
-                        "str q24, [%[c_ptr0]]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
-                    );
-                    break;
-                case 2:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "ldr q24, [%[biasptr]]\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "mov v25.16b, v24.16b\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q17, [%[b_ptr0], #0x10]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q18, [%[b_ptr0], #0x20]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "ldr q19, [%[b_ptr0], #-0x10]\n"
-                        "fmla v25.4s, v16.4s, v1.s[0]\n"
-                        "ldr q8, [%[a_ptr0]]\n"
-                        "ldr q9, [a_ptr1]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla v24.4s, v17.4s, v0.s[1]\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "fmla v25.4s, v17.4s, v1.s[1]\n"
-                        "ldr q17, [%[b_ptr0], #0x10]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla v24.4s, v18.4s, v0.s[2]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla v25.4s, v18.4s, v1.s[2]\n"
-                        "ldr q18, [%[b_ptr0], #0x20]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "fmla v24.4s, v19.4s, v0.s[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        "fmla v25.4s, v19.4s, v1.s[3]\n"
-                        "ldr q19, [%[b_ptr0], #0x30]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        "fmla v24.4s, v16.4s, v8.s[0]\n"
-                        "fmla v25.4s, v16.4s, v9.s[0]\n"
-                        "ldr q16, [%[b_ptr0], #0x40]\n"
-                        "fmla v24.4s, v17.4s, v8.s[1]\n"
-                        "fmla v25.4s, v17.4s, v9.s[1]\n"
-                        "ldr q17, [%[b_ptr0], #0x50]\n"
-                        "fmla v24.4s, v18.4s, v8.s[2]\n"
-                        "fmla v25.4s, v18.4s, v9.s[2]\n"
-                        "ldr q18, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "fmla v24.4s, v19.4s, v8.s[3]\n"
-                        "fmla v25.4s, v19.4s, v9.s[3]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "ldr q19, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "cbz %[regs], 3f\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "ldr q8, [%[a_ptr0]]\n"
-                        "fmla v25.4s, v16.4s, v1.s[0]\n"
-                        "ldr q9, [a_ptr1]\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "fmla v24.4s, v17.4s, v0.s[1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "fmla v25.4s, v17.4s, v1.s[1]\n"
-                        "ldr q17, [%[b_ptr0], #0x10]\n"
-                        "fmla v24.4s, v18.4s, v0.s[2]\n"
-                        "fmla v25.4s, v18.4s, v1.s[2]\n"
-                        "ldr q18, [%[b_ptr0], #0x20]\n"
-                        "fmla v24.4s, v19.4s, v0.s[3]\n"
-                        "fmla v25.4s, v19.4s, v1.s[3]\n"
-                        "ldr q19, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "fmla v24.4s, v16.4s, v8.s[0]\n"
-                        "fmla v25.4s, v16.4s, v9.s[0]\n"
-                        "fmla v24.4s, v17.4s, v8.s[1]\n"
-                        "fmla v25.4s, v17.4s, v9.s[1]\n"
-                        "fmla v24.4s, v18.4s, v8.s[2]\n"
-                        "fmla v25.4s, v18.4s, v9.s[2]\n"
-                        "fmla v24.4s, v19.4s, v8.s[3]\n"
-                        "fmla v25.4s, v19.4s, v9.s[3]\n"
-                        "b 4f\n"
-                        "3:\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "fmla v25.4s, v16.4s, v1.s[0]\n"
-                        "fmla v24.4s, v17.4s, v0.s[1]\n"
-                        "fmla v25.4s, v17.4s, v1.s[1]\n"
-                        "fmla v24.4s, v18.4s, v0.s[2]\n"
-                        "fmla v25.4s, v18.4s, v1.s[2]\n"
-                        "fmla v24.4s, v19.4s, v0.s[3]\n"
-                        "fmla v25.4s, v19.4s, v1.s[3]\n"
-                        "4:\n"
-                        "cbz %[blocks], 5f\n"
-                        "6:\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x10\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr s1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "fmla v25.4s, v16.4s, v1.s[0]\n"
-                        "b.ne 6b\n"
-                        "5:\n"
-                        "ld1r {v22.4s}, [%[minptr]]\n"
-                        "ld1r {v23.4s}, [%[maxptr]]\n"
-                        "fmax v24.4s, v24.4s, v22.4s\n"
-                        "fmax v25.4s, v25.4s, v22.4s\n"
-                        "fmin v24.4s, v24.4s, v23.4s\n"
-                        "fmin v25.4s, v25.4s, v23.4s\n"
-                        "str q24, [%[c_ptr0]]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                        "str q25, [c_ptr1]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
-                    );
-                    break;
-                case 3:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "ldr q24, [%[biasptr]]\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "mov v25.16b, v24.16b\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mov v26.16b, v24.16b\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "ldr q17, [%[b_ptr0], #0x10]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "ldr q18, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "ldr q19, [%[b_ptr0], #-0x10]\n"
-                        "fmla v25.4s, v16.4s, v1.s[0]\n"
-                        "ldr q8, [%[a_ptr0]]\n"
-                        "fmla v26.4s, v16.4s, v2.s[0]\n"
-                        "ldr q9, [a_ptr1]\n"
-                        "ldr q10, [a_ptr2]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla v24.4s, v17.4s, v0.s[1]\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "fmla v25.4s, v17.4s, v1.s[1]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "fmla v26.4s, v17.4s, v2.s[1]\n"
-                        "ldr q17, [%[b_ptr0], #0x10]\n"
-                        "fmla v24.4s, v18.4s, v0.s[2]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla v25.4s, v18.4s, v1.s[2]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla v26.4s, v18.4s, v2.s[2]\n"
-                        "ldr q18, [%[b_ptr0], #0x20]\n"
-                        "fmla v24.4s, v19.4s, v0.s[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        "fmla v25.4s, v19.4s, v1.s[3]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "fmla v26.4s, v19.4s, v2.s[3]\n"
-                        "ldr q19, [%[b_ptr0], #0x30]\n"
-                        "fmla v24.4s, v16.4s, v8.s[0]\n"
-                        "ldr q2, [a_ptr2, #-0x10]\n"
-                        "fmla v25.4s, v16.4s, v9.s[0]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "fmla v26.4s, v16.4s, v10.s[0]\n"
-                        "ldr q16, [%[b_ptr0], #0x40]\n"
-                        "fmla v24.4s, v17.4s, v8.s[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        "fmla v25.4s, v17.4s, v9.s[1]\n"
-                        "fmla v26.4s, v17.4s, v10.s[1]\n"
-                        "ldr q17, [%[b_ptr0], #0x50]\n"
-                        "fmla v24.4s, v18.4s, v8.s[2]\n"
-                        "fmla v25.4s, v18.4s, v9.s[2]\n"
-                        "fmla v26.4s, v18.4s, v10.s[2]\n"
-                        "ldr q18, [%[b_ptr0], #0x60]\n"
-                        "fmla v24.4s, v19.4s, v8.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "fmla v25.4s, v19.4s, v9.s[3]\n"
-                        "fmla v26.4s, v19.4s, v10.s[3]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "ldr q19, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "cbz %[regs], 3f\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "ldr q8, [%[a_ptr0]]\n"
-                        "fmla v25.4s, v16.4s, v1.s[0]\n"
-                        "ldr q9, [a_ptr1]\n"
-                        "fmla v26.4s, v16.4s, v2.s[0]\n"
-                        "ldr q10, [a_ptr2]\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "fmla v24.4s, v17.4s, v0.s[1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "fmla v25.4s, v17.4s, v1.s[1]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "fmla v26.4s, v17.4s, v2.s[1]\n"
-                        "ldr q17, [%[b_ptr0], #0x10]\n"
-                        "fmla v24.4s, v18.4s, v0.s[2]\n"
-                        "fmla v25.4s, v18.4s, v1.s[2]\n"
-                        "fmla v26.4s, v18.4s, v2.s[2]\n"
-                        "ldr q18, [%[b_ptr0], #0x20]\n"
-                        "fmla v24.4s, v19.4s, v0.s[3]\n"
-                        "fmla v25.4s, v19.4s, v1.s[3]\n"
-                        "fmla v26.4s, v19.4s, v2.s[3]\n"
-                        "ldr q19, [%[b_ptr0], #0x30]\n"
-                        "fmla v24.4s, v16.4s, v8.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "fmla v25.4s, v16.4s, v9.s[0]\n"
-                        "fmla v26.4s, v16.4s, v10.s[0]\n"
-                        "fmla v24.4s, v17.4s, v8.s[1]\n"
-                        "fmla v25.4s, v17.4s, v9.s[1]\n"
-                        "fmla v26.4s, v17.4s, v10.s[1]\n"
-                        "fmla v24.4s, v18.4s, v8.s[2]\n"
-                        "fmla v25.4s, v18.4s, v9.s[2]\n"
-                        "fmla v26.4s, v18.4s, v10.s[2]\n"
-                        "fmla v24.4s, v19.4s, v8.s[3]\n"
-                        "fmla v25.4s, v19.4s, v9.s[3]\n"
-                        "fmla v26.4s, v19.4s, v10.s[3]\n"
-                        "b 4f\n"
-                        "3:\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "fmla v25.4s, v16.4s, v1.s[0]\n"
-                        "fmla v26.4s, v16.4s, v2.s[0]\n"
-                        "fmla v24.4s, v17.4s, v0.s[1]\n"
-                        "fmla v25.4s, v17.4s, v1.s[1]\n"
-                        "fmla v26.4s, v17.4s, v2.s[1]\n"
-                        "fmla v24.4s, v18.4s, v0.s[2]\n"
-                        "fmla v25.4s, v18.4s, v1.s[2]\n"
-                        "fmla v26.4s, v18.4s, v2.s[2]\n"
-                        "fmla v24.4s, v19.4s, v0.s[3]\n"
-                        "fmla v25.4s, v19.4s, v1.s[3]\n"
-                        "fmla v26.4s, v19.4s, v2.s[3]\n"
-                        "4:\n"
-                        "cbz %[blocks], 5f\n"
-                        "6:\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x10\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr s1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "ldr s2, [a_ptr2]\n"
-                        "fmla v25.4s, v16.4s, v1.s[0]\n"
-                        "add a_ptr2, a_ptr2, #0x4\n"
-                        "fmla v26.4s, v16.4s, v2.s[0]\n"
-                        "b.ne 6b\n"
-                        "5:\n"
-                        "ld1r {v22.4s}, [%[minptr]]\n"
-                        "ld1r {v23.4s}, [%[maxptr]]\n"
-                        "fmax v24.4s, v24.4s, v22.4s\n"
-                        "fmax v25.4s, v25.4s, v22.4s\n"
-                        "fmax v26.4s, v26.4s, v22.4s\n"
-                        "fmin v24.4s, v24.4s, v23.4s\n"
-                        "fmin v25.4s, v25.4s, v23.4s\n"
-                        "fmin v26.4s, v26.4s, v23.4s\n"
-                        "str q24, [%[c_ptr0]]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                        "str q25, [c_ptr1]\n"
-                        "str q26, [c_ptr2]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
-                    );
-                    break;
-                case 4:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "ldr q24, [%[biasptr]]\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "mov v25.16b, v24.16b\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mov v26.16b, v24.16b\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mov v27.16b, v24.16b\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "ldr q17, [%[b_ptr0], #0x10]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "ldr q18, [%[b_ptr0], #0x20]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "ldr q19, [%[b_ptr0], #-0x10]\n"
-                        "fmla v25.4s, v16.4s, v1.s[0]\n"
-                        "ldr q8, [%[a_ptr0]]\n"
-                        "fmla v26.4s, v16.4s, v2.s[0]\n"
-                        "ldr q9, [a_ptr1]\n"
-                        "fmla v27.4s, v16.4s, v3.s[0]\n"
-                        "ldr q10, [a_ptr2]\n"
-                        "fmla v24.4s, v17.4s, v0.s[1]\n"
-                        "ldr q11, [a_ptr3]\n"
-                        "fmla v25.4s, v17.4s, v1.s[1]\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "fmla v26.4s, v17.4s, v2.s[1]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla v27.4s, v17.4s, v3.s[1]\n"
-                        "ldr q17, [%[b_ptr0], #0x10]\n"
-                        "fmla v24.4s, v18.4s, v0.s[2]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "fmla v25.4s, v18.4s, v1.s[2]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla v26.4s, v18.4s, v2.s[2]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla v27.4s, v18.4s, v3.s[2]\n"
-                        "ldr q18, [%[b_ptr0], #0x20]\n"
-                        "fmla v24.4s, v19.4s, v0.s[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        "fmla v25.4s, v19.4s, v1.s[3]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        "fmla v26.4s, v19.4s, v2.s[3]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "fmla v27.4s, v19.4s, v3.s[3]\n"
-                        "ldr q19, [%[b_ptr0], #0x30]\n"
-                        "fmla v24.4s, v16.4s, v8.s[0]\n"
-                        "ldr q2, [a_ptr2, #-0x10]\n"
-                        "fmla v25.4s, v16.4s, v9.s[0]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        "fmla v26.4s, v16.4s, v10.s[0]\n"
-                        "ldr q3, [a_ptr3, #-0x10]\n"
-                        "fmla v27.4s, v16.4s, v11.s[0]\n"
-                        "ldr q16, [%[b_ptr0], #0x40]\n"
-                        "fmla v24.4s, v17.4s, v8.s[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "fmla v25.4s, v17.4s, v9.s[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        "fmla v26.4s, v17.4s, v10.s[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
-                        "fmla v27.4s, v17.4s, v11.s[1]\n"
-                        "ldr q17, [%[b_ptr0], #0x50]\n"
-                        "fmla v24.4s, v18.4s, v8.s[2]\n"
-                        "fmla v25.4s, v18.4s, v9.s[2]\n"
-                        "fmla v26.4s, v18.4s, v10.s[2]\n"
-                        "fmla v27.4s, v18.4s, v11.s[2]\n"
-                        "ldr q18, [%[b_ptr0], #0x60]\n"
-                        "fmla v24.4s, v19.4s, v8.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "fmla v25.4s, v19.4s, v9.s[3]\n"
-                        "fmla v26.4s, v19.4s, v10.s[3]\n"
-                        "fmla v27.4s, v19.4s, v11.s[3]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "ldr q19, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "prfm PSTL1KEEP, [c_ptr3]\n"
-                        "cbz %[regs], 3f\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "ldr q8, [%[a_ptr0]]\n"
-                        "fmla v25.4s, v16.4s, v1.s[0]\n"
-                        "ldr q9, [a_ptr1]\n"
-                        "fmla v26.4s, v16.4s, v2.s[0]\n"
-                        "ldr q10, [a_ptr2]\n"
-                        "fmla v27.4s, v16.4s, v3.s[0]\n"
-                        "ldr q11, [a_ptr3]\n"
-                        "fmla v24.4s, v17.4s, v0.s[1]\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "fmla v25.4s, v17.4s, v1.s[1]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "fmla v26.4s, v17.4s, v2.s[1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "fmla v27.4s, v17.4s, v3.s[1]\n"
-                        "ldr q17, [%[b_ptr0], #0x10]\n"
-                        "fmla v24.4s, v18.4s, v0.s[2]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "fmla v25.4s, v18.4s, v1.s[2]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "fmla v26.4s, v18.4s, v2.s[2]\n"
-                        "fmla v27.4s, v18.4s, v3.s[2]\n"
-                        "ldr q18, [%[b_ptr0], #0x20]\n"
-                        "fmla v24.4s, v19.4s, v0.s[3]\n"
-                        "fmla v25.4s, v19.4s, v1.s[3]\n"
-                        "fmla v26.4s, v19.4s, v2.s[3]\n"
-                        "fmla v27.4s, v19.4s, v3.s[3]\n"
-                        "ldr q19, [%[b_ptr0], #0x30]\n"
-                        "fmla v24.4s, v16.4s, v8.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "fmla v25.4s, v16.4s, v9.s[0]\n"
-                        "fmla v26.4s, v16.4s, v10.s[0]\n"
-                        "fmla v27.4s, v16.4s, v11.s[0]\n"
-                        "fmla v24.4s, v17.4s, v8.s[1]\n"
-                        "fmla v25.4s, v17.4s, v9.s[1]\n"
-                        "fmla v26.4s, v17.4s, v10.s[1]\n"
-                        "fmla v27.4s, v17.4s, v11.s[1]\n"
-                        "fmla v24.4s, v18.4s, v8.s[2]\n"
-                        "fmla v25.4s, v18.4s, v9.s[2]\n"
-                        "fmla v26.4s, v18.4s, v10.s[2]\n"
-                        "fmla v27.4s, v18.4s, v11.s[2]\n"
-                        "fmla v24.4s, v19.4s, v8.s[3]\n"
-                        "fmla v25.4s, v19.4s, v9.s[3]\n"
-                        "fmla v26.4s, v19.4s, v10.s[3]\n"
-                        "fmla v27.4s, v19.4s, v11.s[3]\n"
-                        "b 4f\n"
-                        "3:\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "fmla v25.4s, v16.4s, v1.s[0]\n"
-                        "fmla v26.4s, v16.4s, v2.s[0]\n"
-                        "fmla v27.4s, v16.4s, v3.s[0]\n"
-                        "fmla v24.4s, v17.4s, v0.s[1]\n"
-                        "fmla v25.4s, v17.4s, v1.s[1]\n"
-                        "fmla v26.4s, v17.4s, v2.s[1]\n"
-                        "fmla v27.4s, v17.4s, v3.s[1]\n"
-                        "fmla v24.4s, v18.4s, v0.s[2]\n"
-                        "fmla v25.4s, v18.4s, v1.s[2]\n"
-                        "fmla v26.4s, v18.4s, v2.s[2]\n"
-                        "fmla v27.4s, v18.4s, v3.s[2]\n"
-                        "fmla v24.4s, v19.4s, v0.s[3]\n"
-                        "fmla v25.4s, v19.4s, v1.s[3]\n"
-                        "fmla v26.4s, v19.4s, v2.s[3]\n"
-                        "fmla v27.4s, v19.4s, v3.s[3]\n"
-                        "4:\n"
-                        "cbz %[blocks], 5f\n"
-                        "6:\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x10\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr s1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "ldr s2, [a_ptr2]\n"
-                        "fmla v25.4s, v16.4s, v1.s[0]\n"
-                        "add a_ptr2, a_ptr2, #0x4\n"
-                        "ldr s3, [a_ptr3]\n"
-                        "fmla v26.4s, v16.4s, v2.s[0]\n"
-                        "add a_ptr3, a_ptr3, #0x4\n"
-                        "fmla v27.4s, v16.4s, v3.s[0]\n"
-                        "b.ne 6b\n"
-                        "5:\n"
-                        "ld1r {v22.4s}, [%[minptr]]\n"
-                        "ld1r {v23.4s}, [%[maxptr]]\n"
-                        "fmax v24.4s, v24.4s, v22.4s\n"
-                        "fmax v25.4s, v25.4s, v22.4s\n"
-                        "fmax v26.4s, v26.4s, v22.4s\n"
-                        "fmax v27.4s, v27.4s, v22.4s\n"
-                        "fmin v24.4s, v24.4s, v23.4s\n"
-                        "fmin v25.4s, v25.4s, v23.4s\n"
-                        "fmin v26.4s, v26.4s, v23.4s\n"
-                        "fmin v27.4s, v27.4s, v23.4s\n"
-                        "str q24, [%[c_ptr0]]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                        "str q25, [c_ptr1]\n"
-                        "str q26, [c_ptr2]\n"
-                        "str q27, [c_ptr3]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
-                    );
-                    break;
-                case 5:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "a_ptr4 .req X3\n"
-                        "c_ptr1 .req X4\n"
-                        "c_ptr2 .req X5\n"
-                        "c_ptr3 .req X6\n"
-                        "c_ptr4 .req X7\n"
-                        "ldr q24, [%[biasptr]]\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "mov v25.16b, v24.16b\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mov v26.16b, v24.16b\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mov v27.16b, v24.16b\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "mov v28.16b, v24.16b\n"
-                        "ldr q17, [%[b_ptr0], #0x10]\n"
-                        "ldr q18, [%[b_ptr0], #0x20]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "add a_ptr4, a_ptr3, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "ldr q4, [a_ptr4]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add c_ptr4, c_ptr3, %[ldc]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "add a_ptr4, a_ptr4, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "ldr q19, [%[b_ptr0], #-0x10]\n"
-                        "fmla v25.4s, v16.4s, v1.s[0]\n"
-                        "ldr q8, [%[a_ptr0]]\n"
-                        "fmla v26.4s, v16.4s, v2.s[0]\n"
-                        "ldr q9, [a_ptr1]\n"
-                        "fmla v27.4s, v16.4s, v3.s[0]\n"
-                        "ldr q10, [a_ptr2]\n"
-                        "fmla v28.4s, v16.4s, v4.s[0]\n"
-                        "ldr q11, [a_ptr3]\n"
-                        "fmla v24.4s, v17.4s, v0.s[1]\n"
-                        "ldr q12, [a_ptr4]\n"
-                        "fmla v25.4s, v17.4s, v1.s[1]\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "fmla v26.4s, v17.4s, v2.s[1]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla v27.4s, v17.4s, v3.s[1]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "fmla v28.4s, v17.4s, v4.s[1]\n"
-                        "ldr q17, [%[b_ptr0], #0x10]\n"
-                        "fmla v24.4s, v18.4s, v0.s[2]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla v25.4s, v18.4s, v1.s[2]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla v26.4s, v18.4s, v2.s[2]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "fmla v27.4s, v18.4s, v3.s[2]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        "fmla v28.4s, v18.4s, v4.s[2]\n"
-                        "ldr q18, [%[b_ptr0], #0x20]\n"
-                        "fmla v24.4s, v19.4s, v0.s[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        "fmla v25.4s, v19.4s, v1.s[3]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        "fmla v26.4s, v19.4s, v2.s[3]\n"
-                        "ldr q2, [a_ptr2, #-0x10]\n"
-                        "fmla v27.4s, v19.4s, v3.s[3]\n"
-                        "ldr q3, [a_ptr3, #-0x10]\n"
-                        "fmla v28.4s, v19.4s, v4.s[3]\n"
-                        "ldr q19, [%[b_ptr0], #0x30]\n"
-                        "fmla v24.4s, v16.4s, v8.s[0]\n"
-                        "add a_ptr4, a_ptr4, #0x20\n"
-                        "fmla v25.4s, v16.4s, v9.s[0]\n"
-                        "ldr q4, [a_ptr4, #-0x10]\n"
-                        "fmla v26.4s, v16.4s, v10.s[0]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "fmla v27.4s, v16.4s, v11.s[0]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        "fmla v28.4s, v16.4s, v12.s[0]\n"
-                        "ldr q16, [%[b_ptr0], #0x40]\n"
-                        "fmla v24.4s, v17.4s, v8.s[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
-                        "fmla v25.4s, v17.4s, v9.s[1]\n"
-                        "fmla v26.4s, v17.4s, v10.s[1]\n"
-                        "fmla v27.4s, v17.4s, v11.s[1]\n"
-                        "fmla v28.4s, v17.4s, v12.s[1]\n"
-                        "ldr q17, [%[b_ptr0], #0x50]\n"
-                        "fmla v24.4s, v18.4s, v8.s[2]\n"
-                        "fmla v25.4s, v18.4s, v9.s[2]\n"
-                        "fmla v26.4s, v18.4s, v10.s[2]\n"
-                        "fmla v27.4s, v18.4s, v11.s[2]\n"
-                        "fmla v28.4s, v18.4s, v12.s[2]\n"
-                        "ldr q18, [%[b_ptr0], #0x60]\n"
-                        "fmla v24.4s, v19.4s, v8.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "fmla v25.4s, v19.4s, v9.s[3]\n"
-                        "fmla v26.4s, v19.4s, v10.s[3]\n"
-                        "fmla v27.4s, v19.4s, v11.s[3]\n"
-                        "fmla v28.4s, v19.4s, v12.s[3]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "ldr q19, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "prfm PSTL1KEEP, [c_ptr3]\n"
-                        "prfm PSTL1KEEP, [c_ptr4]\n"
-                        "cbz %[regs], 3f\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "ldr q8, [%[a_ptr0]]\n"
-                        "fmla v25.4s, v16.4s, v1.s[0]\n"
-                        "ldr q9, [a_ptr1]\n"
-                        "fmla v26.4s, v16.4s, v2.s[0]\n"
-                        "ldr q10, [a_ptr2]\n"
-                        "fmla v27.4s, v16.4s, v3.s[0]\n"
-                        "ldr q11, [a_ptr3]\n"
-                        "fmla v28.4s, v16.4s, v4.s[0]\n"
-                        "ldr q12, [a_ptr4]\n"
-                        "fmla v24.4s, v17.4s, v0.s[1]\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "fmla v25.4s, v17.4s, v1.s[1]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "fmla v26.4s, v17.4s, v2.s[1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "fmla v27.4s, v17.4s, v3.s[1]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "fmla v28.4s, v17.4s, v4.s[1]\n"
-                        "ldr q17, [%[b_ptr0], #0x10]\n"
-                        "fmla v24.4s, v18.4s, v0.s[2]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "fmla v25.4s, v18.4s, v1.s[2]\n"
-                        "add a_ptr4, a_ptr4, #0x10\n"
-                        "fmla v26.4s, v18.4s, v2.s[2]\n"
-                        "fmla v27.4s, v18.4s, v3.s[2]\n"
-                        "fmla v28.4s, v18.4s, v4.s[2]\n"
-                        "ldr q18, [%[b_ptr0], #0x20]\n"
-                        "fmla v24.4s, v19.4s, v0.s[3]\n"
-                        "fmla v25.4s, v19.4s, v1.s[3]\n"
-                        "fmla v26.4s, v19.4s, v2.s[3]\n"
-                        "fmla v27.4s, v19.4s, v3.s[3]\n"
-                        "fmla v28.4s, v19.4s, v4.s[3]\n"
-                        "ldr q19, [%[b_ptr0], #0x30]\n"
-                        "fmla v24.4s, v16.4s, v8.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "fmla v25.4s, v16.4s, v9.s[0]\n"
-                        "fmla v26.4s, v16.4s, v10.s[0]\n"
-                        "fmla v27.4s, v16.4s, v11.s[0]\n"
-                        "fmla v28.4s, v16.4s, v12.s[0]\n"
-                        "fmla v24.4s, v17.4s, v8.s[1]\n"
-                        "fmla v25.4s, v17.4s, v9.s[1]\n"
-                        "fmla v26.4s, v17.4s, v10.s[1]\n"
-                        "fmla v27.4s, v17.4s, v11.s[1]\n"
-                        "fmla v28.4s, v17.4s, v12.s[1]\n"
-                        "fmla v24.4s, v18.4s, v8.s[2]\n"
-                        "fmla v25.4s, v18.4s, v9.s[2]\n"
-                        "fmla v26.4s, v18.4s, v10.s[2]\n"
-                        "fmla v27.4s, v18.4s, v11.s[2]\n"
-                        "fmla v28.4s, v18.4s, v12.s[2]\n"
-                        "fmla v24.4s, v19.4s, v8.s[3]\n"
-                        "fmla v25.4s, v19.4s, v9.s[3]\n"
-                        "fmla v26.4s, v19.4s, v10.s[3]\n"
-                        "fmla v27.4s, v19.4s, v11.s[3]\n"
-                        "fmla v28.4s, v19.4s, v12.s[3]\n"
-                        "b 4f\n"
-                        "3:\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "fmla v25.4s, v16.4s, v1.s[0]\n"
-                        "fmla v26.4s, v16.4s, v2.s[0]\n"
-                        "fmla v27.4s, v16.4s, v3.s[0]\n"
-                        "fmla v28.4s, v16.4s, v4.s[0]\n"
-                        "fmla v24.4s, v17.4s, v0.s[1]\n"
-                        "fmla v25.4s, v17.4s, v1.s[1]\n"
-                        "fmla v26.4s, v17.4s, v2.s[1]\n"
-                        "fmla v27.4s, v17.4s, v3.s[1]\n"
-                        "fmla v28.4s, v17.4s, v4.s[1]\n"
-                        "fmla v24.4s, v18.4s, v0.s[2]\n"
-                        "fmla v25.4s, v18.4s, v1.s[2]\n"
-                        "fmla v26.4s, v18.4s, v2.s[2]\n"
-                        "fmla v27.4s, v18.4s, v3.s[2]\n"
-                        "fmla v28.4s, v18.4s, v4.s[2]\n"
-                        "fmla v24.4s, v19.4s, v0.s[3]\n"
-                        "fmla v25.4s, v19.4s, v1.s[3]\n"
-                        "fmla v26.4s, v19.4s, v2.s[3]\n"
-                        "fmla v27.4s, v19.4s, v3.s[3]\n"
-                        "fmla v28.4s, v19.4s, v4.s[3]\n"
-                        "4:\n"
-                        "cbz %[blocks], 5f\n"
-                        "6:\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x10\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr s1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "ldr s2, [a_ptr2]\n"
-                        "fmla v25.4s, v16.4s, v1.s[0]\n"
-                        "add a_ptr2, a_ptr2, #0x4\n"
-                        "ldr s3, [a_ptr3]\n"
-                        "fmla v26.4s, v16.4s, v2.s[0]\n"
-                        "add a_ptr3, a_ptr3, #0x4\n"
-                        "ldr s4, [a_ptr4]\n"
-                        "fmla v27.4s, v16.4s, v3.s[0]\n"
-                        "add a_ptr4, a_ptr4, #0x4\n"
-                        "fmla v28.4s, v16.4s, v4.s[0]\n"
-                        "b.ne 6b\n"
-                        "5:\n"
-                        "ld1r {v22.4s}, [%[minptr]]\n"
-                        "ld1r {v23.4s}, [%[maxptr]]\n"
-                        "fmax v24.4s, v24.4s, v22.4s\n"
-                        "fmax v25.4s, v25.4s, v22.4s\n"
-                        "fmax v26.4s, v26.4s, v22.4s\n"
-                        "fmax v27.4s, v27.4s, v22.4s\n"
-                        "fmin v24.4s, v24.4s, v23.4s\n"
-                        "fmin v25.4s, v25.4s, v23.4s\n"
-                        "fmin v26.4s, v26.4s, v23.4s\n"
-                        "fmin v27.4s, v27.4s, v23.4s\n"
-                        "str q24, [%[c_ptr0]]\n"
-                        "fmax v28.4s, v28.4s, v22.4s\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                        "str q25, [c_ptr1]\n"
-                        "fmin v28.4s, v28.4s, v23.4s\n"
-                        "str q26, [c_ptr2]\n"
-                        "str q27, [c_ptr3]\n"
-                        "str q28, [c_ptr4]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq a_ptr4\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        ".unreq c_ptr4\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
-                    );
-                    break;
-                case 6:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "a_ptr4 .req X3\n"
-                        "a_ptr5 .req X4\n"
-                        "c_ptr1 .req X5\n"
-                        "c_ptr2 .req X6\n"
-                        "c_ptr3 .req X7\n"
-                        "c_ptr4 .req X8\n"
-                        "c_ptr5 .req X9\n"
-                        "ldr q24, [%[biasptr]]\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "mov v25.16b, v24.16b\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mov v26.16b, v24.16b\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mov v27.16b, v24.16b\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "mov v28.16b, v24.16b\n"
-                        "ldr q17, [%[b_ptr0], #0x10]\n"
-                        "mov v29.16b, v24.16b\n"
-                        "ldr q18, [%[b_ptr0], #0x20]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "add a_ptr4, a_ptr3, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "ldr q4, [a_ptr4]\n"
-                        "add a_ptr5, a_ptr4, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "ldr q5, [a_ptr5]\n"
-                        "add c_ptr4, c_ptr3, %[ldc]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add c_ptr5, c_ptr4, %[ldc]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "add a_ptr4, a_ptr4, #0x10\n"
-                        "add a_ptr5, a_ptr5, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "ldr q19, [%[b_ptr0], #-0x10]\n"
-                        "fmla v25.4s, v16.4s, v1.s[0]\n"
-                        "ldr q8, [%[a_ptr0]]\n"
-                        "fmla v26.4s, v16.4s, v2.s[0]\n"
-                        "ldr q9, [a_ptr1]\n"
-                        "fmla v27.4s, v16.4s, v3.s[0]\n"
-                        "ldr q10, [a_ptr2]\n"
-                        "fmla v28.4s, v16.4s, v4.s[0]\n"
-                        "ldr q11, [a_ptr3]\n"
-                        "fmla v29.4s, v16.4s, v5.s[0]\n"
-                        "ldr q12, [a_ptr4]\n"
-                        "fmla v24.4s, v17.4s, v0.s[1]\n"
-                        "ldr q13, [a_ptr5]\n"
-                        "fmla v25.4s, v17.4s, v1.s[1]\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "fmla v26.4s, v17.4s, v2.s[1]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla v27.4s, v17.4s, v3.s[1]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "fmla v28.4s, v17.4s, v4.s[1]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla v29.4s, v17.4s, v5.s[1]\n"
-                        "ldr q17, [%[b_ptr0], #0x10]\n"
-                        "fmla v24.4s, v18.4s, v0.s[2]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla v25.4s, v18.4s, v1.s[2]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "fmla v26.4s, v18.4s, v2.s[2]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        "fmla v27.4s, v18.4s, v3.s[2]\n"
-                        "add a_ptr4, a_ptr4, #0x20\n"
-                        "fmla v28.4s, v18.4s, v4.s[2]\n"
-                        "add a_ptr5, a_ptr5, #0x20\n"
-                        "fmla v29.4s, v18.4s, v5.s[2]\n"
-                        "ldr q18, [%[b_ptr0], #0x20]\n"
-                        "fmla v24.4s, v19.4s, v0.s[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        "fmla v25.4s, v19.4s, v1.s[3]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        "fmla v26.4s, v19.4s, v2.s[3]\n"
-                        "ldr q2, [a_ptr2, #-0x10]\n"
-                        "fmla v27.4s, v19.4s, v3.s[3]\n"
-                        "ldr q3, [a_ptr3, #-0x10]\n"
-                        "fmla v28.4s, v19.4s, v4.s[3]\n"
-                        "ldr q4, [a_ptr4, #-0x10]\n"
-                        "fmla v29.4s, v19.4s, v5.s[3]\n"
-                        "ldr q19, [%[b_ptr0], #0x30]\n"
-                        "fmla v24.4s, v16.4s, v8.s[0]\n"
-                        "ldr q5, [a_ptr5, #-0x10]\n"
-                        "fmla v25.4s, v16.4s, v9.s[0]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "fmla v26.4s, v16.4s, v10.s[0]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        "fmla v27.4s, v16.4s, v11.s[0]\n"
-                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
-                        "fmla v28.4s, v16.4s, v12.s[0]\n"
-                        "fmla v29.4s, v16.4s, v13.s[0]\n"
-                        "ldr q16, [%[b_ptr0], #0x40]\n"
-                        "fmla v24.4s, v17.4s, v8.s[1]\n"
-                        "fmla v25.4s, v17.4s, v9.s[1]\n"
-                        "fmla v26.4s, v17.4s, v10.s[1]\n"
-                        "fmla v27.4s, v17.4s, v11.s[1]\n"
-                        "fmla v28.4s, v17.4s, v12.s[1]\n"
-                        "fmla v29.4s, v17.4s, v13.s[1]\n"
-                        "ldr q17, [%[b_ptr0], #0x50]\n"
-                        "fmla v24.4s, v18.4s, v8.s[2]\n"
-                        "fmla v25.4s, v18.4s, v9.s[2]\n"
-                        "fmla v26.4s, v18.4s, v10.s[2]\n"
-                        "fmla v27.4s, v18.4s, v11.s[2]\n"
-                        "fmla v28.4s, v18.4s, v12.s[2]\n"
-                        "fmla v29.4s, v18.4s, v13.s[2]\n"
-                        "ldr q18, [%[b_ptr0], #0x60]\n"
-                        "fmla v24.4s, v19.4s, v8.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "fmla v25.4s, v19.4s, v9.s[3]\n"
-                        "fmla v26.4s, v19.4s, v10.s[3]\n"
-                        "fmla v27.4s, v19.4s, v11.s[3]\n"
-                        "fmla v28.4s, v19.4s, v12.s[3]\n"
-                        "fmla v29.4s, v19.4s, v13.s[3]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "ldr q19, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "prfm PSTL1KEEP, [c_ptr3]\n"
-                        "prfm PSTL1KEEP, [c_ptr4]\n"
-                        "prfm PSTL1KEEP, [c_ptr5]\n"
-                        "cbz %[regs], 3f\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "ldr q8, [%[a_ptr0]]\n"
-                        "fmla v25.4s, v16.4s, v1.s[0]\n"
-                        "ldr q9, [a_ptr1]\n"
-                        "fmla v26.4s, v16.4s, v2.s[0]\n"
-                        "ldr q10, [a_ptr2]\n"
-                        "fmla v27.4s, v16.4s, v3.s[0]\n"
-                        "ldr q11, [a_ptr3]\n"
-                        "fmla v28.4s, v16.4s, v4.s[0]\n"
-                        "ldr q12, [a_ptr4]\n"
-                        "fmla v29.4s, v16.4s, v5.s[0]\n"
-                        "ldr q13, [a_ptr5]\n"
-                        "fmla v24.4s, v17.4s, v0.s[1]\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "fmla v25.4s, v17.4s, v1.s[1]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "fmla v26.4s, v17.4s, v2.s[1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "fmla v27.4s, v17.4s, v3.s[1]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "fmla v28.4s, v17.4s, v4.s[1]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "fmla v29.4s, v17.4s, v5.s[1]\n"
-                        "ldr q17, [%[b_ptr0], #0x10]\n"
-                        "fmla v24.4s, v18.4s, v0.s[2]\n"
-                        "add a_ptr4, a_ptr4, #0x10\n"
-                        "fmla v25.4s, v18.4s, v1.s[2]\n"
-                        "add a_ptr5, a_ptr5, #0x10\n"
-                        "fmla v26.4s, v18.4s, v2.s[2]\n"
-                        "fmla v27.4s, v18.4s, v3.s[2]\n"
-                        "fmla v28.4s, v18.4s, v4.s[2]\n"
-                        "fmla v29.4s, v18.4s, v5.s[2]\n"
-                        "ldr q18, [%[b_ptr0], #0x20]\n"
-                        "fmla v24.4s, v19.4s, v0.s[3]\n"
-                        "fmla v25.4s, v19.4s, v1.s[3]\n"
-                        "fmla v26.4s, v19.4s, v2.s[3]\n"
-                        "fmla v27.4s, v19.4s, v3.s[3]\n"
-                        "fmla v28.4s, v19.4s, v4.s[3]\n"
-                        "fmla v29.4s, v19.4s, v5.s[3]\n"
-                        "ldr q19, [%[b_ptr0], #0x30]\n"
-                        "fmla v24.4s, v16.4s, v8.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "fmla v25.4s, v16.4s, v9.s[0]\n"
-                        "fmla v26.4s, v16.4s, v10.s[0]\n"
-                        "fmla v27.4s, v16.4s, v11.s[0]\n"
-                        "fmla v28.4s, v16.4s, v12.s[0]\n"
-                        "fmla v29.4s, v16.4s, v13.s[0]\n"
-                        "fmla v24.4s, v17.4s, v8.s[1]\n"
-                        "fmla v25.4s, v17.4s, v9.s[1]\n"
-                        "fmla v26.4s, v17.4s, v10.s[1]\n"
-                        "fmla v27.4s, v17.4s, v11.s[1]\n"
-                        "fmla v28.4s, v17.4s, v12.s[1]\n"
-                        "fmla v29.4s, v17.4s, v13.s[1]\n"
-                        "fmla v24.4s, v18.4s, v8.s[2]\n"
-                        "fmla v25.4s, v18.4s, v9.s[2]\n"
-                        "fmla v26.4s, v18.4s, v10.s[2]\n"
-                        "fmla v27.4s, v18.4s, v11.s[2]\n"
-                        "fmla v28.4s, v18.4s, v12.s[2]\n"
-                        "fmla v29.4s, v18.4s, v13.s[2]\n"
-                        "fmla v24.4s, v19.4s, v8.s[3]\n"
-                        "fmla v25.4s, v19.4s, v9.s[3]\n"
-                        "fmla v26.4s, v19.4s, v10.s[3]\n"
-                        "fmla v27.4s, v19.4s, v11.s[3]\n"
-                        "fmla v28.4s, v19.4s, v12.s[3]\n"
-                        "fmla v29.4s, v19.4s, v13.s[3]\n"
-                        "b 4f\n"
-                        "3:\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "fmla v25.4s, v16.4s, v1.s[0]\n"
-                        "fmla v26.4s, v16.4s, v2.s[0]\n"
-                        "fmla v27.4s, v16.4s, v3.s[0]\n"
-                        "fmla v28.4s, v16.4s, v4.s[0]\n"
-                        "fmla v29.4s, v16.4s, v5.s[0]\n"
-                        "fmla v24.4s, v17.4s, v0.s[1]\n"
-                        "fmla v25.4s, v17.4s, v1.s[1]\n"
-                        "fmla v26.4s, v17.4s, v2.s[1]\n"
-                        "fmla v27.4s, v17.4s, v3.s[1]\n"
-                        "fmla v28.4s, v17.4s, v4.s[1]\n"
-                        "fmla v29.4s, v17.4s, v5.s[1]\n"
-                        "fmla v24.4s, v18.4s, v0.s[2]\n"
-                        "fmla v25.4s, v18.4s, v1.s[2]\n"
-                        "fmla v26.4s, v18.4s, v2.s[2]\n"
-                        "fmla v27.4s, v18.4s, v3.s[2]\n"
-                        "fmla v28.4s, v18.4s, v4.s[2]\n"
-                        "fmla v29.4s, v18.4s, v5.s[2]\n"
-                        "fmla v24.4s, v19.4s, v0.s[3]\n"
-                        "fmla v25.4s, v19.4s, v1.s[3]\n"
-                        "fmla v26.4s, v19.4s, v2.s[3]\n"
-                        "fmla v27.4s, v19.4s, v3.s[3]\n"
-                        "fmla v28.4s, v19.4s, v4.s[3]\n"
-                        "fmla v29.4s, v19.4s, v5.s[3]\n"
-                        "4:\n"
-                        "cbz %[blocks], 5f\n"
-                        "6:\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x10\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr s1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "ldr s2, [a_ptr2]\n"
-                        "fmla v25.4s, v16.4s, v1.s[0]\n"
-                        "add a_ptr2, a_ptr2, #0x4\n"
-                        "ldr s3, [a_ptr3]\n"
-                        "fmla v26.4s, v16.4s, v2.s[0]\n"
-                        "add a_ptr3, a_ptr3, #0x4\n"
-                        "ldr s4, [a_ptr4]\n"
-                        "fmla v27.4s, v16.4s, v3.s[0]\n"
-                        "add a_ptr4, a_ptr4, #0x4\n"
-                        "ldr s5, [a_ptr5]\n"
-                        "fmla v28.4s, v16.4s, v4.s[0]\n"
-                        "add a_ptr5, a_ptr5, #0x4\n"
-                        "fmla v29.4s, v16.4s, v5.s[0]\n"
-                        "b.ne 6b\n"
-                        "5:\n"
-                        "ld1r {v22.4s}, [%[minptr]]\n"
-                        "ld1r {v23.4s}, [%[maxptr]]\n"
-                        "fmax v24.4s, v24.4s, v22.4s\n"
-                        "fmax v25.4s, v25.4s, v22.4s\n"
-                        "fmax v26.4s, v26.4s, v22.4s\n"
-                        "fmax v27.4s, v27.4s, v22.4s\n"
-                        "fmin v24.4s, v24.4s, v23.4s\n"
-                        "fmin v25.4s, v25.4s, v23.4s\n"
-                        "fmin v26.4s, v26.4s, v23.4s\n"
-                        "fmin v27.4s, v27.4s, v23.4s\n"
-                        "str q24, [%[c_ptr0]]\n"
-                        "fmax v28.4s, v28.4s, v22.4s\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                        "fmax v29.4s, v29.4s, v22.4s\n"
-                        "str q25, [c_ptr1]\n"
-                        "fmin v28.4s, v28.4s, v23.4s\n"
-                        "fmin v29.4s, v29.4s, v23.4s\n"
-                        "str q26, [c_ptr2]\n"
-                        "str q27, [c_ptr3]\n"
-                        "str q28, [c_ptr4]\n"
-                        "str q29, [c_ptr5]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq a_ptr4\n"
-                        ".unreq a_ptr5\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        ".unreq c_ptr4\n"
-                        ".unreq c_ptr5\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
-                    );
-                    break;
-                case 7:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "a_ptr4 .req X3\n"
-                        "a_ptr5 .req X4\n"
-                        "a_ptr6 .req X5\n"
-                        "c_ptr1 .req X6\n"
-                        "c_ptr2 .req X7\n"
-                        "c_ptr3 .req X8\n"
-                        "c_ptr4 .req X9\n"
-                        "c_ptr5 .req X10\n"
-                        "c_ptr6 .req X11\n"
-                        "ldr q24, [%[biasptr]]\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "mov v25.16b, v24.16b\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mov v26.16b, v24.16b\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mov v27.16b, v24.16b\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "mov v28.16b, v24.16b\n"
-                        "ldr q17, [%[b_ptr0], #0x10]\n"
-                        "mov v29.16b, v24.16b\n"
-                        "ldr q18, [%[b_ptr0], #0x20]\n"
-                        "mov v30.16b, v24.16b\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "add a_ptr4, a_ptr3, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "ldr q4, [a_ptr4]\n"
-                        "add a_ptr5, a_ptr4, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "ldr q5, [a_ptr5]\n"
-                        "add a_ptr6, a_ptr5, %[lda]\n"
-                        "add c_ptr4, c_ptr3, %[ldc]\n"
-                        "ldr q6, [a_ptr6]\n"
-                        "add c_ptr5, c_ptr4, %[ldc]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add c_ptr6, c_ptr5, %[ldc]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "add a_ptr4, a_ptr4, #0x10\n"
-                        "add a_ptr5, a_ptr5, #0x10\n"
-                        "add a_ptr6, a_ptr6, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "ldr q19, [%[b_ptr0], #-0x10]\n"
-                        "fmla v25.4s, v16.4s, v1.s[0]\n"
-                        "ldr q8, [%[a_ptr0]]\n"
-                        "fmla v26.4s, v16.4s, v2.s[0]\n"
-                        "ldr q9, [a_ptr1]\n"
-                        "fmla v27.4s, v16.4s, v3.s[0]\n"
-                        "ldr q10, [a_ptr2]\n"
-                        "fmla v28.4s, v16.4s, v4.s[0]\n"
-                        "ldr q11, [a_ptr3]\n"
-                        "fmla v29.4s, v16.4s, v5.s[0]\n"
-                        "ldr q12, [a_ptr4]\n"
-                        "fmla v30.4s, v16.4s, v6.s[0]\n"
-                        "ldr q13, [a_ptr5]\n"
-                        "fmla v24.4s, v17.4s, v0.s[1]\n"
-                        "ldr q14, [a_ptr6]\n"
-                        "fmla v25.4s, v17.4s, v1.s[1]\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "fmla v26.4s, v17.4s, v2.s[1]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla v27.4s, v17.4s, v3.s[1]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "fmla v28.4s, v17.4s, v4.s[1]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla v29.4s, v17.4s, v5.s[1]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla v30.4s, v17.4s, v6.s[1]\n"
-                        "ldr q17, [%[b_ptr0], #0x10]\n"
-                        "fmla v24.4s, v18.4s, v0.s[2]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "fmla v25.4s, v18.4s, v1.s[2]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        "fmla v26.4s, v18.4s, v2.s[2]\n"
-                        "add a_ptr4, a_ptr4, #0x20\n"
-                        "fmla v27.4s, v18.4s, v3.s[2]\n"
-                        "add a_ptr5, a_ptr5, #0x20\n"
-                        "fmla v28.4s, v18.4s, v4.s[2]\n"
-                        "add a_ptr6, a_ptr6, #0x20\n"
-                        "fmla v29.4s, v18.4s, v5.s[2]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "fmla v30.4s, v18.4s, v6.s[2]\n"
-                        "ldr q18, [%[b_ptr0], #0x20]\n"
-                        "fmla v24.4s, v19.4s, v0.s[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        "fmla v25.4s, v19.4s, v1.s[3]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        "fmla v26.4s, v19.4s, v2.s[3]\n"
-                        "ldr q2, [a_ptr2, #-0x10]\n"
-                        "fmla v27.4s, v19.4s, v3.s[3]\n"
-                        "ldr q3, [a_ptr3, #-0x10]\n"
-                        "fmla v28.4s, v19.4s, v4.s[3]\n"
-                        "ldr q4, [a_ptr4, #-0x10]\n"
-                        "fmla v29.4s, v19.4s, v5.s[3]\n"
-                        "ldr q5, [a_ptr5, #-0x10]\n"
-                        "fmla v30.4s, v19.4s, v6.s[3]\n"
-                        "ldr q19, [%[b_ptr0], #0x30]\n"
-                        "fmla v24.4s, v16.4s, v8.s[0]\n"
-                        "ldr q6, [a_ptr6, #-0x10]\n"
-                        "fmla v25.4s, v16.4s, v9.s[0]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        "fmla v26.4s, v16.4s, v10.s[0]\n"
-                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
-                        "fmla v27.4s, v16.4s, v11.s[0]\n"
-                        "fmla v28.4s, v16.4s, v12.s[0]\n"
-                        "fmla v29.4s, v16.4s, v13.s[0]\n"
-                        "fmla v30.4s, v16.4s, v14.s[0]\n"
-                        "ldr q16, [%[b_ptr0], #0x40]\n"
-                        "fmla v24.4s, v17.4s, v8.s[1]\n"
-                        "fmla v25.4s, v17.4s, v9.s[1]\n"
-                        "fmla v26.4s, v17.4s, v10.s[1]\n"
-                        "fmla v27.4s, v17.4s, v11.s[1]\n"
-                        "fmla v28.4s, v17.4s, v12.s[1]\n"
-                        "fmla v29.4s, v17.4s, v13.s[1]\n"
-                        "fmla v30.4s, v17.4s, v14.s[1]\n"
-                        "ldr q17, [%[b_ptr0], #0x50]\n"
-                        "fmla v24.4s, v18.4s, v8.s[2]\n"
-                        "fmla v25.4s, v18.4s, v9.s[2]\n"
-                        "fmla v26.4s, v18.4s, v10.s[2]\n"
-                        "fmla v27.4s, v18.4s, v11.s[2]\n"
-                        "fmla v28.4s, v18.4s, v12.s[2]\n"
-                        "fmla v29.4s, v18.4s, v13.s[2]\n"
-                        "fmla v30.4s, v18.4s, v14.s[2]\n"
-                        "ldr q18, [%[b_ptr0], #0x60]\n"
-                        "fmla v24.4s, v19.4s, v8.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "fmla v25.4s, v19.4s, v9.s[3]\n"
-                        "fmla v26.4s, v19.4s, v10.s[3]\n"
-                        "fmla v27.4s, v19.4s, v11.s[3]\n"
-                        "fmla v28.4s, v19.4s, v12.s[3]\n"
-                        "fmla v29.4s, v19.4s, v13.s[3]\n"
-                        "fmla v30.4s, v19.4s, v14.s[3]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "ldr q19, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "prfm PSTL1KEEP, [c_ptr3]\n"
-                        "prfm PSTL1KEEP, [c_ptr4]\n"
-                        "prfm PSTL1KEEP, [c_ptr5]\n"
-                        "prfm PSTL1KEEP, [c_ptr6]\n"
-                        "cbz %[regs], 3f\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "ldr q8, [%[a_ptr0]]\n"
-                        "fmla v25.4s, v16.4s, v1.s[0]\n"
-                        "ldr q9, [a_ptr1]\n"
-                        "fmla v26.4s, v16.4s, v2.s[0]\n"
-                        "ldr q10, [a_ptr2]\n"
-                        "fmla v27.4s, v16.4s, v3.s[0]\n"
-                        "ldr q11, [a_ptr3]\n"
-                        "fmla v28.4s, v16.4s, v4.s[0]\n"
-                        "ldr q12, [a_ptr4]\n"
-                        "fmla v29.4s, v16.4s, v5.s[0]\n"
-                        "ldr q13, [a_ptr5]\n"
-                        "fmla v30.4s, v16.4s, v6.s[0]\n"
-                        "ldr q14, [a_ptr6]\n"
-                        "fmla v24.4s, v17.4s, v0.s[1]\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "fmla v25.4s, v17.4s, v1.s[1]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "fmla v26.4s, v17.4s, v2.s[1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "fmla v27.4s, v17.4s, v3.s[1]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "fmla v28.4s, v17.4s, v4.s[1]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "fmla v29.4s, v17.4s, v5.s[1]\n"
-                        "add a_ptr4, a_ptr4, #0x10\n"
-                        "fmla v30.4s, v17.4s, v6.s[1]\n"
-                        "ldr q17, [%[b_ptr0], #0x10]\n"
-                        "fmla v24.4s, v18.4s, v0.s[2]\n"
-                        "add a_ptr5, a_ptr5, #0x10\n"
-                        "fmla v25.4s, v18.4s, v1.s[2]\n"
-                        "add a_ptr6, a_ptr6, #0x10\n"
-                        "fmla v26.4s, v18.4s, v2.s[2]\n"
-                        "fmla v27.4s, v18.4s, v3.s[2]\n"
-                        "fmla v28.4s, v18.4s, v4.s[2]\n"
-                        "fmla v29.4s, v18.4s, v5.s[2]\n"
-                        "fmla v30.4s, v18.4s, v6.s[2]\n"
-                        "ldr q18, [%[b_ptr0], #0x20]\n"
-                        "fmla v24.4s, v19.4s, v0.s[3]\n"
-                        "fmla v25.4s, v19.4s, v1.s[3]\n"
-                        "fmla v26.4s, v19.4s, v2.s[3]\n"
-                        "fmla v27.4s, v19.4s, v3.s[3]\n"
-                        "fmla v28.4s, v19.4s, v4.s[3]\n"
-                        "fmla v29.4s, v19.4s, v5.s[3]\n"
-                        "fmla v30.4s, v19.4s, v6.s[3]\n"
-                        "ldr q19, [%[b_ptr0], #0x30]\n"
-                        "fmla v24.4s, v16.4s, v8.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "fmla v25.4s, v16.4s, v9.s[0]\n"
-                        "fmla v26.4s, v16.4s, v10.s[0]\n"
-                        "fmla v27.4s, v16.4s, v11.s[0]\n"
-                        "fmla v28.4s, v16.4s, v12.s[0]\n"
-                        "fmla v29.4s, v16.4s, v13.s[0]\n"
-                        "fmla v30.4s, v16.4s, v14.s[0]\n"
-                        "fmla v24.4s, v17.4s, v8.s[1]\n"
-                        "fmla v25.4s, v17.4s, v9.s[1]\n"
-                        "fmla v26.4s, v17.4s, v10.s[1]\n"
-                        "fmla v27.4s, v17.4s, v11.s[1]\n"
-                        "fmla v28.4s, v17.4s, v12.s[1]\n"
-                        "fmla v29.4s, v17.4s, v13.s[1]\n"
-                        "fmla v30.4s, v17.4s, v14.s[1]\n"
-                        "fmla v24.4s, v18.4s, v8.s[2]\n"
-                        "fmla v25.4s, v18.4s, v9.s[2]\n"
-                        "fmla v26.4s, v18.4s, v10.s[2]\n"
-                        "fmla v27.4s, v18.4s, v11.s[2]\n"
-                        "fmla v28.4s, v18.4s, v12.s[2]\n"
-                        "fmla v29.4s, v18.4s, v13.s[2]\n"
-                        "fmla v30.4s, v18.4s, v14.s[2]\n"
-                        "fmla v24.4s, v19.4s, v8.s[3]\n"
-                        "fmla v25.4s, v19.4s, v9.s[3]\n"
-                        "fmla v26.4s, v19.4s, v10.s[3]\n"
-                        "fmla v27.4s, v19.4s, v11.s[3]\n"
-                        "fmla v28.4s, v19.4s, v12.s[3]\n"
-                        "fmla v29.4s, v19.4s, v13.s[3]\n"
-                        "fmla v30.4s, v19.4s, v14.s[3]\n"
-                        "b 4f\n"
-                        "3:\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "fmla v25.4s, v16.4s, v1.s[0]\n"
-                        "fmla v26.4s, v16.4s, v2.s[0]\n"
-                        "fmla v27.4s, v16.4s, v3.s[0]\n"
-                        "fmla v28.4s, v16.4s, v4.s[0]\n"
-                        "fmla v29.4s, v16.4s, v5.s[0]\n"
-                        "fmla v30.4s, v16.4s, v6.s[0]\n"
-                        "fmla v24.4s, v17.4s, v0.s[1]\n"
-                        "fmla v25.4s, v17.4s, v1.s[1]\n"
-                        "fmla v26.4s, v17.4s, v2.s[1]\n"
-                        "fmla v27.4s, v17.4s, v3.s[1]\n"
-                        "fmla v28.4s, v17.4s, v4.s[1]\n"
-                        "fmla v29.4s, v17.4s, v5.s[1]\n"
-                        "fmla v30.4s, v17.4s, v6.s[1]\n"
-                        "fmla v24.4s, v18.4s, v0.s[2]\n"
-                        "fmla v25.4s, v18.4s, v1.s[2]\n"
-                        "fmla v26.4s, v18.4s, v2.s[2]\n"
-                        "fmla v27.4s, v18.4s, v3.s[2]\n"
-                        "fmla v28.4s, v18.4s, v4.s[2]\n"
-                        "fmla v29.4s, v18.4s, v5.s[2]\n"
-                        "fmla v30.4s, v18.4s, v6.s[2]\n"
-                        "fmla v24.4s, v19.4s, v0.s[3]\n"
-                        "fmla v25.4s, v19.4s, v1.s[3]\n"
-                        "fmla v26.4s, v19.4s, v2.s[3]\n"
-                        "fmla v27.4s, v19.4s, v3.s[3]\n"
-                        "fmla v28.4s, v19.4s, v4.s[3]\n"
-                        "fmla v29.4s, v19.4s, v5.s[3]\n"
-                        "fmla v30.4s, v19.4s, v6.s[3]\n"
-                        "4:\n"
-                        "cbz %[blocks], 5f\n"
-                        "6:\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x10\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr s1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "ldr s2, [a_ptr2]\n"
-                        "fmla v25.4s, v16.4s, v1.s[0]\n"
-                        "add a_ptr2, a_ptr2, #0x4\n"
-                        "ldr s3, [a_ptr3]\n"
-                        "fmla v26.4s, v16.4s, v2.s[0]\n"
-                        "add a_ptr3, a_ptr3, #0x4\n"
-                        "ldr s4, [a_ptr4]\n"
-                        "fmla v27.4s, v16.4s, v3.s[0]\n"
-                        "add a_ptr4, a_ptr4, #0x4\n"
-                        "ldr s5, [a_ptr5]\n"
-                        "fmla v28.4s, v16.4s, v4.s[0]\n"
-                        "add a_ptr5, a_ptr5, #0x4\n"
-                        "ldr s6, [a_ptr6]\n"
-                        "fmla v29.4s, v16.4s, v5.s[0]\n"
-                        "add a_ptr6, a_ptr6, #0x4\n"
-                        "fmla v30.4s, v16.4s, v6.s[0]\n"
-                        "b.ne 6b\n"
-                        "5:\n"
-                        "ld1r {v22.4s}, [%[minptr]]\n"
-                        "ld1r {v23.4s}, [%[maxptr]]\n"
-                        "fmax v24.4s, v24.4s, v22.4s\n"
-                        "fmax v25.4s, v25.4s, v22.4s\n"
-                        "fmax v26.4s, v26.4s, v22.4s\n"
-                        "fmax v27.4s, v27.4s, v22.4s\n"
-                        "fmin v24.4s, v24.4s, v23.4s\n"
-                        "fmin v25.4s, v25.4s, v23.4s\n"
-                        "fmin v26.4s, v26.4s, v23.4s\n"
-                        "fmin v27.4s, v27.4s, v23.4s\n"
-                        "str q24, [%[c_ptr0]]\n"
-                        "fmax v28.4s, v28.4s, v22.4s\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                        "fmax v29.4s, v29.4s, v22.4s\n"
-                        "str q25, [c_ptr1]\n"
-                        "fmax v30.4s, v30.4s, v22.4s\n"
-                        "fmin v28.4s, v28.4s, v23.4s\n"
-                        "fmin v29.4s, v29.4s, v23.4s\n"
-                        "str q26, [c_ptr2]\n"
-                        "fmin v30.4s, v30.4s, v23.4s\n"
-                        "str q27, [c_ptr3]\n"
-                        "str q28, [c_ptr4]\n"
-                        "str q29, [c_ptr5]\n"
-                        "str q30, [c_ptr6]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq a_ptr4\n"
-                        ".unreq a_ptr5\n"
-                        ".unreq a_ptr6\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        ".unreq c_ptr4\n"
-                        ".unreq c_ptr5\n"
-                        ".unreq c_ptr6\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "cc", "memory"
-                    );
-                    break;
-                default:
-                case 8:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "a_ptr4 .req X3\n"
-                        "a_ptr5 .req X4\n"
-                        "a_ptr6 .req X5\n"
-                        "a_ptr7 .req X6\n"
-                        "c_ptr1 .req X7\n"
-                        "c_ptr2 .req X8\n"
-                        "c_ptr3 .req X9\n"
-                        "c_ptr4 .req X10\n"
-                        "c_ptr5 .req X11\n"
-                        "c_ptr6 .req X12\n"
-                        "c_ptr7 .req X13\n"
-                        "ldr q24, [%[biasptr]]\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "mov v25.16b, v24.16b\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mov v26.16b, v24.16b\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mov v27.16b, v24.16b\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "mov v28.16b, v24.16b\n"
-                        "ldr q17, [%[b_ptr0], #0x10]\n"
-                        "mov v29.16b, v24.16b\n"
-                        "ldr q18, [%[b_ptr0], #0x20]\n"
-                        "mov v30.16b, v24.16b\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "mov v31.16b, v24.16b\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "add a_ptr4, a_ptr3, %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "ldr q4, [a_ptr4]\n"
-                        "add a_ptr5, a_ptr4, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "ldr q5, [a_ptr5]\n"
-                        "add a_ptr6, a_ptr5, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "ldr q6, [a_ptr6]\n"
-                        "add a_ptr7, a_ptr6, %[lda]\n"
-                        "add c_ptr4, c_ptr3, %[ldc]\n"
-                        "ldr q7, [a_ptr7]\n"
-                        "add c_ptr5, c_ptr4, %[ldc]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add c_ptr6, c_ptr5, %[ldc]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add c_ptr7, c_ptr6, %[ldc]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "add a_ptr4, a_ptr4, #0x10\n"
-                        "add a_ptr5, a_ptr5, #0x10\n"
-                        "add a_ptr6, a_ptr6, #0x10\n"
-                        "add a_ptr7, a_ptr7, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "ldr q19, [%[b_ptr0], #-0x10]\n"
-                        "fmla v25.4s, v16.4s, v1.s[0]\n"
-                        "ldr q8, [%[a_ptr0]]\n"
-                        "fmla v26.4s, v16.4s, v2.s[0]\n"
-                        "ldr q9, [a_ptr1]\n"
-                        "fmla v27.4s, v16.4s, v3.s[0]\n"
-                        "ldr q10, [a_ptr2]\n"
-                        "fmla v28.4s, v16.4s, v4.s[0]\n"
-                        "ldr q11, [a_ptr3]\n"
-                        "fmla v29.4s, v16.4s, v5.s[0]\n"
-                        "ldr q12, [a_ptr4]\n"
-                        "fmla v30.4s, v16.4s, v6.s[0]\n"
-                        "ldr q13, [a_ptr5]\n"
-                        "fmla v31.4s, v16.4s, v7.s[0]\n"
-                        "ldr q14, [a_ptr6]\n"
-                        "fmla v24.4s, v17.4s, v0.s[1]\n"
-                        "ldr q15, [a_ptr7]\n"
-                        "fmla v25.4s, v17.4s, v1.s[1]\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "fmla v26.4s, v17.4s, v2.s[1]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla v27.4s, v17.4s, v3.s[1]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "fmla v28.4s, v17.4s, v4.s[1]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla v29.4s, v17.4s, v5.s[1]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla v30.4s, v17.4s, v6.s[1]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "fmla v31.4s, v17.4s, v7.s[1]\n"
-                        "ldr q17, [%[b_ptr0], #0x10]\n"
-                        "fmla v24.4s, v18.4s, v0.s[2]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        "fmla v25.4s, v18.4s, v1.s[2]\n"
-                        "add a_ptr4, a_ptr4, #0x20\n"
-                        "fmla v26.4s, v18.4s, v2.s[2]\n"
-                        "add a_ptr5, a_ptr5, #0x20\n"
-                        "fmla v27.4s, v18.4s, v3.s[2]\n"
-                        "add a_ptr6, a_ptr6, #0x20\n"
-                        "fmla v28.4s, v18.4s, v4.s[2]\n"
-                        "add a_ptr7, a_ptr7, #0x20\n"
-                        "fmla v29.4s, v18.4s, v5.s[2]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "fmla v30.4s, v18.4s, v6.s[2]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        "fmla v31.4s, v18.4s, v7.s[2]\n"
-                        "ldr q18, [%[b_ptr0], #0x20]\n"
-                        "fmla v24.4s, v19.4s, v0.s[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        "fmla v25.4s, v19.4s, v1.s[3]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        "fmla v26.4s, v19.4s, v2.s[3]\n"
-                        "ldr q2, [a_ptr2, #-0x10]\n"
-                        "fmla v27.4s, v19.4s, v3.s[3]\n"
-                        "ldr q3, [a_ptr3, #-0x10]\n"
-                        "fmla v28.4s, v19.4s, v4.s[3]\n"
-                        "ldr q4, [a_ptr4, #-0x10]\n"
-                        "fmla v29.4s, v19.4s, v5.s[3]\n"
-                        "ldr q5, [a_ptr5, #-0x10]\n"
-                        "fmla v30.4s, v19.4s, v6.s[3]\n"
-                        "ldr q6, [a_ptr6, #-0x10]\n"
-                        "fmla v31.4s, v19.4s, v7.s[3]\n"
-                        "ldr q19, [%[b_ptr0], #0x30]\n"
-                        "fmla v24.4s, v16.4s, v8.s[0]\n"
-                        "ldr q7, [a_ptr7, #-0x10]\n"
-                        "fmla v25.4s, v16.4s, v9.s[0]\n"
-                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
-                        "fmla v26.4s, v16.4s, v10.s[0]\n"
-                        "fmla v27.4s, v16.4s, v11.s[0]\n"
-                        "fmla v28.4s, v16.4s, v12.s[0]\n"
-                        "fmla v29.4s, v16.4s, v13.s[0]\n"
-                        "fmla v30.4s, v16.4s, v14.s[0]\n"
-                        "fmla v31.4s, v16.4s, v15.s[0]\n"
-                        "ldr q16, [%[b_ptr0], #0x40]\n"
-                        "fmla v24.4s, v17.4s, v8.s[1]\n"
-                        "fmla v25.4s, v17.4s, v9.s[1]\n"
-                        "fmla v26.4s, v17.4s, v10.s[1]\n"
-                        "fmla v27.4s, v17.4s, v11.s[1]\n"
-                        "fmla v28.4s, v17.4s, v12.s[1]\n"
-                        "fmla v29.4s, v17.4s, v13.s[1]\n"
-                        "fmla v30.4s, v17.4s, v14.s[1]\n"
-                        "fmla v31.4s, v17.4s, v15.s[1]\n"
-                        "ldr q17, [%[b_ptr0], #0x50]\n"
-                        "fmla v24.4s, v18.4s, v8.s[2]\n"
-                        "fmla v25.4s, v18.4s, v9.s[2]\n"
-                        "fmla v26.4s, v18.4s, v10.s[2]\n"
-                        "fmla v27.4s, v18.4s, v11.s[2]\n"
-                        "fmla v28.4s, v18.4s, v12.s[2]\n"
-                        "fmla v29.4s, v18.4s, v13.s[2]\n"
-                        "fmla v30.4s, v18.4s, v14.s[2]\n"
-                        "fmla v31.4s, v18.4s, v15.s[2]\n"
-                        "ldr q18, [%[b_ptr0], #0x60]\n"
-                        "fmla v24.4s, v19.4s, v8.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "fmla v25.4s, v19.4s, v9.s[3]\n"
-                        "fmla v26.4s, v19.4s, v10.s[3]\n"
-                        "fmla v27.4s, v19.4s, v11.s[3]\n"
-                        "fmla v28.4s, v19.4s, v12.s[3]\n"
-                        "fmla v29.4s, v19.4s, v13.s[3]\n"
-                        "fmla v30.4s, v19.4s, v14.s[3]\n"
-                        "fmla v31.4s, v19.4s, v15.s[3]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "ldr q19, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "prfm PSTL1KEEP, [c_ptr3]\n"
-                        "prfm PSTL1KEEP, [c_ptr4]\n"
-                        "prfm PSTL1KEEP, [c_ptr5]\n"
-                        "prfm PSTL1KEEP, [c_ptr6]\n"
-                        "prfm PSTL1KEEP, [c_ptr7]\n"
-                        "cbz %[regs], 3f\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "ldr q8, [%[a_ptr0]]\n"
-                        "fmla v25.4s, v16.4s, v1.s[0]\n"
-                        "ldr q9, [a_ptr1]\n"
-                        "fmla v26.4s, v16.4s, v2.s[0]\n"
-                        "ldr q10, [a_ptr2]\n"
-                        "fmla v27.4s, v16.4s, v3.s[0]\n"
-                        "ldr q11, [a_ptr3]\n"
-                        "fmla v28.4s, v16.4s, v4.s[0]\n"
-                        "ldr q12, [a_ptr4]\n"
-                        "fmla v29.4s, v16.4s, v5.s[0]\n"
-                        "ldr q13, [a_ptr5]\n"
-                        "fmla v30.4s, v16.4s, v6.s[0]\n"
-                        "ldr q14, [a_ptr6]\n"
-                        "fmla v31.4s, v16.4s, v7.s[0]\n"
-                        "ldr q15, [a_ptr7]\n"
-                        "fmla v24.4s, v17.4s, v0.s[1]\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "fmla v25.4s, v17.4s, v1.s[1]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "fmla v26.4s, v17.4s, v2.s[1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "fmla v27.4s, v17.4s, v3.s[1]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "fmla v28.4s, v17.4s, v4.s[1]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "fmla v29.4s, v17.4s, v5.s[1]\n"
-                        "add a_ptr4, a_ptr4, #0x10\n"
-                        "fmla v30.4s, v17.4s, v6.s[1]\n"
-                        "add a_ptr5, a_ptr5, #0x10\n"
-                        "fmla v31.4s, v17.4s, v7.s[1]\n"
-                        "ldr q17, [%[b_ptr0], #0x10]\n"
-                        "fmla v24.4s, v18.4s, v0.s[2]\n"
-                        "add a_ptr6, a_ptr6, #0x10\n"
-                        "fmla v25.4s, v18.4s, v1.s[2]\n"
-                        "add a_ptr7, a_ptr7, #0x10\n"
-                        "fmla v26.4s, v18.4s, v2.s[2]\n"
-                        "fmla v27.4s, v18.4s, v3.s[2]\n"
-                        "fmla v28.4s, v18.4s, v4.s[2]\n"
-                        "fmla v29.4s, v18.4s, v5.s[2]\n"
-                        "fmla v30.4s, v18.4s, v6.s[2]\n"
-                        "fmla v31.4s, v18.4s, v7.s[2]\n"
-                        "ldr q18, [%[b_ptr0], #0x20]\n"
-                        "fmla v24.4s, v19.4s, v0.s[3]\n"
-                        "fmla v25.4s, v19.4s, v1.s[3]\n"
-                        "fmla v26.4s, v19.4s, v2.s[3]\n"
-                        "fmla v27.4s, v19.4s, v3.s[3]\n"
-                        "fmla v28.4s, v19.4s, v4.s[3]\n"
-                        "fmla v29.4s, v19.4s, v5.s[3]\n"
-                        "fmla v30.4s, v19.4s, v6.s[3]\n"
-                        "fmla v31.4s, v19.4s, v7.s[3]\n"
-                        "ldr q19, [%[b_ptr0], #0x30]\n"
-                        "fmla v24.4s, v16.4s, v8.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        "fmla v25.4s, v16.4s, v9.s[0]\n"
-                        "fmla v26.4s, v16.4s, v10.s[0]\n"
-                        "fmla v27.4s, v16.4s, v11.s[0]\n"
-                        "fmla v28.4s, v16.4s, v12.s[0]\n"
-                        "fmla v29.4s, v16.4s, v13.s[0]\n"
-                        "fmla v30.4s, v16.4s, v14.s[0]\n"
-                        "fmla v31.4s, v16.4s, v15.s[0]\n"
-                        "fmla v24.4s, v17.4s, v8.s[1]\n"
-                        "fmla v25.4s, v17.4s, v9.s[1]\n"
-                        "fmla v26.4s, v17.4s, v10.s[1]\n"
-                        "fmla v27.4s, v17.4s, v11.s[1]\n"
-                        "fmla v28.4s, v17.4s, v12.s[1]\n"
-                        "fmla v29.4s, v17.4s, v13.s[1]\n"
-                        "fmla v30.4s, v17.4s, v14.s[1]\n"
-                        "fmla v31.4s, v17.4s, v15.s[1]\n"
-                        "fmla v24.4s, v18.4s, v8.s[2]\n"
-                        "fmla v25.4s, v18.4s, v9.s[2]\n"
-                        "fmla v26.4s, v18.4s, v10.s[2]\n"
-                        "fmla v27.4s, v18.4s, v11.s[2]\n"
-                        "fmla v28.4s, v18.4s, v12.s[2]\n"
-                        "fmla v29.4s, v18.4s, v13.s[2]\n"
-                        "fmla v30.4s, v18.4s, v14.s[2]\n"
-                        "fmla v31.4s, v18.4s, v15.s[2]\n"
-                        "fmla v24.4s, v19.4s, v8.s[3]\n"
-                        "fmla v25.4s, v19.4s, v9.s[3]\n"
-                        "fmla v26.4s, v19.4s, v10.s[3]\n"
-                        "fmla v27.4s, v19.4s, v11.s[3]\n"
-                        "fmla v28.4s, v19.4s, v12.s[3]\n"
-                        "fmla v29.4s, v19.4s, v13.s[3]\n"
-                        "fmla v30.4s, v19.4s, v14.s[3]\n"
-                        "fmla v31.4s, v19.4s, v15.s[3]\n"
-                        "b 4f\n"
-                        "3:\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "fmla v25.4s, v16.4s, v1.s[0]\n"
-                        "fmla v26.4s, v16.4s, v2.s[0]\n"
-                        "fmla v27.4s, v16.4s, v3.s[0]\n"
-                        "fmla v28.4s, v16.4s, v4.s[0]\n"
-                        "fmla v29.4s, v16.4s, v5.s[0]\n"
-                        "fmla v30.4s, v16.4s, v6.s[0]\n"
-                        "fmla v31.4s, v16.4s, v7.s[0]\n"
-                        "fmla v24.4s, v17.4s, v0.s[1]\n"
-                        "fmla v25.4s, v17.4s, v1.s[1]\n"
-                        "fmla v26.4s, v17.4s, v2.s[1]\n"
-                        "fmla v27.4s, v17.4s, v3.s[1]\n"
-                        "fmla v28.4s, v17.4s, v4.s[1]\n"
-                        "fmla v29.4s, v17.4s, v5.s[1]\n"
-                        "fmla v30.4s, v17.4s, v6.s[1]\n"
-                        "fmla v31.4s, v17.4s, v7.s[1]\n"
-                        "fmla v24.4s, v18.4s, v0.s[2]\n"
-                        "fmla v25.4s, v18.4s, v1.s[2]\n"
-                        "fmla v26.4s, v18.4s, v2.s[2]\n"
-                        "fmla v27.4s, v18.4s, v3.s[2]\n"
-                        "fmla v28.4s, v18.4s, v4.s[2]\n"
-                        "fmla v29.4s, v18.4s, v5.s[2]\n"
-                        "fmla v30.4s, v18.4s, v6.s[2]\n"
-                        "fmla v31.4s, v18.4s, v7.s[2]\n"
-                        "fmla v24.4s, v19.4s, v0.s[3]\n"
-                        "fmla v25.4s, v19.4s, v1.s[3]\n"
-                        "fmla v26.4s, v19.4s, v2.s[3]\n"
-                        "fmla v27.4s, v19.4s, v3.s[3]\n"
-                        "fmla v28.4s, v19.4s, v4.s[3]\n"
-                        "fmla v29.4s, v19.4s, v5.s[3]\n"
-                        "fmla v30.4s, v19.4s, v6.s[3]\n"
-                        "fmla v31.4s, v19.4s, v7.s[3]\n"
-                        "4:\n"
-                        "cbz %[blocks], 5f\n"
-                        "6:\n"
-                        "ldr q16, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x10\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr s1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        "fmla v24.4s, v16.4s, v0.s[0]\n"
-                        "ldr s2, [a_ptr2]\n"
-                        "fmla v25.4s, v16.4s, v1.s[0]\n"
-                        "add a_ptr2, a_ptr2, #0x4\n"
-                        "ldr s3, [a_ptr3]\n"
-                        "fmla v26.4s, v16.4s, v2.s[0]\n"
-                        "add a_ptr3, a_ptr3, #0x4\n"
-                        "ldr s4, [a_ptr4]\n"
-                        "fmla v27.4s, v16.4s, v3.s[0]\n"
-                        "add a_ptr4, a_ptr4, #0x4\n"
-                        "ldr s5, [a_ptr5]\n"
-                        "fmla v28.4s, v16.4s, v4.s[0]\n"
-                        "add a_ptr5, a_ptr5, #0x4\n"
-                        "ldr s6, [a_ptr6]\n"
-                        "fmla v29.4s, v16.4s, v5.s[0]\n"
-                        "add a_ptr6, a_ptr6, #0x4\n"
-                        "ldr s7, [a_ptr7]\n"
-                        "fmla v30.4s, v16.4s, v6.s[0]\n"
-                        "add a_ptr7, a_ptr7, #0x4\n"
-                        "fmla v31.4s, v16.4s, v7.s[0]\n"
-                        "b.ne 6b\n"
-                        "5:\n"
-                        "ld1r {v22.4s}, [%[minptr]]\n"
-                        "ld1r {v23.4s}, [%[maxptr]]\n"
-                        "fmax v24.4s, v24.4s, v22.4s\n"
-                        "fmax v25.4s, v25.4s, v22.4s\n"
-                        "fmax v26.4s, v26.4s, v22.4s\n"
-                        "fmax v27.4s, v27.4s, v22.4s\n"
-                        "fmin v24.4s, v24.4s, v23.4s\n"
-                        "fmin v25.4s, v25.4s, v23.4s\n"
-                        "fmin v26.4s, v26.4s, v23.4s\n"
-                        "fmin v27.4s, v27.4s, v23.4s\n"
-                        "str q24, [%[c_ptr0]]\n"
-                        "fmax v28.4s, v28.4s, v22.4s\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                        "fmax v29.4s, v29.4s, v22.4s\n"
-                        "str q25, [c_ptr1]\n"
-                        "fmax v30.4s, v30.4s, v22.4s\n"
-                        "fmin v28.4s, v28.4s, v23.4s\n"
-                        "fmax v31.4s, v31.4s, v22.4s\n"
-                        "str q26, [c_ptr2]\n"
-                        "fmin v29.4s, v29.4s, v23.4s\n"
-                        "fmin v30.4s, v30.4s, v23.4s\n"
-                        "fmin v31.4s, v31.4s, v23.4s\n"
-                        "str q27, [c_ptr3]\n"
-                        "str q28, [c_ptr4]\n"
-                        "str q29, [c_ptr5]\n"
-                        "str q30, [c_ptr6]\n"
-                        "str q31, [c_ptr7]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq a_ptr4\n"
-                        ".unreq a_ptr5\n"
-                        ".unreq a_ptr6\n"
-                        ".unreq a_ptr7\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        ".unreq c_ptr4\n"
-                        ".unreq c_ptr5\n"
-                        ".unreq c_ptr6\n"
-                        ".unreq c_ptr7\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc", "memory"
-                    );
-                    break;
-            }
-            if (use_result_buffer) {
-                for(int cy=0; cy<std::min(M-y, 8); cy++) {
-                    for(unsigned int cx=0; cx<width; cx++) {
-                        c_ptr_real[cy * ldc + cx] = result_buffer[cy * 4 + cx];
-                    }
-                }
-            }
-        }
-    }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp
index 4147ab60dc..e0c61e4113 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,44 +10,49 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
  */
 #pragma once
-
 #ifdef __aarch64__
 
 #include "../performance_parameters.hpp"
 #include "../std_transforms_fixed.hpp"
 
+#define ARGLIST  \
+   unsigned int, const unsigned int *, \
+   IndirectInputArg<float>, \
+   size_t, size_t, \
+   const float *, \
+   IndirectOutputArg<float>, \
+   const float *, Activation, bool
+
 namespace arm_gemm
 {
 
 // Actual kernel implementations
-void a64_hybrid_fp32_mla_16x4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
-void a64_hybrid_fp32_mla_16x4_a55(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
-void a64_hybrid_fp32_mla_16x4_x1(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+void a64_hybrid_fp32_mla_6x16( ARGLIST );
 
-class hybrid_fp32_mla_16x4
+class cls_a64_hybrid_fp32_mla_6x16
 {
 public:
     typedef float operand_type;
     typedef float result_type;
 
-    typedef void (*kern_type)(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+    typedef void (*kern_type)( ARGLIST );
 
     /* Kernel blocking parameters */
     static constexpr unsigned int out_height()
     {
-        return 4;
+        return 6;
     }
 
     static unsigned int out_width()
@@ -65,47 +70,33 @@ public:
         return true;
     }
 
-    static constexpr bool supports_bias()
-    {
-        return true;
-    }
-
-    static constexpr bool supports_activation()
-    {
-        return true;
-    }
-
     static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
         switch (ci->get_cpu_model()) {
             case CPUModel::A55r1:
-                return { 2.866 };
+                return { 2.00 };
 
             case CPUModel::A53:
-                return { 1.419 };
+                return { 1.43 };
 
             case CPUModel::A73:
-                return { 2.551 };
+                return { 2.56 };
 
             default:
-                return { 6.25 };
+                return { 6.26 };
         }
     }
 
-    StdTransformsFixed<operand_type, result_type, 4, 16, 1> transforms = {};
+    StdTransformsFixed<operand_type, result_type, 6, 16, 1> transforms = {};
 
     // Default to the generic kernel
-    kern_type kernel=a64_hybrid_fp32_mla_16x4;
+    kern_type kernel=a64_hybrid_fp32_mla_6x16;
 
-    hybrid_fp32_mla_16x4(const CPUInfo *ci)
+    cls_a64_hybrid_fp32_mla_6x16(const CPUInfo *)
     {
-        if (ci->get_cpu_model() == CPUModel::A55r1) {
-            kernel = a64_hybrid_fp32_mla_16x4_a55;
-        } else if (ci->get_cpu_model() == CPUModel::X1) {
-            kernel = a64_hybrid_fp32_mla_16x4_x1;
-        }
     }
 };
 
 } // namespace arm_gemm
 
+#undef ARGLIST
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp
new file mode 100644
index 0000000000..884e8986c8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp
@@ -0,0 +1,3430 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_hybrid_fp32_mla_6x16 (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+    size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg,
+    const float *bias, Activation act, bool accumulate
+)
+{
+    struct KernelArgs { // Scalar arguments the assembly loads via [args_ptr + offsetof_<field>]. NOTE(review): the operand list binding args_ptr is outside this excerpt - confirm it points at `ka`.
+        float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); // Upper clamp bound: stays +inf unless BoundedReLU overwrites it with act.param1.
+        float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); // Lower clamp bound: stays -inf unless ReLU/BoundedReLU sets it to 0.
+        unsigned int num_strings = {}; // Copy of the num_strings parameter; the asm string loop counts x12 up to this value.
+        const unsigned int *string_lengths = {}; // Copy of the string_lengths parameter; the asm reads one 32-bit length per string index.
+        size_t N = {}; // Copy of the N parameter; the asm column loop consumes it 16 columns per iteration.
+        const float *B_ptr = {}; // Copy of the B_ptr parameter; the asm streams the B operand from here (x15).
+        size_t output_offset = {}; // output_arg.indirect.offset (indirect) or output_arg.direct.stride (direct); the asm scales it by 4 (LSL #2).
+        size_t input_initial_col = {}; // A_arg.indirect.start_col; only assigned for indirect input, and the asm applies it to the first string only.
+        size_t input_offset = {}; // A_arg.indirect.start_row (indirect) or A_arg.direct.stride (direct).
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    if (accumulate) {
+        flags |= 0x1;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    switch(act.type) {
+        default:
+        case Activation::Type::None:
+            break;
+        case Activation::Type::BoundedReLU:
+            ka.maxval = static_cast<float>(act.param1);
+            /* fall through */
+        case Activation::Type::ReLU:
+            ka.minval = 0;
+            flags |= 0x2;
+            break;
+    }
+    __asm__ __volatile__(
+
+      "1:"  // Row loop
+      "cmp %x[M], #0x6\n"
+      "bge 171f\n"
+      "cmp %x[M], #0x4\n"
+      "bgt 137f\n"
+      "beq 103f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 69f\n"
+      "beq 35f\n"
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 2f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "b 3f\n"
+      "2:"  // Height 1: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "3:"  // Height 1: Column loop
+      "cbz x14, 4f\n"
+      "ldr q8, [x14, #0x0]\n"
+      "ldr q9, [x14, #0x10]\n"
+      "ldr q10, [x14, #0x20]\n"
+      "ldr q11, [x14, #0x30]\n"
+      "add x14, x14, #0x40\n"
+      "b 15f\n"
+      "4:"  // Height 1: no bias
+      "tbz %x[flags], #0, 14f\n"
+      "cmp x16, #0x10\n"
+      "bge 13f\n"
+      "tbz x16, #3, 8f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v9.4s }, [x13], #0x10\n"
+      "tbz x16, #2, 6f\n"
+      "ld1 { v10.4s }, [x13], #0x10\n"
+      "tbz x16, #1, 5f\n"
+      "mov x19, #0x38\n"
+      "ldr d11, [x13], #0x8\n"
+      "tbz x16, #0, 12f\n"
+      "ld1 { v11.s }[2], [x13]\n"
+      "b 12f\n"
+      "5:"  // Height 1: Partial accumulate: partial_1_12
+      "mov x19, #0x30\n"
+      "tbz x16, #0, 12f\n"
+      "ldr s11, [x13, #0x0]\n"
+      "b 12f\n"
+      "6:"  // Height 1: Partial accumulate: partial_2_8
+      "tbz x16, #1, 7f\n"
+      "ldr d10, [x13], #0x8\n"
+      "mov x19, #0x28\n"
+      "tbz x16, #0, 12f\n"
+      "ld1 { v10.s }[2], [x13]\n"
+      "b 12f\n"
+      "7:"  // Height 1: Partial accumulate: partial_1_8
+      "mov x19, #0x20\n"
+      "tbz x16, #0, 12f\n"
+      "ldr s10, [x13, #0x0]\n"
+      "b 12f\n"
+      "8:"  // Height 1: Partial accumulate: partial_4_0
+      "tbz x16, #2, 10f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "tbz x16, #1, 9f\n"
+      "mov x19, #0x18\n"
+      "ldr d9, [x13], #0x8\n"
+      "tbz x16, #0, 12f\n"
+      "ld1 { v9.s }[2], [x13]\n"
+      "b 12f\n"
+      "9:"  // Height 1: Partial accumulate: partial_1_4
+      "mov x19, #0x10\n"
+      "tbz x16, #0, 12f\n"
+      "ldr s9, [x13, #0x0]\n"
+      "b 12f\n"
+      "10:"  // Height 1: Partial accumulate: partial_2_0
+      "tbz x16, #1, 11f\n"
+      "ldr d8, [x13], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x16, #0, 12f\n"
+      "ld1 { v8.s }[2], [x13]\n"
+      "b 12f\n"
+      "11:"  // Height 1: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s8, [x13, #0x0]\n"
+      "12:"  // Height 1: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "b 15f\n"
+      "13:"  // Height 1: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "b 15f\n"
+      "14:"  // Height 1: no accumulate
+      "movi v8.16b, #0x0\n"
+      "movi v9.16b, #0x0\n"
+      "movi v10.16b, #0x0\n"
+      "movi v11.16b, #0x0\n"
+      "15:"  // Height 1: setup done
+      "mov x12, #0x0\n"
+      "16:"  // Height 1: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 17f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "cbnz x12, 18f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #2\n"
+      "b 18f\n"
+      "17:"  // Height 1: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "18:"  // Height 1: input setup done
+      "cmp x11, #0x4\n"
+      "blt 21f\n"
+      "cmp x11, #0x8\n"
+      "blt 20f\n"
+      "19:"  // Height 1: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.4s, v6.4s, v0.s[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.4s, v7.4s, v0.s[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.4s, v6.4s, v0.s[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      "add x10, x10, #0x10\n"
+      "fmla v11.4s, v7.4s, v0.s[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "ldr q7, [x15, #0x50]\n"
+      "fmla v8.4s, v6.4s, v0.s[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      "sub x11, x11, #0x4\n"
+      "fmla v9.4s, v7.4s, v0.s[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      "cmp x11, #0x8\n"
+      "fmla v10.4s, v6.4s, v0.s[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      "fmla v11.4s, v7.4s, v0.s[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      "fmla v8.4s, v6.4s, v0.s[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      "fmla v9.4s, v7.4s, v0.s[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      "fmla v10.4s, v6.4s, v0.s[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      "fmla v11.4s, v7.4s, v0.s[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      "fmla v8.4s, v6.4s, v0.s[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      "fmla v9.4s, v7.4s, v0.s[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "add x15, x15, #0x100\n"
+      "fmla v10.4s, v6.4s, v0.s[3]\n"
+      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "bge 19b\n"
+      "20:"  // Height 1: Multiply loop: Single iteration only
+      "sub x11, x11, #0x4\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.4s, v6.4s, v0.s[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.4s, v7.4s, v0.s[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.4s, v6.4s, v0.s[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      "add x10, x10, #0x10\n"
+      "fmla v11.4s, v7.4s, v0.s[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "ldr q7, [x15, #0x50]\n"
+      "fmla v8.4s, v6.4s, v0.s[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      "fmla v9.4s, v7.4s, v0.s[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      "fmla v10.4s, v6.4s, v0.s[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      "fmla v11.4s, v7.4s, v0.s[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      "fmla v8.4s, v6.4s, v0.s[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      "fmla v9.4s, v7.4s, v0.s[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      "fmla v10.4s, v6.4s, v0.s[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      "fmla v11.4s, v7.4s, v0.s[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      "fmla v8.4s, v6.4s, v0.s[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      "fmla v9.4s, v7.4s, v0.s[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "add x15, x15, #0x100\n"
+      "fmla v10.4s, v6.4s, v0.s[3]\n"
+      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "21:"  // Height 1: Multiply loop: Main loop skip
+      "cbz x11, 23f\n"
+      "22:"  // Height 1: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.4s, v6.4s, v0.s[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.4s, v7.4s, v0.s[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.4s, v6.4s, v0.s[0]\n"
+      "sub x11, x11, #0x1\n"
+      "add x15, x15, #0x40\n"
+      "fmla v11.4s, v7.4s, v0.s[0]\n"
+      "cbnz x11, 22b\n"
+      "23:"  // Height 1: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 16b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "tbz %x[flags], #1, 24f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v1.4s }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v0.4s }, [x19]\n"
+      "fmin v8.4s, v8.4s, v0.4s\n"
+      "fmin v9.4s, v9.4s, v0.4s\n"
+      "fmin v10.4s, v10.4s, v0.4s\n"
+      "fmin v11.4s, v11.4s, v0.4s\n"
+      "fmax v8.4s, v8.4s, v1.4s\n"
+      "fmax v9.4s, v9.4s, v1.4s\n"
+      "fmax v10.4s, v10.4s, v1.4s\n"
+      "fmax v11.4s, v11.4s, v1.4s\n"
+      "24:"  // Height 1: No activation
+      "cmp x16, #0x10\n"
+      "bge 33f\n"
+      "tbz x16, #3, 28f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v9.4s }, [x13], #0x10\n"
+      "tbz x16, #2, 26f\n"
+      "st1 { v10.4s }, [x13], #0x10\n"
+      "tbz x16, #1, 25f\n"
+      "str d11, [x13], #0x8\n"
+      "tbz x16, #0, 32f\n"
+      "st1 { v11.s }[2], [x13]\n"
+      "b 32f\n"
+      "25:"  // Height 1: Partial direct writeback: partial_1_12
+      "tbz x16, #0, 32f\n"
+      "str s11, [x13, #0x0]\n"
+      "b 32f\n"
+      "26:"  // Height 1: Partial direct writeback: partial_2_8
+      "tbz x16, #1, 27f\n"
+      "str d10, [x13], #0x8\n"
+      "tbz x16, #0, 32f\n"
+      "st1 { v10.s }[2], [x13]\n"
+      "b 32f\n"
+      "27:"  // Height 1: Partial direct writeback: partial_1_8
+      "tbz x16, #0, 32f\n"
+      "str s10, [x13, #0x0]\n"
+      "b 32f\n"
+      "28:"  // Height 1: Partial direct writeback: partial_4_0
+      "tbz x16, #2, 30f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "tbz x16, #1, 29f\n"
+      "str d9, [x13], #0x8\n"
+      "tbz x16, #0, 32f\n"
+      "st1 { v9.s }[2], [x13]\n"
+      "b 32f\n"
+      "29:"  // Height 1: Partial direct writeback: partial_1_4
+      "tbz x16, #0, 32f\n"
+      "str s9, [x13, #0x0]\n"
+      "b 32f\n"
+      "30:"  // Height 1: Partial direct writeback: partial_2_0
+      "tbz x16, #1, 31f\n"
+      "str d8, [x13], #0x8\n"
+      "tbz x16, #0, 32f\n"
+      "st1 { v8.s }[2], [x13]\n"
+      "b 32f\n"
+      "31:"  // Height 1: Partial direct writeback: partial_1_0
+      "str s8, [x13, #0x0]\n"
+      "32:"  // Height 1: Partial direct writeback: Done
+      "b 34f\n"
+      "33:"  // Height 1: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "34:"  // Height 1: Writeback done
+      "subs x16, x16, #0x10\n"
+      "bgt 3b\n"
+      "b 206f\n"
+      "35:"  // Height 2
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 36f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "b 37f\n"
+      "36:"  // Height 2: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "37:"  // Height 2: Column loop
+      "cbz x14, 38f\n"
+      "ldr q8, [x14, #0x0]\n"
+      "mov v12.16b, v8.16b\n"
+      "ldr q9, [x14, #0x10]\n"
+      "ldr q10, [x14, #0x20]\n"
+      "mov v13.16b, v9.16b\n"
+      "ldr q11, [x14, #0x30]\n"
+      "mov v14.16b, v10.16b\n"
+      "add x14, x14, #0x40\n"
+      "mov v15.16b, v11.16b\n"
+      "b 49f\n"
+      "38:"  // Height 2: no bias
+      "tbz %x[flags], #0, 48f\n"
+      "cmp x16, #0x10\n"
+      "bge 47f\n"
+      "tbz x16, #3, 42f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v9.4s }, [x13], #0x10\n"
+      "ld1 { v13.4s }, [x9], #0x10\n"
+      "tbz x16, #2, 40f\n"
+      "ld1 { v10.4s }, [x13], #0x10\n"
+      "ld1 { v14.4s }, [x9], #0x10\n"
+      "tbz x16, #1, 39f\n"
+      "mov x19, #0x38\n"
+      "ldr d11, [x13], #0x8\n"
+      "ldr d15, [x9], #0x8\n"
+      "tbz x16, #0, 46f\n"
+      "ld1 { v11.s }[2], [x13]\n"
+      "ld1 { v15.s }[2], [x9]\n"
+      "b 46f\n"
+      "39:"  // Height 2: Partial accumulate: partial_1_12
+      "mov x19, #0x30\n"
+      "tbz x16, #0, 46f\n"
+      "ldr s11, [x13, #0x0]\n"
+      "ldr s15, [x9, #0x0]\n"
+      "b 46f\n"
+      "40:"  // Height 2: Partial accumulate: partial_2_8
+      "tbz x16, #1, 41f\n"
+      "ldr d10, [x13], #0x8\n"
+      "ldr d14, [x9], #0x8\n"
+      "mov x19, #0x28\n"
+      "tbz x16, #0, 46f\n"
+      "ld1 { v10.s }[2], [x13]\n"
+      "ld1 { v14.s }[2], [x9]\n"
+      "b 46f\n"
+      "41:"  // Height 2: Partial accumulate: partial_1_8
+      "mov x19, #0x20\n"
+      "tbz x16, #0, 46f\n"
+      "ldr s10, [x13, #0x0]\n"
+      "ldr s14, [x9, #0x0]\n"
+      "b 46f\n"
+      "42:"  // Height 2: Partial accumulate: partial_4_0
+      "tbz x16, #2, 44f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "tbz x16, #1, 43f\n"
+      "mov x19, #0x18\n"
+      "ldr d9, [x13], #0x8\n"
+      "ldr d13, [x9], #0x8\n"
+      "tbz x16, #0, 46f\n"
+      "ld1 { v9.s }[2], [x13]\n"
+      "ld1 { v13.s }[2], [x9]\n"
+      "b 46f\n"
+      "43:"  // Height 2: Partial accumulate: partial_1_4
+      "mov x19, #0x10\n"
+      "tbz x16, #0, 46f\n"
+      "ldr s9, [x13, #0x0]\n"
+      "ldr s13, [x9, #0x0]\n"
+      "b 46f\n"
+      "44:"  // Height 2: Partial accumulate: partial_2_0
+      "tbz x16, #1, 45f\n"
+      "ldr d8, [x13], #0x8\n"
+      "ldr d12, [x9], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x16, #0, 46f\n"
+      "ld1 { v8.s }[2], [x13]\n"
+      "ld1 { v12.s }[2], [x9]\n"
+      "b 46f\n"
+      "45:"  // Height 2: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s8, [x13, #0x0]\n"
+      "ldr s12, [x9, #0x0]\n"
+      "46:"  // Height 2: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "sub x9, x9, x19\n"
+      "b 49f\n"
+      "47:"  // Height 2: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "ldr q12, [x9, #0x0]\n"
+      "ldr q13, [x9, #0x10]\n"
+      "ldr q14, [x9, #0x20]\n"
+      "ldr q15, [x9, #0x30]\n"
+      "b 49f\n"
+      "48:"  // Height 2: no accumulate
+      "movi v8.16b, #0x0\n"
+      "movi v9.16b, #0x0\n"
+      "movi v10.16b, #0x0\n"
+      "movi v11.16b, #0x0\n"
+      "movi v12.16b, #0x0\n"
+      "movi v13.16b, #0x0\n"
+      "movi v14.16b, #0x0\n"
+      "movi v15.16b, #0x0\n"
+      "49:"  // Height 2: setup done
+      "mov x12, #0x0\n"
+      "50:"  // Height 2: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 51f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "cbnz x12, 52f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #2\n"
+      "add x28, x28, x19, LSL #2\n"
+      "b 52f\n"
+      "51:"  // Height 2: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #2\n"
+      "52:"  // Height 2: input setup done
+      "cmp x11, #0x4\n"
+      "blt 55f\n"
+      "cmp x11, #0x8\n"
+      "blt 54f\n"
+      "53:"  // Height 2: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.4s, v6.4s, v0.s[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.4s, v6.4s, v1.s[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "add x10, x10, #0x10\n"
+      "fmla v9.4s, v7.4s, v0.s[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x28, x28, #0x10\n"
+      "fmla v13.4s, v7.4s, v1.s[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.4s, v6.4s, v0.s[0]\n"
+      "sub x11, x11, #0x4\n"
+      "fmla v14.4s, v6.4s, v1.s[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      "cmp x11, #0x8\n"
+      "fmla v11.4s, v7.4s, v0.s[0]\n"
+      "fmla v15.4s, v7.4s, v1.s[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      "fmla v8.4s, v6.4s, v0.s[1]\n"
+      "fmla v12.4s, v6.4s, v1.s[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      "fmla v9.4s, v7.4s, v0.s[1]\n"
+      "fmla v13.4s, v7.4s, v1.s[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      "fmla v10.4s, v6.4s, v0.s[1]\n"
+      "fmla v14.4s, v6.4s, v1.s[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      "fmla v11.4s, v7.4s, v0.s[1]\n"
+      "fmla v15.4s, v7.4s, v1.s[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      "fmla v8.4s, v6.4s, v0.s[2]\n"
+      "fmla v12.4s, v6.4s, v1.s[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      "fmla v9.4s, v7.4s, v0.s[2]\n"
+      "fmla v13.4s, v7.4s, v1.s[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      "fmla v10.4s, v6.4s, v0.s[2]\n"
+      "fmla v14.4s, v6.4s, v1.s[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      "fmla v11.4s, v7.4s, v0.s[2]\n"
+      "fmla v15.4s, v7.4s, v1.s[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      "fmla v8.4s, v6.4s, v0.s[3]\n"
+      "fmla v12.4s, v6.4s, v1.s[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      "fmla v9.4s, v7.4s, v0.s[3]\n"
+      "fmla v13.4s, v7.4s, v1.s[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "add x15, x15, #0x100\n"
+      "fmla v10.4s, v6.4s, v0.s[3]\n"
+      "fmla v14.4s, v6.4s, v1.s[3]\n"
+      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "fmla v15.4s, v7.4s, v1.s[3]\n"
+      "bge 53b\n"
+      "54:"  // Height 2: Multiply loop: Single iteration only
+      "sub x11, x11, #0x4\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.4s, v6.4s, v0.s[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.4s, v6.4s, v1.s[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "add x10, x10, #0x10\n"
+      "fmla v9.4s, v7.4s, v0.s[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x28, x28, #0x10\n"
+      "fmla v13.4s, v7.4s, v1.s[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.4s, v6.4s, v0.s[0]\n"
+      "fmla v14.4s, v6.4s, v1.s[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      "fmla v11.4s, v7.4s, v0.s[0]\n"
+      "fmla v15.4s, v7.4s, v1.s[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      "fmla v8.4s, v6.4s, v0.s[1]\n"
+      "fmla v12.4s, v6.4s, v1.s[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      "fmla v9.4s, v7.4s, v0.s[1]\n"
+      "fmla v13.4s, v7.4s, v1.s[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      "fmla v10.4s, v6.4s, v0.s[1]\n"
+      "fmla v14.4s, v6.4s, v1.s[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      "fmla v11.4s, v7.4s, v0.s[1]\n"
+      "fmla v15.4s, v7.4s, v1.s[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      "fmla v8.4s, v6.4s, v0.s[2]\n"
+      "fmla v12.4s, v6.4s, v1.s[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      "fmla v9.4s, v7.4s, v0.s[2]\n"
+      "fmla v13.4s, v7.4s, v1.s[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      "fmla v10.4s, v6.4s, v0.s[2]\n"
+      "fmla v14.4s, v6.4s, v1.s[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      "fmla v11.4s, v7.4s, v0.s[2]\n"
+      "fmla v15.4s, v7.4s, v1.s[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      "fmla v8.4s, v6.4s, v0.s[3]\n"
+      "fmla v12.4s, v6.4s, v1.s[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      "fmla v9.4s, v7.4s, v0.s[3]\n"
+      "fmla v13.4s, v7.4s, v1.s[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "add x15, x15, #0x100\n"
+      "fmla v10.4s, v6.4s, v0.s[3]\n"
+      "fmla v14.4s, v6.4s, v1.s[3]\n"
+      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "fmla v15.4s, v7.4s, v1.s[3]\n"
+      "55:"  // Height 2: Multiply loop: Main loop skip
+      "cbz x11, 57f\n"
+      "56:"  // Height 2: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.4s, v6.4s, v0.s[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.4s, v6.4s, v1.s[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "sub x11, x11, #0x1\n"
+      "fmla v9.4s, v7.4s, v0.s[0]\n"
+      "fmla v13.4s, v7.4s, v1.s[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.4s, v6.4s, v0.s[0]\n"
+      "add x15, x15, #0x40\n"
+      "fmla v14.4s, v6.4s, v1.s[0]\n"
+      "fmla v11.4s, v7.4s, v0.s[0]\n"
+      "fmla v15.4s, v7.4s, v1.s[0]\n"
+      "cbnz x11, 56b\n"
+      "57:"  // Height 2: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 50b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "tbz %x[flags], #1, 58f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v1.4s }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v0.4s }, [x19]\n"
+      "fmin v8.4s, v8.4s, v0.4s\n"
+      "fmin v9.4s, v9.4s, v0.4s\n"
+      "fmin v10.4s, v10.4s, v0.4s\n"
+      "fmin v11.4s, v11.4s, v0.4s\n"
+      "fmax v8.4s, v8.4s, v1.4s\n"
+      "fmax v9.4s, v9.4s, v1.4s\n"
+      "fmax v10.4s, v10.4s, v1.4s\n"
+      "fmax v11.4s, v11.4s, v1.4s\n"
+      "fmin v12.4s, v12.4s, v0.4s\n"
+      "fmin v13.4s, v13.4s, v0.4s\n"
+      "fmin v14.4s, v14.4s, v0.4s\n"
+      "fmax v12.4s, v12.4s, v1.4s\n"
+      "fmax v13.4s, v13.4s, v1.4s\n"
+      "fmax v14.4s, v14.4s, v1.4s\n"
+      "fmin v15.4s, v15.4s, v0.4s\n"
+      "fmax v15.4s, v15.4s, v1.4s\n"
+      "58:"  // Height 2: No activation
+      "cmp x16, #0x10\n"
+      "bge 67f\n"
+      "tbz x16, #3, 62f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v9.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v13.4s }, [x9], #0x10\n"
+      "tbz x16, #2, 60f\n"
+      "st1 { v10.4s }, [x13], #0x10\n"
+      "st1 { v14.4s }, [x9], #0x10\n"
+      "tbz x16, #1, 59f\n"
+      "str d11, [x13], #0x8\n"
+      "str d15, [x9], #0x8\n"
+      "tbz x16, #0, 66f\n"
+      "st1 { v11.s }[2], [x13]\n"
+      "st1 { v15.s }[2], [x9]\n"
+      "b 66f\n"
+      "59:"  // Height 2: Partial direct writeback: partial_1_12
+      "tbz x16, #0, 66f\n"
+      "str s11, [x13, #0x0]\n"
+      "str s15, [x9, #0x0]\n"
+      "b 66f\n"
+      "60:"  // Height 2: Partial direct writeback: partial_2_8
+      "tbz x16, #1, 61f\n"
+      "str d10, [x13], #0x8\n"
+      "str d14, [x9], #0x8\n"
+      "tbz x16, #0, 66f\n"
+      "st1 { v10.s }[2], [x13]\n"
+      "st1 { v14.s }[2], [x9]\n"
+      "b 66f\n"
+      "61:"  // Height 2: Partial direct writeback: partial_1_8
+      "tbz x16, #0, 66f\n"
+      "str s10, [x13, #0x0]\n"
+      "str s14, [x9, #0x0]\n"
+      "b 66f\n"
+      "62:"  // Height 2: Partial direct writeback: partial_4_0
+      "tbz x16, #2, 64f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "tbz x16, #1, 63f\n"
+      "str d9, [x13], #0x8\n"
+      "str d13, [x9], #0x8\n"
+      "tbz x16, #0, 66f\n"
+      "st1 { v9.s }[2], [x13]\n"
+      "st1 { v13.s }[2], [x9]\n"
+      "b 66f\n"
+      "63:"  // Height 2: Partial direct writeback: partial_1_4
+      "tbz x16, #0, 66f\n"
+      "str s9, [x13, #0x0]\n"
+      "str s13, [x9, #0x0]\n"
+      "b 66f\n"
+      "64:"  // Height 2: Partial direct writeback: partial_2_0
+      "tbz x16, #1, 65f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "tbz x16, #0, 66f\n"
+      "st1 { v8.s }[2], [x13]\n"
+      "st1 { v12.s }[2], [x9]\n"
+      "b 66f\n"
+      "65:"  // Height 2: Partial direct writeback: partial_1_0
+      "str s8, [x13, #0x0]\n"
+      "str s12, [x9, #0x0]\n"
+      "66:"  // Height 2: Partial direct writeback: Done
+      "b 68f\n"
+      "67:"  // Height 2: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "str q12, [x9, #0x0]\n"
+      "str q13, [x9, #0x10]\n"
+      "str q14, [x9, #0x20]\n"
+      "str q15, [x9, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "add x9, x9, #0x40\n"
+      "68:"  // Height 2: Writeback done
+      "subs x16, x16, #0x10\n"
+      "bgt 37b\n"
+      "b 206f\n"
+      "69:"  // Height 3
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 70f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "add x27, x27, x19, LSL #2\n"
+      "b 71f\n"
+      "70:"  // Height 3: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "71:"  // Height 3: Column loop
+      "cbz x14, 72f\n"
+      "ldr q8, [x14, #0x0]\n"
+      "mov v12.16b, v8.16b\n"
+      "ldr q9, [x14, #0x10]\n"
+      "mov v16.16b, v8.16b\n"
+      "ldr q10, [x14, #0x20]\n"
+      "ldr q11, [x14, #0x30]\n"
+      "mov v13.16b, v9.16b\n"
+      "add x14, x14, #0x40\n"
+      "mov v17.16b, v9.16b\n"
+      "mov v14.16b, v10.16b\n"
+      "mov v15.16b, v11.16b\n"
+      "mov v18.16b, v10.16b\n"
+      "mov v19.16b, v11.16b\n"
+      "b 83f\n"
+      "72:"  // Height 3: no bias
+      "tbz %x[flags], #0, 82f\n"
+      "cmp x16, #0x10\n"
+      "bge 81f\n"
+      "tbz x16, #3, 76f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "ld1 { v9.4s }, [x13], #0x10\n"
+      "ld1 { v13.4s }, [x9], #0x10\n"
+      "ld1 { v17.4s }, [x27], #0x10\n"
+      "tbz x16, #2, 74f\n"
+      "ld1 { v10.4s }, [x13], #0x10\n"
+      "ld1 { v14.4s }, [x9], #0x10\n"
+      "ld1 { v18.4s }, [x27], #0x10\n"
+      "tbz x16, #1, 73f\n"
+      "mov x19, #0x38\n"
+      "ldr d11, [x13], #0x8\n"
+      "ldr d15, [x9], #0x8\n"
+      "ldr d19, [x27], #0x8\n"
+      "tbz x16, #0, 80f\n"
+      "ld1 { v11.s }[2], [x13]\n"
+      "ld1 { v15.s }[2], [x9]\n"
+      "ld1 { v19.s }[2], [x27]\n"
+      "b 80f\n"
+      "73:"  // Height 3: Partial accumulate: partial_1_12
+      "mov x19, #0x30\n"
+      "tbz x16, #0, 80f\n"
+      "ldr s11, [x13, #0x0]\n"
+      "ldr s15, [x9, #0x0]\n"
+      "ldr s19, [x27, #0x0]\n"
+      "b 80f\n"
+      "74:"  // Height 3: Partial accumulate: partial_2_8
+      "tbz x16, #1, 75f\n"
+      "ldr d10, [x13], #0x8\n"
+      "ldr d14, [x9], #0x8\n"
+      "ldr d18, [x27], #0x8\n"
+      "mov x19, #0x28\n"
+      "tbz x16, #0, 80f\n"
+      "ld1 { v10.s }[2], [x13]\n"
+      "ld1 { v14.s }[2], [x9]\n"
+      "ld1 { v18.s }[2], [x27]\n"
+      "b 80f\n"
+      "75:"  // Height 3: Partial accumulate: partial_1_8
+      "mov x19, #0x20\n"
+      "tbz x16, #0, 80f\n"
+      "ldr s10, [x13, #0x0]\n"
+      "ldr s14, [x9, #0x0]\n"
+      "ldr s18, [x27, #0x0]\n"
+      "b 80f\n"
+      "76:"  // Height 3: Partial accumulate: partial_4_0
+      "tbz x16, #2, 78f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "tbz x16, #1, 77f\n"
+      "mov x19, #0x18\n"
+      "ldr d9, [x13], #0x8\n"
+      "ldr d13, [x9], #0x8\n"
+      "ldr d17, [x27], #0x8\n"
+      "tbz x16, #0, 80f\n"
+      "ld1 { v9.s }[2], [x13]\n"
+      "ld1 { v13.s }[2], [x9]\n"
+      "ld1 { v17.s }[2], [x27]\n"
+      "b 80f\n"
+      "77:"  // Height 3: Partial accumulate: partial_1_4
+      "mov x19, #0x10\n"
+      "tbz x16, #0, 80f\n"
+      "ldr s9, [x13, #0x0]\n"
+      "ldr s13, [x9, #0x0]\n"
+      "ldr s17, [x27, #0x0]\n"
+      "b 80f\n"
+      "78:"  // Height 3: Partial accumulate: partial_2_0
+      "tbz x16, #1, 79f\n"
+      "ldr d8, [x13], #0x8\n"
+      "ldr d12, [x9], #0x8\n"
+      "ldr d16, [x27], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x16, #0, 80f\n"
+      "ld1 { v8.s }[2], [x13]\n"
+      "ld1 { v12.s }[2], [x9]\n"
+      "ld1 { v16.s }[2], [x27]\n"
+      "b 80f\n"
+      "79:"  // Height 3: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s8, [x13, #0x0]\n"
+      "ldr s12, [x9, #0x0]\n"
+      "ldr s16, [x27, #0x0]\n"
+      "80:"  // Height 3: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "sub x9, x9, x19\n"
+      "sub x27, x27, x19\n"
+      "b 83f\n"
+      "81:"  // Height 3: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "ldr q12, [x9, #0x0]\n"
+      "ldr q13, [x9, #0x10]\n"
+      "ldr q14, [x9, #0x20]\n"
+      "ldr q15, [x9, #0x30]\n"
+      "ldr q16, [x27, #0x0]\n"
+      "ldr q17, [x27, #0x10]\n"
+      "ldr q18, [x27, #0x20]\n"
+      "ldr q19, [x27, #0x30]\n"
+      "b 83f\n"
+      "82:"  // Height 3: no accumulate
+      "movi v8.16b, #0x0\n"
+      "movi v9.16b, #0x0\n"
+      "movi v10.16b, #0x0\n"
+      "movi v11.16b, #0x0\n"
+      "movi v12.16b, #0x0\n"
+      "movi v13.16b, #0x0\n"
+      "movi v14.16b, #0x0\n"
+      "movi v15.16b, #0x0\n"
+      "movi v16.16b, #0x0\n"
+      "movi v17.16b, #0x0\n"
+      "movi v18.16b, #0x0\n"
+      "movi v19.16b, #0x0\n"
+      "83:"  // Height 3: setup done
+      "mov x12, #0x0\n"
+      "84:"  // Height 3: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 85f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "cbnz x12, 86f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #2\n"
+      "add x28, x28, x19, LSL #2\n"
+      "add x26, x26, x19, LSL #2\n"
+      "b 86f\n"
+      "85:"  // Height 3: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #2\n"
+      "add x26, x28, x19, LSL #2\n"
+      "86:"  // Height 3: input setup done
+      "cmp x11, #0x4\n"
+      "blt 89f\n"
+      "cmp x11, #0x8\n"
+      "blt 88f\n"
+      "87:"  // Height 3: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.4s, v6.4s, v0.s[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.4s, v6.4s, v1.s[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla v16.4s, v6.4s, v2.s[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.4s, v7.4s, v0.s[0]\n"
+      "add x28, x28, #0x10\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "fmla v13.4s, v7.4s, v1.s[0]\n"
+      "add x26, x26, #0x10\n"
+      "fmla v17.4s, v7.4s, v2.s[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.4s, v6.4s, v0.s[0]\n"
+      "sub x11, x11, #0x4\n"
+      "fmla v14.4s, v6.4s, v1.s[0]\n"
+      "cmp x11, #0x8\n"
+      "fmla v18.4s, v6.4s, v2.s[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      "fmla v11.4s, v7.4s, v0.s[0]\n"
+      "fmla v15.4s, v7.4s, v1.s[0]\n"
+      "fmla v19.4s, v7.4s, v2.s[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      "fmla v8.4s, v6.4s, v0.s[1]\n"
+      "fmla v12.4s, v6.4s, v1.s[1]\n"
+      "fmla v16.4s, v6.4s, v2.s[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      "fmla v9.4s, v7.4s, v0.s[1]\n"
+      "fmla v13.4s, v7.4s, v1.s[1]\n"
+      "fmla v17.4s, v7.4s, v2.s[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      "fmla v10.4s, v6.4s, v0.s[1]\n"
+      "fmla v14.4s, v6.4s, v1.s[1]\n"
+      "fmla v18.4s, v6.4s, v2.s[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      "fmla v11.4s, v7.4s, v0.s[1]\n"
+      "fmla v15.4s, v7.4s, v1.s[1]\n"
+      "fmla v19.4s, v7.4s, v2.s[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      "fmla v8.4s, v6.4s, v0.s[2]\n"
+      "fmla v12.4s, v6.4s, v1.s[2]\n"
+      "fmla v16.4s, v6.4s, v2.s[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      "fmla v9.4s, v7.4s, v0.s[2]\n"
+      "fmla v13.4s, v7.4s, v1.s[2]\n"
+      "fmla v17.4s, v7.4s, v2.s[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      "fmla v10.4s, v6.4s, v0.s[2]\n"
+      "fmla v14.4s, v6.4s, v1.s[2]\n"
+      "fmla v18.4s, v6.4s, v2.s[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      "fmla v11.4s, v7.4s, v0.s[2]\n"
+      "fmla v15.4s, v7.4s, v1.s[2]\n"
+      "fmla v19.4s, v7.4s, v2.s[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      "fmla v8.4s, v6.4s, v0.s[3]\n"
+      "fmla v12.4s, v6.4s, v1.s[3]\n"
+      "fmla v16.4s, v6.4s, v2.s[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      "fmla v9.4s, v7.4s, v0.s[3]\n"
+      "fmla v13.4s, v7.4s, v1.s[3]\n"
+      "fmla v17.4s, v7.4s, v2.s[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "add x15, x15, #0x100\n"
+      "fmla v10.4s, v6.4s, v0.s[3]\n"
+      "fmla v14.4s, v6.4s, v1.s[3]\n"
+      "fmla v18.4s, v6.4s, v2.s[3]\n"
+      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "fmla v15.4s, v7.4s, v1.s[3]\n"
+      "fmla v19.4s, v7.4s, v2.s[3]\n"
+      "bge 87b\n"
+      "88:"  // Height 3: Multiply loop: Single iteration only
+      "sub x11, x11, #0x4\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.4s, v6.4s, v0.s[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.4s, v6.4s, v1.s[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla v16.4s, v6.4s, v2.s[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.4s, v7.4s, v0.s[0]\n"
+      "add x28, x28, #0x10\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "fmla v13.4s, v7.4s, v1.s[0]\n"
+      "add x26, x26, #0x10\n"
+      "fmla v17.4s, v7.4s, v2.s[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.4s, v6.4s, v0.s[0]\n"
+      "fmla v14.4s, v6.4s, v1.s[0]\n"
+      "fmla v18.4s, v6.4s, v2.s[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      "fmla v11.4s, v7.4s, v0.s[0]\n"
+      "fmla v15.4s, v7.4s, v1.s[0]\n"
+      "fmla v19.4s, v7.4s, v2.s[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      "fmla v8.4s, v6.4s, v0.s[1]\n"
+      "fmla v12.4s, v6.4s, v1.s[1]\n"
+      "fmla v16.4s, v6.4s, v2.s[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      "fmla v9.4s, v7.4s, v0.s[1]\n"
+      "fmla v13.4s, v7.4s, v1.s[1]\n"
+      "fmla v17.4s, v7.4s, v2.s[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      "fmla v10.4s, v6.4s, v0.s[1]\n"
+      "fmla v14.4s, v6.4s, v1.s[1]\n"
+      "fmla v18.4s, v6.4s, v2.s[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      "fmla v11.4s, v7.4s, v0.s[1]\n"
+      "fmla v15.4s, v7.4s, v1.s[1]\n"
+      "fmla v19.4s, v7.4s, v2.s[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      "fmla v8.4s, v6.4s, v0.s[2]\n"
+      "fmla v12.4s, v6.4s, v1.s[2]\n"
+      "fmla v16.4s, v6.4s, v2.s[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      "fmla v9.4s, v7.4s, v0.s[2]\n"
+      "fmla v13.4s, v7.4s, v1.s[2]\n"
+      "fmla v17.4s, v7.4s, v2.s[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      "fmla v10.4s, v6.4s, v0.s[2]\n"
+      "fmla v14.4s, v6.4s, v1.s[2]\n"
+      "fmla v18.4s, v6.4s, v2.s[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      "fmla v11.4s, v7.4s, v0.s[2]\n"
+      "fmla v15.4s, v7.4s, v1.s[2]\n"
+      "fmla v19.4s, v7.4s, v2.s[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      "fmla v8.4s, v6.4s, v0.s[3]\n"
+      "fmla v12.4s, v6.4s, v1.s[3]\n"
+      "fmla v16.4s, v6.4s, v2.s[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      "fmla v9.4s, v7.4s, v0.s[3]\n"
+      "fmla v13.4s, v7.4s, v1.s[3]\n"
+      "fmla v17.4s, v7.4s, v2.s[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "add x15, x15, #0x100\n"
+      "fmla v10.4s, v6.4s, v0.s[3]\n"
+      "fmla v14.4s, v6.4s, v1.s[3]\n"
+      "fmla v18.4s, v6.4s, v2.s[3]\n"
+      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "fmla v15.4s, v7.4s, v1.s[3]\n"
+      "fmla v19.4s, v7.4s, v2.s[3]\n"
+      "89:"  // Height 3: Multiply loop: Main loop skip
+      "cbz x11, 91f\n"
+      "90:"  // Height 3: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.4s, v6.4s, v0.s[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.4s, v6.4s, v1.s[0]\n"
+      "sub x11, x11, #0x1\n"
+      "fmla v16.4s, v6.4s, v2.s[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.4s, v7.4s, v0.s[0]\n"
+      "fmla v13.4s, v7.4s, v1.s[0]\n"
+      "fmla v17.4s, v7.4s, v2.s[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.4s, v6.4s, v0.s[0]\n"
+      "add x15, x15, #0x40\n"
+      "fmla v14.4s, v6.4s, v1.s[0]\n"
+      "fmla v18.4s, v6.4s, v2.s[0]\n"
+      "fmla v11.4s, v7.4s, v0.s[0]\n"
+      "fmla v15.4s, v7.4s, v1.s[0]\n"
+      "fmla v19.4s, v7.4s, v2.s[0]\n"
+      "cbnz x11, 90b\n"
+      "91:"  // Height 3: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 84b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "tbz %x[flags], #1, 92f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v1.4s }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v0.4s }, [x19]\n"
+      "fmin v8.4s, v8.4s, v0.4s\n"
+      "fmin v9.4s, v9.4s, v0.4s\n"
+      "fmin v10.4s, v10.4s, v0.4s\n"
+      "fmin v11.4s, v11.4s, v0.4s\n"
+      "fmax v8.4s, v8.4s, v1.4s\n"
+      "fmax v9.4s, v9.4s, v1.4s\n"
+      "fmax v10.4s, v10.4s, v1.4s\n"
+      "fmax v11.4s, v11.4s, v1.4s\n"
+      "fmin v12.4s, v12.4s, v0.4s\n"
+      "fmin v13.4s, v13.4s, v0.4s\n"
+      "fmin v14.4s, v14.4s, v0.4s\n"
+      "fmax v12.4s, v12.4s, v1.4s\n"
+      "fmax v13.4s, v13.4s, v1.4s\n"
+      "fmax v14.4s, v14.4s, v1.4s\n"
+      "fmin v15.4s, v15.4s, v0.4s\n"
+      "fmin v16.4s, v16.4s, v0.4s\n"
+      "fmin v17.4s, v17.4s, v0.4s\n"
+      "fmax v15.4s, v15.4s, v1.4s\n"
+      "fmax v16.4s, v16.4s, v1.4s\n"
+      "fmax v17.4s, v17.4s, v1.4s\n"
+      "fmin v18.4s, v18.4s, v0.4s\n"
+      "fmin v19.4s, v19.4s, v0.4s\n"
+      "fmax v18.4s, v18.4s, v1.4s\n"
+      "fmax v19.4s, v19.4s, v1.4s\n"
+      "92:"  // Height 3: No activation
+      "cmp x16, #0x10\n"
+      "bge 101f\n"
+      "tbz x16, #3, 96f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v9.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v13.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "st1 { v17.4s }, [x27], #0x10\n"
+      "tbz x16, #2, 94f\n"
+      "st1 { v10.4s }, [x13], #0x10\n"
+      "st1 { v14.4s }, [x9], #0x10\n"
+      "st1 { v18.4s }, [x27], #0x10\n"
+      "tbz x16, #1, 93f\n"
+      "str d11, [x13], #0x8\n"
+      "str d15, [x9], #0x8\n"
+      "str d19, [x27], #0x8\n"
+      "tbz x16, #0, 100f\n"
+      "st1 { v11.s }[2], [x13]\n"
+      "st1 { v15.s }[2], [x9]\n"
+      "st1 { v19.s }[2], [x27]\n"
+      "b 100f\n"
+      "93:"  // Height 3: Partial direct writeback: partial_1_12
+      "tbz x16, #0, 100f\n"
+      "str s11, [x13, #0x0]\n"
+      "str s15, [x9, #0x0]\n"
+      "str s19, [x27, #0x0]\n"
+      "b 100f\n"
+      "94:"  // Height 3: Partial direct writeback: partial_2_8
+      "tbz x16, #1, 95f\n"
+      "str d10, [x13], #0x8\n"
+      "str d14, [x9], #0x8\n"
+      "str d18, [x27], #0x8\n"
+      "tbz x16, #0, 100f\n"
+      "st1 { v10.s }[2], [x13]\n"
+      "st1 { v14.s }[2], [x9]\n"
+      "st1 { v18.s }[2], [x27]\n"
+      "b 100f\n"
+      "95:"  // Height 3: Partial direct writeback: partial_1_8
+      "tbz x16, #0, 100f\n"
+      "str s10, [x13, #0x0]\n"
+      "str s14, [x9, #0x0]\n"
+      "str s18, [x27, #0x0]\n"
+      "b 100f\n"
+      "96:"  // Height 3: Partial direct writeback: partial_4_0
+      "tbz x16, #2, 98f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "tbz x16, #1, 97f\n"
+      "str d9, [x13], #0x8\n"
+      "str d13, [x9], #0x8\n"
+      "str d17, [x27], #0x8\n"
+      "tbz x16, #0, 100f\n"
+      "st1 { v9.s }[2], [x13]\n"
+      "st1 { v13.s }[2], [x9]\n"
+      "st1 { v17.s }[2], [x27]\n"
+      "b 100f\n"
+      "97:"  // Height 3: Partial direct writeback: partial_1_4
+      "tbz x16, #0, 100f\n"
+      "str s9, [x13, #0x0]\n"
+      "str s13, [x9, #0x0]\n"
+      "str s17, [x27, #0x0]\n"
+      "b 100f\n"
+      "98:"  // Height 3: Partial direct writeback: partial_2_0
+      "tbz x16, #1, 99f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "str d16, [x27], #0x8\n"
+      "tbz x16, #0, 100f\n"
+      "st1 { v8.s }[2], [x13]\n"
+      "st1 { v12.s }[2], [x9]\n"
+      "st1 { v16.s }[2], [x27]\n"
+      "b 100f\n"
+      "99:"  // Height 3: Partial direct writeback: partial_1_0
+      "str s8, [x13, #0x0]\n"
+      "str s12, [x9, #0x0]\n"
+      "str s16, [x27, #0x0]\n"
+      "100:"  // Height 3: Partial direct writeback: Done
+      "b 102f\n"
+      "101:"  // Height 3: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "str q12, [x9, #0x0]\n"
+      "str q13, [x9, #0x10]\n"
+      "str q14, [x9, #0x20]\n"
+      "str q15, [x9, #0x30]\n"
+      "str q16, [x27, #0x0]\n"
+      "str q17, [x27, #0x10]\n"
+      "str q18, [x27, #0x20]\n"
+      "str q19, [x27, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "add x9, x9, #0x40\n"
+      "add x27, x27, #0x40\n"
+      "102:"  // Height 3: Writeback done
+      "subs x16, x16, #0x10\n"
+      "bgt 71b\n"
+      "b 206f\n"
+      "103:"  // Height 4
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 104f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "add x27, x27, x19, LSL #2\n"
+      "add x25, x25, x19, LSL #2\n"
+      "b 105f\n"
+      "104:"  // Height 4: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "105:"  // Height 4: Column loop
+      "cbz x14, 106f\n"
+      "ldr q8, [x14, #0x0]\n"
+      "mov v12.16b, v8.16b\n"
+      "ldr q9, [x14, #0x10]\n"
+      "mov v16.16b, v8.16b\n"
+      "ldr q10, [x14, #0x20]\n"
+      "mov v20.16b, v8.16b\n"
+      "ldr q11, [x14, #0x30]\n"
+      "add x14, x14, #0x40\n"
+      "mov v13.16b, v9.16b\n"
+      "mov v17.16b, v9.16b\n"
+      "mov v14.16b, v10.16b\n"
+      "mov v15.16b, v11.16b\n"
+      "mov v18.16b, v10.16b\n"
+      "mov v19.16b, v11.16b\n"
+      "mov v21.16b, v9.16b\n"
+      "mov v22.16b, v10.16b\n"
+      "mov v23.16b, v11.16b\n"
+      "b 117f\n"
+      "106:"  // Height 4: no bias
+      "tbz %x[flags], #0, 116f\n"
+      "cmp x16, #0x10\n"
+      "bge 115f\n"
+      "tbz x16, #3, 110f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "ld1 { v20.4s }, [x25], #0x10\n"
+      "ld1 { v9.4s }, [x13], #0x10\n"
+      "ld1 { v13.4s }, [x9], #0x10\n"
+      "ld1 { v17.4s }, [x27], #0x10\n"
+      "ld1 { v21.4s }, [x25], #0x10\n"
+      "tbz x16, #2, 108f\n"
+      "ld1 { v10.4s }, [x13], #0x10\n"
+      "ld1 { v14.4s }, [x9], #0x10\n"
+      "ld1 { v18.4s }, [x27], #0x10\n"
+      "ld1 { v22.4s }, [x25], #0x10\n"
+      "tbz x16, #1, 107f\n"
+      "mov x19, #0x38\n"
+      "ldr d11, [x13], #0x8\n"
+      "ldr d15, [x9], #0x8\n"
+      "ldr d19, [x27], #0x8\n"
+      "ldr d23, [x25], #0x8\n"
+      "tbz x16, #0, 114f\n"
+      "ld1 { v11.s }[2], [x13]\n"
+      "ld1 { v15.s }[2], [x9]\n"
+      "ld1 { v19.s }[2], [x27]\n"
+      "ld1 { v23.s }[2], [x25]\n"
+      "b 114f\n"
+      "107:"  // Height 4: Partial accumulate: partial_1_12
+      "mov x19, #0x30\n"
+      "tbz x16, #0, 114f\n"
+      "ldr s11, [x13, #0x0]\n"
+      "ldr s15, [x9, #0x0]\n"
+      "ldr s19, [x27, #0x0]\n"
+      "ldr s23, [x25, #0x0]\n"
+      "b 114f\n"
+      "108:"  // Height 4: Partial accumulate: partial_2_8
+      "tbz x16, #1, 109f\n"
+      "ldr d10, [x13], #0x8\n"
+      "ldr d14, [x9], #0x8\n"
+      "ldr d18, [x27], #0x8\n"
+      "ldr d22, [x25], #0x8\n"
+      "mov x19, #0x28\n"
+      "tbz x16, #0, 114f\n"
+      "ld1 { v10.s }[2], [x13]\n"
+      "ld1 { v14.s }[2], [x9]\n"
+      "ld1 { v18.s }[2], [x27]\n"
+      "ld1 { v22.s }[2], [x25]\n"
+      "b 114f\n"
+      "109:"  // Height 4: Partial accumulate: partial_1_8
+      "mov x19, #0x20\n"
+      "tbz x16, #0, 114f\n"
+      "ldr s10, [x13, #0x0]\n"
+      "ldr s14, [x9, #0x0]\n"
+      "ldr s18, [x27, #0x0]\n"
+      "ldr s22, [x25, #0x0]\n"
+      "b 114f\n"
+      "110:"  // Height 4: Partial accumulate: partial_4_0
+      "tbz x16, #2, 112f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "ld1 { v20.4s }, [x25], #0x10\n"
+      "tbz x16, #1, 111f\n"
+      "mov x19, #0x18\n"
+      "ldr d9, [x13], #0x8\n"
+      "ldr d13, [x9], #0x8\n"
+      "ldr d17, [x27], #0x8\n"
+      "ldr d21, [x25], #0x8\n"
+      "tbz x16, #0, 114f\n"
+      "ld1 { v9.s }[2], [x13]\n"
+      "ld1 { v13.s }[2], [x9]\n"
+      "ld1 { v17.s }[2], [x27]\n"
+      "ld1 { v21.s }[2], [x25]\n"
+      "b 114f\n"
+      "111:"  // Height 4: Partial accumulate: partial_1_4
+      "mov x19, #0x10\n"
+      "tbz x16, #0, 114f\n"
+      "ldr s9, [x13, #0x0]\n"
+      "ldr s13, [x9, #0x0]\n"
+      "ldr s17, [x27, #0x0]\n"
+      "ldr s21, [x25, #0x0]\n"
+      "b 114f\n"
+      "112:"  // Height 4: Partial accumulate: partial_2_0
+      "tbz x16, #1, 113f\n"
+      "ldr d8, [x13], #0x8\n"
+      "ldr d12, [x9], #0x8\n"
+      "ldr d16, [x27], #0x8\n"
+      "ldr d20, [x25], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x16, #0, 114f\n"
+      "ld1 { v8.s }[2], [x13]\n"
+      "ld1 { v12.s }[2], [x9]\n"
+      "ld1 { v16.s }[2], [x27]\n"
+      "ld1 { v20.s }[2], [x25]\n"
+      "b 114f\n"
+      "113:"  // Height 4: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s8, [x13, #0x0]\n"
+      "ldr s12, [x9, #0x0]\n"
+      "ldr s16, [x27, #0x0]\n"
+      "ldr s20, [x25, #0x0]\n"
+      "114:"  // Height 4: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "sub x9, x9, x19\n"
+      "sub x27, x27, x19\n"
+      "sub x25, x25, x19\n"
+      "b 117f\n"
+      "115:"  // Height 4: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "ldr q12, [x9, #0x0]\n"
+      "ldr q13, [x9, #0x10]\n"
+      "ldr q14, [x9, #0x20]\n"
+      "ldr q15, [x9, #0x30]\n"
+      "ldr q16, [x27, #0x0]\n"
+      "ldr q17, [x27, #0x10]\n"
+      "ldr q18, [x27, #0x20]\n"
+      "ldr q19, [x27, #0x30]\n"
+      "ldr q20, [x25, #0x0]\n"
+      "ldr q21, [x25, #0x10]\n"
+      "ldr q22, [x25, #0x20]\n"
+      "ldr q23, [x25, #0x30]\n"
+      "b 117f\n"
+      "116:"  // Height 4: no accumulate
+      "movi v8.16b, #0x0\n"
+      "movi v9.16b, #0x0\n"
+      "movi v10.16b, #0x0\n"
+      "movi v11.16b, #0x0\n"
+      "movi v12.16b, #0x0\n"
+      "movi v13.16b, #0x0\n"
+      "movi v14.16b, #0x0\n"
+      "movi v15.16b, #0x0\n"
+      "movi v16.16b, #0x0\n"
+      "movi v17.16b, #0x0\n"
+      "movi v18.16b, #0x0\n"
+      "movi v19.16b, #0x0\n"
+      "movi v20.16b, #0x0\n"
+      "movi v21.16b, #0x0\n"
+      "movi v22.16b, #0x0\n"
+      "movi v23.16b, #0x0\n"
+      "117:"  // Height 4: setup done
+      "mov x12, #0x0\n"
+      "118:"  // Height 4: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 119f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "cbnz x12, 120f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #2\n"
+      "add x28, x28, x19, LSL #2\n"
+      "add x26, x26, x19, LSL #2\n"
+      "add x24, x24, x19, LSL #2\n"
+      "b 120f\n"
+      "119:"  // Height 4: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #2\n"
+      "add x26, x28, x19, LSL #2\n"
+      "add x24, x26, x19, LSL #2\n"
+      "120:"  // Height 4: input setup done
+      "cmp x11, #0x4\n"
+      "blt 123f\n"
+      "cmp x11, #0x8\n"
+      "blt 122f\n"
+      "121:"  // Height 4: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.4s, v6.4s, v0.s[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.4s, v6.4s, v1.s[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla v16.4s, v6.4s, v2.s[0]\n"
+      "add x28, x28, #0x10\n"
+      "fmla v20.4s, v6.4s, v3.s[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.4s, v7.4s, v0.s[0]\n"
+      "add x26, x26, #0x10\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "fmla v13.4s, v7.4s, v1.s[0]\n"
+      "add x24, x24, #0x10\n"
+      "fmla v17.4s, v7.4s, v2.s[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "sub x11, x11, #0x4\n"
+      "fmla v21.4s, v7.4s, v3.s[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.4s, v6.4s, v0.s[0]\n"
+      "cmp x11, #0x8\n"
+      "fmla v14.4s, v6.4s, v1.s[0]\n"
+      "fmla v18.4s, v6.4s, v2.s[0]\n"
+      "fmla v22.4s, v6.4s, v3.s[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      "fmla v11.4s, v7.4s, v0.s[0]\n"
+      "fmla v15.4s, v7.4s, v1.s[0]\n"
+      "fmla v19.4s, v7.4s, v2.s[0]\n"
+      "fmla v23.4s, v7.4s, v3.s[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      "fmla v8.4s, v6.4s, v0.s[1]\n"
+      "fmla v12.4s, v6.4s, v1.s[1]\n"
+      "fmla v16.4s, v6.4s, v2.s[1]\n"
+      "fmla v20.4s, v6.4s, v3.s[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      "fmla v9.4s, v7.4s, v0.s[1]\n"
+      "fmla v13.4s, v7.4s, v1.s[1]\n"
+      "fmla v17.4s, v7.4s, v2.s[1]\n"
+      "fmla v21.4s, v7.4s, v3.s[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      "fmla v10.4s, v6.4s, v0.s[1]\n"
+      "fmla v14.4s, v6.4s, v1.s[1]\n"
+      "fmla v18.4s, v6.4s, v2.s[1]\n"
+      "fmla v22.4s, v6.4s, v3.s[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      "fmla v11.4s, v7.4s, v0.s[1]\n"
+      "fmla v15.4s, v7.4s, v1.s[1]\n"
+      "fmla v19.4s, v7.4s, v2.s[1]\n"
+      "fmla v23.4s, v7.4s, v3.s[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      "fmla v8.4s, v6.4s, v0.s[2]\n"
+      "fmla v12.4s, v6.4s, v1.s[2]\n"
+      "fmla v16.4s, v6.4s, v2.s[2]\n"
+      "fmla v20.4s, v6.4s, v3.s[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      "fmla v9.4s, v7.4s, v0.s[2]\n"
+      "fmla v13.4s, v7.4s, v1.s[2]\n"
+      "fmla v17.4s, v7.4s, v2.s[2]\n"
+      "fmla v21.4s, v7.4s, v3.s[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      "fmla v10.4s, v6.4s, v0.s[2]\n"
+      "fmla v14.4s, v6.4s, v1.s[2]\n"
+      "fmla v18.4s, v6.4s, v2.s[2]\n"
+      "fmla v22.4s, v6.4s, v3.s[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      "fmla v11.4s, v7.4s, v0.s[2]\n"
+      "fmla v15.4s, v7.4s, v1.s[2]\n"
+      "fmla v19.4s, v7.4s, v2.s[2]\n"
+      "fmla v23.4s, v7.4s, v3.s[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      "fmla v8.4s, v6.4s, v0.s[3]\n"
+      "fmla v12.4s, v6.4s, v1.s[3]\n"
+      "fmla v16.4s, v6.4s, v2.s[3]\n"
+      "fmla v20.4s, v6.4s, v3.s[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      "fmla v9.4s, v7.4s, v0.s[3]\n"
+      "fmla v13.4s, v7.4s, v1.s[3]\n"
+      "fmla v17.4s, v7.4s, v2.s[3]\n"
+      "fmla v21.4s, v7.4s, v3.s[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "add x15, x15, #0x100\n"
+      "fmla v10.4s, v6.4s, v0.s[3]\n"
+      "fmla v14.4s, v6.4s, v1.s[3]\n"
+      "fmla v18.4s, v6.4s, v2.s[3]\n"
+      "fmla v22.4s, v6.4s, v3.s[3]\n"
+      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "fmla v15.4s, v7.4s, v1.s[3]\n"
+      "fmla v19.4s, v7.4s, v2.s[3]\n"
+      "fmla v23.4s, v7.4s, v3.s[3]\n"
+      "bge 121b\n"
+      "122:"  // Height 4: Multiply loop: Single iteration only
+      "sub x11, x11, #0x4\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.4s, v6.4s, v0.s[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.4s, v6.4s, v1.s[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla v16.4s, v6.4s, v2.s[0]\n"
+      "add x28, x28, #0x10\n"
+      "fmla v20.4s, v6.4s, v3.s[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.4s, v7.4s, v0.s[0]\n"
+      "add x26, x26, #0x10\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "fmla v13.4s, v7.4s, v1.s[0]\n"
+      "add x24, x24, #0x10\n"
+      "fmla v17.4s, v7.4s, v2.s[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "fmla v21.4s, v7.4s, v3.s[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.4s, v6.4s, v0.s[0]\n"
+      "fmla v14.4s, v6.4s, v1.s[0]\n"
+      "fmla v18.4s, v6.4s, v2.s[0]\n"
+      "fmla v22.4s, v6.4s, v3.s[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      "fmla v11.4s, v7.4s, v0.s[0]\n"
+      "fmla v15.4s, v7.4s, v1.s[0]\n"
+      "fmla v19.4s, v7.4s, v2.s[0]\n"
+      "fmla v23.4s, v7.4s, v3.s[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      "fmla v8.4s, v6.4s, v0.s[1]\n"
+      "fmla v12.4s, v6.4s, v1.s[1]\n"
+      "fmla v16.4s, v6.4s, v2.s[1]\n"
+      "fmla v20.4s, v6.4s, v3.s[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      "fmla v9.4s, v7.4s, v0.s[1]\n"
+      "fmla v13.4s, v7.4s, v1.s[1]\n"
+      "fmla v17.4s, v7.4s, v2.s[1]\n"
+      "fmla v21.4s, v7.4s, v3.s[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      "fmla v10.4s, v6.4s, v0.s[1]\n"
+      "fmla v14.4s, v6.4s, v1.s[1]\n"
+      "fmla v18.4s, v6.4s, v2.s[1]\n"
+      "fmla v22.4s, v6.4s, v3.s[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      "fmla v11.4s, v7.4s, v0.s[1]\n"
+      "fmla v15.4s, v7.4s, v1.s[1]\n"
+      "fmla v19.4s, v7.4s, v2.s[1]\n"
+      "fmla v23.4s, v7.4s, v3.s[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      "fmla v8.4s, v6.4s, v0.s[2]\n"
+      "fmla v12.4s, v6.4s, v1.s[2]\n"
+      "fmla v16.4s, v6.4s, v2.s[2]\n"
+      "fmla v20.4s, v6.4s, v3.s[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      "fmla v9.4s, v7.4s, v0.s[2]\n"
+      "fmla v13.4s, v7.4s, v1.s[2]\n"
+      "fmla v17.4s, v7.4s, v2.s[2]\n"
+      "fmla v21.4s, v7.4s, v3.s[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      "fmla v10.4s, v6.4s, v0.s[2]\n"
+      "fmla v14.4s, v6.4s, v1.s[2]\n"
+      "fmla v18.4s, v6.4s, v2.s[2]\n"
+      "fmla v22.4s, v6.4s, v3.s[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      "fmla v11.4s, v7.4s, v0.s[2]\n"
+      "fmla v15.4s, v7.4s, v1.s[2]\n"
+      "fmla v19.4s, v7.4s, v2.s[2]\n"
+      "fmla v23.4s, v7.4s, v3.s[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      "fmla v8.4s, v6.4s, v0.s[3]\n"
+      "fmla v12.4s, v6.4s, v1.s[3]\n"
+      "fmla v16.4s, v6.4s, v2.s[3]\n"
+      "fmla v20.4s, v6.4s, v3.s[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      "fmla v9.4s, v7.4s, v0.s[3]\n"
+      "fmla v13.4s, v7.4s, v1.s[3]\n"
+      "fmla v17.4s, v7.4s, v2.s[3]\n"
+      "fmla v21.4s, v7.4s, v3.s[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "add x15, x15, #0x100\n"
+      "fmla v10.4s, v6.4s, v0.s[3]\n"
+      "fmla v14.4s, v6.4s, v1.s[3]\n"
+      "fmla v18.4s, v6.4s, v2.s[3]\n"
+      "fmla v22.4s, v6.4s, v3.s[3]\n"
+      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "fmla v15.4s, v7.4s, v1.s[3]\n"
+      "fmla v19.4s, v7.4s, v2.s[3]\n"
+      "fmla v23.4s, v7.4s, v3.s[3]\n"
+      "123:"  // Height 4: Multiply loop: Main loop skip
+      "cbz x11, 125f\n"
+      "124:"  // Height 4: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
+      "ldr s3, [x24], #0x4\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.4s, v6.4s, v0.s[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.4s, v6.4s, v1.s[0]\n"
+      "sub x11, x11, #0x1\n"
+      "fmla v16.4s, v6.4s, v2.s[0]\n"
+      "fmla v20.4s, v6.4s, v3.s[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.4s, v7.4s, v0.s[0]\n"
+      "fmla v13.4s, v7.4s, v1.s[0]\n"
+      "fmla v17.4s, v7.4s, v2.s[0]\n"
+      "fmla v21.4s, v7.4s, v3.s[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.4s, v6.4s, v0.s[0]\n"
+      "add x15, x15, #0x40\n"
+      "fmla v14.4s, v6.4s, v1.s[0]\n"
+      "fmla v18.4s, v6.4s, v2.s[0]\n"
+      "fmla v22.4s, v6.4s, v3.s[0]\n"
+      "fmla v11.4s, v7.4s, v0.s[0]\n"
+      "fmla v15.4s, v7.4s, v1.s[0]\n"
+      "fmla v19.4s, v7.4s, v2.s[0]\n"
+      "fmla v23.4s, v7.4s, v3.s[0]\n"
+      "cbnz x11, 124b\n"
+      "125:"  // Height 4: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 118b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "tbz %x[flags], #1, 126f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v1.4s }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v0.4s }, [x19]\n"
+      "fmin v8.4s, v8.4s, v0.4s\n"
+      "fmin v9.4s, v9.4s, v0.4s\n"
+      "fmin v10.4s, v10.4s, v0.4s\n"
+      "fmin v11.4s, v11.4s, v0.4s\n"
+      "fmax v8.4s, v8.4s, v1.4s\n"
+      "fmax v9.4s, v9.4s, v1.4s\n"
+      "fmax v10.4s, v10.4s, v1.4s\n"
+      "fmax v11.4s, v11.4s, v1.4s\n"
+      "fmin v12.4s, v12.4s, v0.4s\n"
+      "fmin v13.4s, v13.4s, v0.4s\n"
+      "fmin v14.4s, v14.4s, v0.4s\n"
+      "fmax v12.4s, v12.4s, v1.4s\n"
+      "fmax v13.4s, v13.4s, v1.4s\n"
+      "fmax v14.4s, v14.4s, v1.4s\n"
+      "fmin v15.4s, v15.4s, v0.4s\n"
+      "fmin v16.4s, v16.4s, v0.4s\n"
+      "fmin v17.4s, v17.4s, v0.4s\n"
+      "fmax v15.4s, v15.4s, v1.4s\n"
+      "fmax v16.4s, v16.4s, v1.4s\n"
+      "fmax v17.4s, v17.4s, v1.4s\n"
+      "fmin v18.4s, v18.4s, v0.4s\n"
+      "fmin v19.4s, v19.4s, v0.4s\n"
+      "fmin v20.4s, v20.4s, v0.4s\n"
+      "fmax v18.4s, v18.4s, v1.4s\n"
+      "fmax v19.4s, v19.4s, v1.4s\n"
+      "fmax v20.4s, v20.4s, v1.4s\n"
+      "fmin v21.4s, v21.4s, v0.4s\n"
+      "fmin v22.4s, v22.4s, v0.4s\n"
+      "fmin v23.4s, v23.4s, v0.4s\n"
+      "fmax v21.4s, v21.4s, v1.4s\n"
+      "fmax v22.4s, v22.4s, v1.4s\n"
+      "fmax v23.4s, v23.4s, v1.4s\n"
+      "126:"  // Height 4: No activation
+      "cmp x16, #0x10\n"
+      "bge 135f\n"
+      "tbz x16, #3, 130f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v9.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v13.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "st1 { v17.4s }, [x27], #0x10\n"
+      "st1 { v20.4s }, [x25], #0x10\n"
+      "st1 { v21.4s }, [x25], #0x10\n"
+      "tbz x16, #2, 128f\n"
+      "st1 { v10.4s }, [x13], #0x10\n"
+      "st1 { v14.4s }, [x9], #0x10\n"
+      "st1 { v18.4s }, [x27], #0x10\n"
+      "st1 { v22.4s }, [x25], #0x10\n"
+      "tbz x16, #1, 127f\n"
+      "str d11, [x13], #0x8\n"
+      "str d15, [x9], #0x8\n"
+      "str d19, [x27], #0x8\n"
+      "str d23, [x25], #0x8\n"
+      "tbz x16, #0, 134f\n"
+      "st1 { v11.s }[2], [x13]\n"
+      "st1 { v15.s }[2], [x9]\n"
+      "st1 { v19.s }[2], [x27]\n"
+      "st1 { v23.s }[2], [x25]\n"
+      "b 134f\n"
+      "127:"  // Height 4: Partial direct writeback: partial_1_12
+      "tbz x16, #0, 134f\n"
+      "str s11, [x13, #0x0]\n"
+      "str s15, [x9, #0x0]\n"
+      "str s19, [x27, #0x0]\n"
+      "str s23, [x25, #0x0]\n"
+      "b 134f\n"
+      "128:"  // Height 4: Partial direct writeback: partial_2_8
+      "tbz x16, #1, 129f\n"
+      "str d10, [x13], #0x8\n"
+      "str d14, [x9], #0x8\n"
+      "str d18, [x27], #0x8\n"
+      "str d22, [x25], #0x8\n"
+      "tbz x16, #0, 134f\n"
+      "st1 { v10.s }[2], [x13]\n"
+      "st1 { v14.s }[2], [x9]\n"
+      "st1 { v18.s }[2], [x27]\n"
+      "st1 { v22.s }[2], [x25]\n"
+      "b 134f\n"
+      "129:"  // Height 4: Partial direct writeback: partial_1_8
+      "tbz x16, #0, 134f\n"
+      "str s10, [x13, #0x0]\n"
+      "str s14, [x9, #0x0]\n"
+      "str s18, [x27, #0x0]\n"
+      "str s22, [x25, #0x0]\n"
+      "b 134f\n"
+      "130:"  // Height 4: Partial direct writeback: partial_4_0
+      "tbz x16, #2, 132f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "st1 { v20.4s }, [x25], #0x10\n"
+      "tbz x16, #1, 131f\n"
+      "str d9, [x13], #0x8\n"
+      "str d13, [x9], #0x8\n"
+      "str d17, [x27], #0x8\n"
+      "str d21, [x25], #0x8\n"
+      "tbz x16, #0, 134f\n"
+      "st1 { v9.s }[2], [x13]\n"
+      "st1 { v13.s }[2], [x9]\n"
+      "st1 { v17.s }[2], [x27]\n"
+      "st1 { v21.s }[2], [x25]\n"
+      "b 134f\n"
+      "131:"  // Height 4: Partial direct writeback: partial_1_4
+      "tbz x16, #0, 134f\n"
+      "str s9, [x13, #0x0]\n"
+      "str s13, [x9, #0x0]\n"
+      "str s17, [x27, #0x0]\n"
+      "str s21, [x25, #0x0]\n"
+      "b 134f\n"
+      "132:"  // Height 4: Partial direct writeback: partial_2_0
+      "tbz x16, #1, 133f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "str d16, [x27], #0x8\n"
+      "str d20, [x25], #0x8\n"
+      "tbz x16, #0, 134f\n"
+      "st1 { v8.s }[2], [x13]\n"
+      "st1 { v12.s }[2], [x9]\n"
+      "st1 { v16.s }[2], [x27]\n"
+      "st1 { v20.s }[2], [x25]\n"
+      "b 134f\n"
+      "133:"  // Height 4: Partial direct writeback: partial_1_0
+      "str s8, [x13, #0x0]\n"
+      "str s12, [x9, #0x0]\n"
+      "str s16, [x27, #0x0]\n"
+      "str s20, [x25, #0x0]\n"
+      "134:"  // Height 4: Partial direct writeback: Done
+      "b 136f\n"
+      "135:"  // Height 4: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "str q12, [x9, #0x0]\n"
+      "str q13, [x9, #0x10]\n"
+      "str q14, [x9, #0x20]\n"
+      "str q15, [x9, #0x30]\n"
+      "str q16, [x27, #0x0]\n"
+      "str q17, [x27, #0x10]\n"
+      "str q18, [x27, #0x20]\n"
+      "str q19, [x27, #0x30]\n"
+      "str q20, [x25, #0x0]\n"
+      "str q21, [x25, #0x10]\n"
+      "str q22, [x25, #0x20]\n"
+      "str q23, [x25, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "add x9, x9, #0x40\n"
+      "add x27, x27, #0x40\n"
+      "add x25, x25, #0x40\n"
+      "136:"  // Height 4: Writeback done
+      "subs x16, x16, #0x10\n"
+      "bgt 105b\n"
+      "b 206f\n"
+      "137:"  // Height 5
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 138f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "ldr x23, [%x[output_ptr], #0x20]\n"
+      "add x27, x27, x19, LSL #2\n"
+      "add x25, x25, x19, LSL #2\n"
+      "add x23, x23, x19, LSL #2\n"
+      "b 139f\n"
+      "138:"  // Height 5: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "add x23, x25, x19, LSL #2\n"
+      "139:"  // Height 5: Column loop
+      "cbz x14, 140f\n"
+      "ldr q8, [x14, #0x0]\n"
+      "mov v12.16b, v8.16b\n"
+      "ldr q9, [x14, #0x10]\n"
+      "mov v16.16b, v8.16b\n"
+      "ldr q10, [x14, #0x20]\n"
+      "mov v20.16b, v8.16b\n"
+      "ldr q11, [x14, #0x30]\n"
+      "mov v24.16b, v8.16b\n"
+      "add x14, x14, #0x40\n"
+      "mov v13.16b, v9.16b\n"
+      "mov v17.16b, v9.16b\n"
+      "mov v14.16b, v10.16b\n"
+      "mov v15.16b, v11.16b\n"
+      "mov v18.16b, v10.16b\n"
+      "mov v19.16b, v11.16b\n"
+      "mov v21.16b, v9.16b\n"
+      "mov v22.16b, v10.16b\n"
+      "mov v23.16b, v11.16b\n"
+      "mov v25.16b, v9.16b\n"
+      "mov v26.16b, v10.16b\n"
+      "mov v27.16b, v11.16b\n"
+      "b 151f\n"
+      "140:"  // Height 5: no bias
+      "tbz %x[flags], #0, 150f\n"
+      "cmp x16, #0x10\n"
+      "bge 149f\n"
+      "tbz x16, #3, 144f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "ld1 { v20.4s }, [x25], #0x10\n"
+      "ld1 { v24.4s }, [x23], #0x10\n"
+      "ld1 { v9.4s }, [x13], #0x10\n"
+      "ld1 { v13.4s }, [x9], #0x10\n"
+      "ld1 { v17.4s }, [x27], #0x10\n"
+      "ld1 { v21.4s }, [x25], #0x10\n"
+      "ld1 { v25.4s }, [x23], #0x10\n"
+      "tbz x16, #2, 142f\n"
+      "ld1 { v10.4s }, [x13], #0x10\n"
+      "ld1 { v14.4s }, [x9], #0x10\n"
+      "ld1 { v18.4s }, [x27], #0x10\n"
+      "ld1 { v22.4s }, [x25], #0x10\n"
+      "ld1 { v26.4s }, [x23], #0x10\n"
+      "tbz x16, #1, 141f\n"
+      "mov x19, #0x38\n"
+      "ldr d11, [x13], #0x8\n"
+      "ldr d15, [x9], #0x8\n"
+      "ldr d19, [x27], #0x8\n"
+      "ldr d23, [x25], #0x8\n"
+      "ldr d27, [x23], #0x8\n"
+      "tbz x16, #0, 148f\n"
+      "ld1 { v11.s }[2], [x13]\n"
+      "ld1 { v15.s }[2], [x9]\n"
+      "ld1 { v19.s }[2], [x27]\n"
+      "ld1 { v23.s }[2], [x25]\n"
+      "ld1 { v27.s }[2], [x23]\n"
+      "b 148f\n"
+      "141:"  // Height 5: Partial accumulate: partial_1_12
+      "mov x19, #0x30\n"
+      "tbz x16, #0, 148f\n"
+      "ldr s11, [x13, #0x0]\n"
+      "ldr s15, [x9, #0x0]\n"
+      "ldr s19, [x27, #0x0]\n"
+      "ldr s23, [x25, #0x0]\n"
+      "ldr s27, [x23, #0x0]\n"
+      "b 148f\n"
+      "142:"  // Height 5: Partial accumulate: partial_2_8
+      "tbz x16, #1, 143f\n"
+      "ldr d10, [x13], #0x8\n"
+      "ldr d14, [x9], #0x8\n"
+      "ldr d18, [x27], #0x8\n"
+      "ldr d22, [x25], #0x8\n"
+      "ldr d26, [x23], #0x8\n"
+      "mov x19, #0x28\n"
+      "tbz x16, #0, 148f\n"
+      "ld1 { v10.s }[2], [x13]\n"
+      "ld1 { v14.s }[2], [x9]\n"
+      "ld1 { v18.s }[2], [x27]\n"
+      "ld1 { v22.s }[2], [x25]\n"
+      "ld1 { v26.s }[2], [x23]\n"
+      "b 148f\n"
+      "143:"  // Height 5: Partial accumulate: partial_1_8
+      "mov x19, #0x20\n"
+      "tbz x16, #0, 148f\n"
+      "ldr s10, [x13, #0x0]\n"
+      "ldr s14, [x9, #0x0]\n"
+      "ldr s18, [x27, #0x0]\n"
+      "ldr s22, [x25, #0x0]\n"
+      "ldr s26, [x23, #0x0]\n"
+      "b 148f\n"
+      "144:"  // Height 5: Partial accumulate: partial_4_0
+      "tbz x16, #2, 146f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "ld1 { v20.4s }, [x25], #0x10\n"
+      "ld1 { v24.4s }, [x23], #0x10\n"
+      "tbz x16, #1, 145f\n"
+      "mov x19, #0x18\n"
+      "ldr d9, [x13], #0x8\n"
+      "ldr d13, [x9], #0x8\n"
+      "ldr d17, [x27], #0x8\n"
+      "ldr d21, [x25], #0x8\n"
+      "ldr d25, [x23], #0x8\n"
+      "tbz x16, #0, 148f\n"
+      "ld1 { v9.s }[2], [x13]\n"
+      "ld1 { v13.s }[2], [x9]\n"
+      "ld1 { v17.s }[2], [x27]\n"
+      "ld1 { v21.s }[2], [x25]\n"
+      "ld1 { v25.s }[2], [x23]\n"
+      "b 148f\n"
+      "145:"  // Height 5: Partial accumulate: partial_1_4
+      "mov x19, #0x10\n"
+      "tbz x16, #0, 148f\n"
+      "ldr s9, [x13, #0x0]\n"
+      "ldr s13, [x9, #0x0]\n"
+      "ldr s17, [x27, #0x0]\n"
+      "ldr s21, [x25, #0x0]\n"
+      "ldr s25, [x23, #0x0]\n"
+      "b 148f\n"
+      "146:"  // Height 5: Partial accumulate: partial_2_0
+      "tbz x16, #1, 147f\n"
+      "ldr d8, [x13], #0x8\n"
+      "ldr d12, [x9], #0x8\n"
+      "ldr d16, [x27], #0x8\n"
+      "ldr d20, [x25], #0x8\n"
+      "ldr d24, [x23], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x16, #0, 148f\n"
+      "ld1 { v8.s }[2], [x13]\n"
+      "ld1 { v12.s }[2], [x9]\n"
+      "ld1 { v16.s }[2], [x27]\n"
+      "ld1 { v20.s }[2], [x25]\n"
+      "ld1 { v24.s }[2], [x23]\n"
+      "b 148f\n"
+      "147:"  // Height 5: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s8, [x13, #0x0]\n"
+      "ldr s12, [x9, #0x0]\n"
+      "ldr s16, [x27, #0x0]\n"
+      "ldr s20, [x25, #0x0]\n"
+      "ldr s24, [x23, #0x0]\n"
+      "148:"  // Height 5: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "sub x9, x9, x19\n"
+      "sub x27, x27, x19\n"
+      "sub x25, x25, x19\n"
+      "sub x23, x23, x19\n"
+      "b 151f\n"
+      "149:"  // Height 5: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "ldr q12, [x9, #0x0]\n"
+      "ldr q13, [x9, #0x10]\n"
+      "ldr q14, [x9, #0x20]\n"
+      "ldr q15, [x9, #0x30]\n"
+      "ldr q16, [x27, #0x0]\n"
+      "ldr q17, [x27, #0x10]\n"
+      "ldr q18, [x27, #0x20]\n"
+      "ldr q19, [x27, #0x30]\n"
+      "ldr q20, [x25, #0x0]\n"
+      "ldr q21, [x25, #0x10]\n"
+      "ldr q22, [x25, #0x20]\n"
+      "ldr q23, [x25, #0x30]\n"
+      "ldr q24, [x23, #0x0]\n"
+      "ldr q25, [x23, #0x10]\n"
+      "ldr q26, [x23, #0x20]\n"
+      "ldr q27, [x23, #0x30]\n"
+      "b 151f\n"
+      "150:"  // Height 5: no accumulate
+      "movi v8.16b, #0x0\n"
+      "movi v9.16b, #0x0\n"
+      "movi v10.16b, #0x0\n"
+      "movi v11.16b, #0x0\n"
+      "movi v12.16b, #0x0\n"
+      "movi v13.16b, #0x0\n"
+      "movi v14.16b, #0x0\n"
+      "movi v15.16b, #0x0\n"
+      "movi v16.16b, #0x0\n"
+      "movi v17.16b, #0x0\n"
+      "movi v18.16b, #0x0\n"
+      "movi v19.16b, #0x0\n"
+      "movi v20.16b, #0x0\n"
+      "movi v21.16b, #0x0\n"
+      "movi v22.16b, #0x0\n"
+      "movi v23.16b, #0x0\n"
+      "movi v24.16b, #0x0\n"
+      "movi v25.16b, #0x0\n"
+      "movi v26.16b, #0x0\n"
+      "movi v27.16b, #0x0\n"
+      "151:"  // Height 5: setup done
+      "mov x12, #0x0\n"
+      "152:"  // Height 5: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 153f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "cbnz x12, 154f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #2\n"
+      "add x28, x28, x19, LSL #2\n"
+      "add x26, x26, x19, LSL #2\n"
+      "add x24, x24, x19, LSL #2\n"
+      "add x22, x22, x19, LSL #2\n"
+      "b 154f\n"
+      "153:"  // Height 5: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #2\n"
+      "add x26, x28, x19, LSL #2\n"
+      "add x24, x26, x19, LSL #2\n"
+      "add x22, x24, x19, LSL #2\n"
+      "154:"  // Height 5: input setup done
+      "cmp x11, #0x4\n"
+      "blt 157f\n"
+      "cmp x11, #0x8\n"
+      "blt 156f\n"
+      "155:"  // Height 5: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q4, [x22, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.4s, v6.4s, v0.s[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.4s, v6.4s, v1.s[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla v16.4s, v6.4s, v2.s[0]\n"
+      "add x28, x28, #0x10\n"
+      "fmla v20.4s, v6.4s, v3.s[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      "fmla v24.4s, v6.4s, v4.s[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.4s, v7.4s, v0.s[0]\n"
+      "add x24, x24, #0x10\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "fmla v13.4s, v7.4s, v1.s[0]\n"
+      "add x22, x22, #0x10\n"
+      "fmla v17.4s, v7.4s, v2.s[0]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "sub x11, x11, #0x4\n"
+      "fmla v21.4s, v7.4s, v3.s[0]\n"
+      "cmp x11, #0x8\n"
+      "fmla v25.4s, v7.4s, v4.s[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.4s, v6.4s, v0.s[0]\n"
+      "fmla v14.4s, v6.4s, v1.s[0]\n"
+      "fmla v18.4s, v6.4s, v2.s[0]\n"
+      "fmla v22.4s, v6.4s, v3.s[0]\n"
+      "fmla v26.4s, v6.4s, v4.s[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      "fmla v11.4s, v7.4s, v0.s[0]\n"
+      "fmla v15.4s, v7.4s, v1.s[0]\n"
+      "fmla v19.4s, v7.4s, v2.s[0]\n"
+      "fmla v23.4s, v7.4s, v3.s[0]\n"
+      "fmla v27.4s, v7.4s, v4.s[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      "fmla v8.4s, v6.4s, v0.s[1]\n"
+      "fmla v12.4s, v6.4s, v1.s[1]\n"
+      "fmla v16.4s, v6.4s, v2.s[1]\n"
+      "fmla v20.4s, v6.4s, v3.s[1]\n"
+      "fmla v24.4s, v6.4s, v4.s[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      "fmla v9.4s, v7.4s, v0.s[1]\n"
+      "fmla v13.4s, v7.4s, v1.s[1]\n"
+      "fmla v17.4s, v7.4s, v2.s[1]\n"
+      "fmla v21.4s, v7.4s, v3.s[1]\n"
+      "fmla v25.4s, v7.4s, v4.s[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      "fmla v10.4s, v6.4s, v0.s[1]\n"
+      "fmla v14.4s, v6.4s, v1.s[1]\n"
+      "fmla v18.4s, v6.4s, v2.s[1]\n"
+      "fmla v22.4s, v6.4s, v3.s[1]\n"
+      "fmla v26.4s, v6.4s, v4.s[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      "fmla v11.4s, v7.4s, v0.s[1]\n"
+      "fmla v15.4s, v7.4s, v1.s[1]\n"
+      "fmla v19.4s, v7.4s, v2.s[1]\n"
+      "fmla v23.4s, v7.4s, v3.s[1]\n"
+      "fmla v27.4s, v7.4s, v4.s[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      "fmla v8.4s, v6.4s, v0.s[2]\n"
+      "fmla v12.4s, v6.4s, v1.s[2]\n"
+      "fmla v16.4s, v6.4s, v2.s[2]\n"
+      "fmla v20.4s, v6.4s, v3.s[2]\n"
+      "fmla v24.4s, v6.4s, v4.s[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      "fmla v9.4s, v7.4s, v0.s[2]\n"
+      "fmla v13.4s, v7.4s, v1.s[2]\n"
+      "fmla v17.4s, v7.4s, v2.s[2]\n"
+      "fmla v21.4s, v7.4s, v3.s[2]\n"
+      "fmla v25.4s, v7.4s, v4.s[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      "fmla v10.4s, v6.4s, v0.s[2]\n"
+      "fmla v14.4s, v6.4s, v1.s[2]\n"
+      "fmla v18.4s, v6.4s, v2.s[2]\n"
+      "fmla v22.4s, v6.4s, v3.s[2]\n"
+      "fmla v26.4s, v6.4s, v4.s[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      "fmla v11.4s, v7.4s, v0.s[2]\n"
+      "fmla v15.4s, v7.4s, v1.s[2]\n"
+      "fmla v19.4s, v7.4s, v2.s[2]\n"
+      "fmla v23.4s, v7.4s, v3.s[2]\n"
+      "fmla v27.4s, v7.4s, v4.s[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      "fmla v8.4s, v6.4s, v0.s[3]\n"
+      "fmla v12.4s, v6.4s, v1.s[3]\n"
+      "fmla v16.4s, v6.4s, v2.s[3]\n"
+      "fmla v20.4s, v6.4s, v3.s[3]\n"
+      "fmla v24.4s, v6.4s, v4.s[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      "fmla v9.4s, v7.4s, v0.s[3]\n"
+      "fmla v13.4s, v7.4s, v1.s[3]\n"
+      "fmla v17.4s, v7.4s, v2.s[3]\n"
+      "fmla v21.4s, v7.4s, v3.s[3]\n"
+      "fmla v25.4s, v7.4s, v4.s[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "fmla v10.4s, v6.4s, v0.s[3]\n"
+      "add x15, x15, #0x100\n"
+      "fmla v14.4s, v6.4s, v1.s[3]\n"
+      "fmla v18.4s, v6.4s, v2.s[3]\n"
+      "fmla v22.4s, v6.4s, v3.s[3]\n"
+      "fmla v26.4s, v6.4s, v4.s[3]\n"
+      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "fmla v15.4s, v7.4s, v1.s[3]\n"
+      "fmla v19.4s, v7.4s, v2.s[3]\n"
+      "fmla v23.4s, v7.4s, v3.s[3]\n"
+      "fmla v27.4s, v7.4s, v4.s[3]\n"
+      "bge 155b\n"
+      "156:"  // Height 5: Multiply loop: Single iteration only
+      "sub x11, x11, #0x4\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q4, [x22, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.4s, v6.4s, v0.s[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.4s, v6.4s, v1.s[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla v16.4s, v6.4s, v2.s[0]\n"
+      "add x28, x28, #0x10\n"
+      "fmla v20.4s, v6.4s, v3.s[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      "fmla v24.4s, v6.4s, v4.s[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.4s, v7.4s, v0.s[0]\n"
+      "add x24, x24, #0x10\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "fmla v13.4s, v7.4s, v1.s[0]\n"
+      "add x22, x22, #0x10\n"
+      "fmla v17.4s, v7.4s, v2.s[0]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "fmla v21.4s, v7.4s, v3.s[0]\n"
+      "fmla v25.4s, v7.4s, v4.s[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.4s, v6.4s, v0.s[0]\n"
+      "fmla v14.4s, v6.4s, v1.s[0]\n"
+      "fmla v18.4s, v6.4s, v2.s[0]\n"
+      "fmla v22.4s, v6.4s, v3.s[0]\n"
+      "fmla v26.4s, v6.4s, v4.s[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      "fmla v11.4s, v7.4s, v0.s[0]\n"
+      "fmla v15.4s, v7.4s, v1.s[0]\n"
+      "fmla v19.4s, v7.4s, v2.s[0]\n"
+      "fmla v23.4s, v7.4s, v3.s[0]\n"
+      "fmla v27.4s, v7.4s, v4.s[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      "fmla v8.4s, v6.4s, v0.s[1]\n"
+      "fmla v12.4s, v6.4s, v1.s[1]\n"
+      "fmla v16.4s, v6.4s, v2.s[1]\n"
+      "fmla v20.4s, v6.4s, v3.s[1]\n"
+      "fmla v24.4s, v6.4s, v4.s[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      "fmla v9.4s, v7.4s, v0.s[1]\n"
+      "fmla v13.4s, v7.4s, v1.s[1]\n"
+      "fmla v17.4s, v7.4s, v2.s[1]\n"
+      "fmla v21.4s, v7.4s, v3.s[1]\n"
+      "fmla v25.4s, v7.4s, v4.s[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      "fmla v10.4s, v6.4s, v0.s[1]\n"
+      "fmla v14.4s, v6.4s, v1.s[1]\n"
+      "fmla v18.4s, v6.4s, v2.s[1]\n"
+      "fmla v22.4s, v6.4s, v3.s[1]\n"
+      "fmla v26.4s, v6.4s, v4.s[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      "fmla v11.4s, v7.4s, v0.s[1]\n"
+      "fmla v15.4s, v7.4s, v1.s[1]\n"
+      "fmla v19.4s, v7.4s, v2.s[1]\n"
+      "fmla v23.4s, v7.4s, v3.s[1]\n"
+      "fmla v27.4s, v7.4s, v4.s[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      "fmla v8.4s, v6.4s, v0.s[2]\n"
+      "fmla v12.4s, v6.4s, v1.s[2]\n"
+      "fmla v16.4s, v6.4s, v2.s[2]\n"
+      "fmla v20.4s, v6.4s, v3.s[2]\n"
+      "fmla v24.4s, v6.4s, v4.s[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      "fmla v9.4s, v7.4s, v0.s[2]\n"
+      "fmla v13.4s, v7.4s, v1.s[2]\n"
+      "fmla v17.4s, v7.4s, v2.s[2]\n"
+      "fmla v21.4s, v7.4s, v3.s[2]\n"
+      "fmla v25.4s, v7.4s, v4.s[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      "fmla v10.4s, v6.4s, v0.s[2]\n"
+      "fmla v14.4s, v6.4s, v1.s[2]\n"
+      "fmla v18.4s, v6.4s, v2.s[2]\n"
+      "fmla v22.4s, v6.4s, v3.s[2]\n"
+      "fmla v26.4s, v6.4s, v4.s[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      "fmla v11.4s, v7.4s, v0.s[2]\n"
+      "fmla v15.4s, v7.4s, v1.s[2]\n"
+      "fmla v19.4s, v7.4s, v2.s[2]\n"
+      "fmla v23.4s, v7.4s, v3.s[2]\n"
+      "fmla v27.4s, v7.4s, v4.s[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      "fmla v8.4s, v6.4s, v0.s[3]\n"
+      "fmla v12.4s, v6.4s, v1.s[3]\n"
+      "fmla v16.4s, v6.4s, v2.s[3]\n"
+      "fmla v20.4s, v6.4s, v3.s[3]\n"
+      "fmla v24.4s, v6.4s, v4.s[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      "fmla v9.4s, v7.4s, v0.s[3]\n"
+      "fmla v13.4s, v7.4s, v1.s[3]\n"
+      "fmla v17.4s, v7.4s, v2.s[3]\n"
+      "fmla v21.4s, v7.4s, v3.s[3]\n"
+      "fmla v25.4s, v7.4s, v4.s[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "fmla v10.4s, v6.4s, v0.s[3]\n"
+      "add x15, x15, #0x100\n"
+      "fmla v14.4s, v6.4s, v1.s[3]\n"
+      "fmla v18.4s, v6.4s, v2.s[3]\n"
+      "fmla v22.4s, v6.4s, v3.s[3]\n"
+      "fmla v26.4s, v6.4s, v4.s[3]\n"
+      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "fmla v15.4s, v7.4s, v1.s[3]\n"
+      "fmla v19.4s, v7.4s, v2.s[3]\n"
+      "fmla v23.4s, v7.4s, v3.s[3]\n"
+      "fmla v27.4s, v7.4s, v4.s[3]\n"
+      "157:"  // Height 5: Multiply loop: Main loop skip
+      "cbz x11, 159f\n"
+      "158:"  // Height 5: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
+      "ldr s3, [x24], #0x4\n"
+      "ldr s4, [x22], #0x4\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.4s, v6.4s, v0.s[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.4s, v6.4s, v1.s[0]\n"
+      "sub x11, x11, #0x1\n"
+      "fmla v16.4s, v6.4s, v2.s[0]\n"
+      "fmla v20.4s, v6.4s, v3.s[0]\n"
+      "fmla v24.4s, v6.4s, v4.s[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.4s, v7.4s, v0.s[0]\n"
+      "fmla v13.4s, v7.4s, v1.s[0]\n"
+      "fmla v17.4s, v7.4s, v2.s[0]\n"
+      "fmla v21.4s, v7.4s, v3.s[0]\n"
+      "fmla v25.4s, v7.4s, v4.s[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.4s, v6.4s, v0.s[0]\n"
+      "add x15, x15, #0x40\n"
+      "fmla v14.4s, v6.4s, v1.s[0]\n"
+      "fmla v18.4s, v6.4s, v2.s[0]\n"
+      "fmla v22.4s, v6.4s, v3.s[0]\n"
+      "fmla v26.4s, v6.4s, v4.s[0]\n"
+      "fmla v11.4s, v7.4s, v0.s[0]\n"
+      "fmla v15.4s, v7.4s, v1.s[0]\n"
+      "fmla v19.4s, v7.4s, v2.s[0]\n"
+      "fmla v23.4s, v7.4s, v3.s[0]\n"
+      "fmla v27.4s, v7.4s, v4.s[0]\n"
+      "cbnz x11, 158b\n"
+      "159:"  // Height 5: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 152b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "tbz %x[flags], #1, 160f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v1.4s }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v0.4s }, [x19]\n"
+      "fmin v8.4s, v8.4s, v0.4s\n"
+      "fmin v9.4s, v9.4s, v0.4s\n"
+      "fmin v10.4s, v10.4s, v0.4s\n"
+      "fmin v11.4s, v11.4s, v0.4s\n"
+      "fmax v8.4s, v8.4s, v1.4s\n"
+      "fmax v9.4s, v9.4s, v1.4s\n"
+      "fmax v10.4s, v10.4s, v1.4s\n"
+      "fmax v11.4s, v11.4s, v1.4s\n"
+      "fmin v12.4s, v12.4s, v0.4s\n"
+      "fmin v13.4s, v13.4s, v0.4s\n"
+      "fmin v14.4s, v14.4s, v0.4s\n"
+      "fmax v12.4s, v12.4s, v1.4s\n"
+      "fmax v13.4s, v13.4s, v1.4s\n"
+      "fmax v14.4s, v14.4s, v1.4s\n"
+      "fmin v15.4s, v15.4s, v0.4s\n"
+      "fmin v16.4s, v16.4s, v0.4s\n"
+      "fmin v17.4s, v17.4s, v0.4s\n"
+      "fmax v15.4s, v15.4s, v1.4s\n"
+      "fmax v16.4s, v16.4s, v1.4s\n"
+      "fmax v17.4s, v17.4s, v1.4s\n"
+      "fmin v18.4s, v18.4s, v0.4s\n"
+      "fmin v19.4s, v19.4s, v0.4s\n"
+      "fmin v20.4s, v20.4s, v0.4s\n"
+      "fmax v18.4s, v18.4s, v1.4s\n"
+      "fmax v19.4s, v19.4s, v1.4s\n"
+      "fmax v20.4s, v20.4s, v1.4s\n"
+      "fmin v21.4s, v21.4s, v0.4s\n"
+      "fmin v22.4s, v22.4s, v0.4s\n"
+      "fmin v23.4s, v23.4s, v0.4s\n"
+      "fmax v21.4s, v21.4s, v1.4s\n"
+      "fmax v22.4s, v22.4s, v1.4s\n"
+      "fmax v23.4s, v23.4s, v1.4s\n"
+      "fmin v24.4s, v24.4s, v0.4s\n"
+      "fmin v25.4s, v25.4s, v0.4s\n"
+      "fmin v26.4s, v26.4s, v0.4s\n"
+      "fmax v24.4s, v24.4s, v1.4s\n"
+      "fmax v25.4s, v25.4s, v1.4s\n"
+      "fmax v26.4s, v26.4s, v1.4s\n"
+      "fmin v27.4s, v27.4s, v0.4s\n"
+      "fmax v27.4s, v27.4s, v1.4s\n"
+      "160:"  // Height 5: No activation
+      "cmp x16, #0x10\n"
+      "bge 169f\n"
+      "tbz x16, #3, 164f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v9.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v13.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "st1 { v17.4s }, [x27], #0x10\n"
+      "st1 { v20.4s }, [x25], #0x10\n"
+      "st1 { v21.4s }, [x25], #0x10\n"
+      "st1 { v24.4s }, [x23], #0x10\n"
+      "st1 { v25.4s }, [x23], #0x10\n"
+      "tbz x16, #2, 162f\n"
+      "st1 { v10.4s }, [x13], #0x10\n"
+      "st1 { v14.4s }, [x9], #0x10\n"
+      "st1 { v18.4s }, [x27], #0x10\n"
+      "st1 { v22.4s }, [x25], #0x10\n"
+      "st1 { v26.4s }, [x23], #0x10\n"
+      "tbz x16, #1, 161f\n"
+      "str d11, [x13], #0x8\n"
+      "str d15, [x9], #0x8\n"
+      "str d19, [x27], #0x8\n"
+      "str d23, [x25], #0x8\n"
+      "str d27, [x23], #0x8\n"
+      "tbz x16, #0, 168f\n"
+      "st1 { v11.s }[2], [x13]\n"
+      "st1 { v15.s }[2], [x9]\n"
+      "st1 { v19.s }[2], [x27]\n"
+      "st1 { v23.s }[2], [x25]\n"
+      "st1 { v27.s }[2], [x23]\n"
+      "b 168f\n"
+      "161:"  // Height 5: Partial direct writeback: partial_1_12
+      "tbz x16, #0, 168f\n"
+      "str s11, [x13, #0x0]\n"
+      "str s15, [x9, #0x0]\n"
+      "str s19, [x27, #0x0]\n"
+      "str s23, [x25, #0x0]\n"
+      "str s27, [x23, #0x0]\n"
+      "b 168f\n"
+      "162:"  // Height 5: Partial direct writeback: partial_2_8
+      "tbz x16, #1, 163f\n"
+      "str d10, [x13], #0x8\n"
+      "str d14, [x9], #0x8\n"
+      "str d18, [x27], #0x8\n"
+      "str d22, [x25], #0x8\n"
+      "str d26, [x23], #0x8\n"
+      "tbz x16, #0, 168f\n"
+      "st1 { v10.s }[2], [x13]\n"
+      "st1 { v14.s }[2], [x9]\n"
+      "st1 { v18.s }[2], [x27]\n"
+      "st1 { v22.s }[2], [x25]\n"
+      "st1 { v26.s }[2], [x23]\n"
+      "b 168f\n"
+      "163:"  // Height 5: Partial direct writeback: partial_1_8
+      "tbz x16, #0, 168f\n"
+      "str s10, [x13, #0x0]\n"
+      "str s14, [x9, #0x0]\n"
+      "str s18, [x27, #0x0]\n"
+      "str s22, [x25, #0x0]\n"
+      "str s26, [x23, #0x0]\n"
+      "b 168f\n"
+      "164:"  // Height 5: Partial direct writeback: partial_4_0
+      "tbz x16, #2, 166f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "st1 { v20.4s }, [x25], #0x10\n"
+      "st1 { v24.4s }, [x23], #0x10\n"
+      "tbz x16, #1, 165f\n"
+      "str d9, [x13], #0x8\n"
+      "str d13, [x9], #0x8\n"
+      "str d17, [x27], #0x8\n"
+      "str d21, [x25], #0x8\n"
+      "str d25, [x23], #0x8\n"
+      "tbz x16, #0, 168f\n"
+      "st1 { v9.s }[2], [x13]\n"
+      "st1 { v13.s }[2], [x9]\n"
+      "st1 { v17.s }[2], [x27]\n"
+      "st1 { v21.s }[2], [x25]\n"
+      "st1 { v25.s }[2], [x23]\n"
+      "b 168f\n"
+      "165:"  // Height 5: Partial direct writeback: partial_1_4
+      "tbz x16, #0, 168f\n"
+      "str s9, [x13, #0x0]\n"
+      "str s13, [x9, #0x0]\n"
+      "str s17, [x27, #0x0]\n"
+      "str s21, [x25, #0x0]\n"
+      "str s25, [x23, #0x0]\n"
+      "b 168f\n"
+      "166:"  // Height 5: Partial direct writeback: partial_2_0
+      "tbz x16, #1, 167f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "str d16, [x27], #0x8\n"
+      "str d20, [x25], #0x8\n"
+      "str d24, [x23], #0x8\n"
+      "tbz x16, #0, 168f\n"
+      "st1 { v8.s }[2], [x13]\n"
+      "st1 { v12.s }[2], [x9]\n"
+      "st1 { v16.s }[2], [x27]\n"
+      "st1 { v20.s }[2], [x25]\n"
+      "st1 { v24.s }[2], [x23]\n"
+      "b 168f\n"
+      "167:"  // Height 5: Partial direct writeback: partial_1_0
+      "str s8, [x13, #0x0]\n"
+      "str s12, [x9, #0x0]\n"
+      "str s16, [x27, #0x0]\n"
+      "str s20, [x25, #0x0]\n"
+      "str s24, [x23, #0x0]\n"
+      "168:"  // Height 5: Partial direct writeback: Done
+      "b 170f\n"
+      "169:"  // Height 5: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "str q12, [x9, #0x0]\n"
+      "str q13, [x9, #0x10]\n"
+      "str q14, [x9, #0x20]\n"
+      "str q15, [x9, #0x30]\n"
+      "str q16, [x27, #0x0]\n"
+      "str q17, [x27, #0x10]\n"
+      "str q18, [x27, #0x20]\n"
+      "str q19, [x27, #0x30]\n"
+      "str q20, [x25, #0x0]\n"
+      "str q21, [x25, #0x10]\n"
+      "str q22, [x25, #0x20]\n"
+      "str q23, [x25, #0x30]\n"
+      "str q24, [x23, #0x0]\n"
+      "str q25, [x23, #0x10]\n"
+      "str q26, [x23, #0x20]\n"
+      "str q27, [x23, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "add x9, x9, #0x40\n"
+      "add x27, x27, #0x40\n"
+      "add x25, x25, #0x40\n"
+      "add x23, x23, #0x40\n"
+      "170:"  // Height 5: Writeback done
+      "subs x16, x16, #0x10\n"
+      "bgt 139b\n"
+      "b 206f\n"
+      "171:"  // Height 6
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 172f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "ldr x23, [%x[output_ptr], #0x20]\n"
+      "add x27, x27, x19, LSL #2\n"
+      "ldr x21, [%x[output_ptr], #0x28]\n"
+      "add %x[output_ptr], %x[output_ptr], #0x30\n"
+      "add x25, x25, x19, LSL #2\n"
+      "add x23, x23, x19, LSL #2\n"
+      "add x21, x21, x19, LSL #2\n"
+      "b 173f\n"
+      "172:"  // Height 6: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "add x23, x25, x19, LSL #2\n"
+      "add x21, x23, x19, LSL #2\n"
+      "add %x[output_ptr], x21, x19, LSL #2\n"
+      "173:"  // Height 6: Column loop
+      "cbz x14, 174f\n"
+      "ldr q8, [x14, #0x0]\n"
+      "mov v12.16b, v8.16b\n"
+      "ldr q9, [x14, #0x10]\n"
+      "mov v16.16b, v8.16b\n"
+      "ldr q10, [x14, #0x20]\n"
+      "mov v20.16b, v8.16b\n"
+      "ldr q11, [x14, #0x30]\n"
+      "mov v24.16b, v8.16b\n"
+      "add x14, x14, #0x40\n"
+      "mov v28.16b, v8.16b\n"
+      "mov v13.16b, v9.16b\n"
+      "mov v17.16b, v9.16b\n"
+      "mov v14.16b, v10.16b\n"
+      "mov v15.16b, v11.16b\n"
+      "mov v18.16b, v10.16b\n"
+      "mov v19.16b, v11.16b\n"
+      "mov v21.16b, v9.16b\n"
+      "mov v22.16b, v10.16b\n"
+      "mov v23.16b, v11.16b\n"
+      "mov v25.16b, v9.16b\n"
+      "mov v26.16b, v10.16b\n"
+      "mov v27.16b, v11.16b\n"
+      "mov v29.16b, v9.16b\n"
+      "mov v30.16b, v10.16b\n"
+      "mov v31.16b, v11.16b\n"
+      "b 185f\n"
+      "174:"  // Height 6: no bias
+      "tbz %x[flags], #0, 184f\n"
+      "cmp x16, #0x10\n"
+      "bge 183f\n"
+      "tbz x16, #3, 178f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "ld1 { v20.4s }, [x25], #0x10\n"
+      "ld1 { v24.4s }, [x23], #0x10\n"
+      "ld1 { v28.4s }, [x21], #0x10\n"
+      "ld1 { v9.4s }, [x13], #0x10\n"
+      "ld1 { v13.4s }, [x9], #0x10\n"
+      "ld1 { v17.4s }, [x27], #0x10\n"
+      "ld1 { v21.4s }, [x25], #0x10\n"
+      "ld1 { v25.4s }, [x23], #0x10\n"
+      "ld1 { v29.4s }, [x21], #0x10\n"
+      "tbz x16, #2, 176f\n"
+      "ld1 { v10.4s }, [x13], #0x10\n"
+      "ld1 { v14.4s }, [x9], #0x10\n"
+      "ld1 { v18.4s }, [x27], #0x10\n"
+      "ld1 { v22.4s }, [x25], #0x10\n"
+      "ld1 { v26.4s }, [x23], #0x10\n"
+      "ld1 { v30.4s }, [x21], #0x10\n"
+      "tbz x16, #1, 175f\n"
+      "mov x19, #0x38\n"
+      "ldr d11, [x13], #0x8\n"
+      "ldr d15, [x9], #0x8\n"
+      "ldr d19, [x27], #0x8\n"
+      "ldr d23, [x25], #0x8\n"
+      "ldr d27, [x23], #0x8\n"
+      "ldr d31, [x21], #0x8\n"
+      "tbz x16, #0, 182f\n"
+      "ld1 { v11.s }[2], [x13]\n"
+      "ld1 { v15.s }[2], [x9]\n"
+      "ld1 { v19.s }[2], [x27]\n"
+      "ld1 { v23.s }[2], [x25]\n"
+      "ld1 { v27.s }[2], [x23]\n"
+      "ld1 { v31.s }[2], [x21]\n"
+      "b 182f\n"
+      "175:"  // Height 6: Partial accumulate: partial_1_12
+      "mov x19, #0x30\n"
+      "tbz x16, #0, 182f\n"
+      "ldr s11, [x13, #0x0]\n"
+      "ldr s15, [x9, #0x0]\n"
+      "ldr s19, [x27, #0x0]\n"
+      "ldr s23, [x25, #0x0]\n"
+      "ldr s27, [x23, #0x0]\n"
+      "ldr s31, [x21, #0x0]\n"
+      "b 182f\n"
+      "176:"  // Height 6: Partial accumulate: partial_2_8
+      "tbz x16, #1, 177f\n"
+      "ldr d10, [x13], #0x8\n"
+      "ldr d14, [x9], #0x8\n"
+      "ldr d18, [x27], #0x8\n"
+      "ldr d22, [x25], #0x8\n"
+      "ldr d26, [x23], #0x8\n"
+      "ldr d30, [x21], #0x8\n"
+      "mov x19, #0x28\n"
+      "tbz x16, #0, 182f\n"
+      "ld1 { v10.s }[2], [x13]\n"
+      "ld1 { v14.s }[2], [x9]\n"
+      "ld1 { v18.s }[2], [x27]\n"
+      "ld1 { v22.s }[2], [x25]\n"
+      "ld1 { v26.s }[2], [x23]\n"
+      "ld1 { v30.s }[2], [x21]\n"
+      "b 182f\n"
+      "177:"  // Height 6: Partial accumulate: partial_1_8
+      "mov x19, #0x20\n"
+      "tbz x16, #0, 182f\n"
+      "ldr s10, [x13, #0x0]\n"
+      "ldr s14, [x9, #0x0]\n"
+      "ldr s18, [x27, #0x0]\n"
+      "ldr s22, [x25, #0x0]\n"
+      "ldr s26, [x23, #0x0]\n"
+      "ldr s30, [x21, #0x0]\n"
+      "b 182f\n"
+      "178:"  // Height 6: Partial accumulate: partial_4_0
+      "tbz x16, #2, 180f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "ld1 { v20.4s }, [x25], #0x10\n"
+      "ld1 { v24.4s }, [x23], #0x10\n"
+      "ld1 { v28.4s }, [x21], #0x10\n"
+      "tbz x16, #1, 179f\n"
+      "mov x19, #0x18\n"
+      "ldr d9, [x13], #0x8\n"
+      "ldr d13, [x9], #0x8\n"
+      "ldr d17, [x27], #0x8\n"
+      "ldr d21, [x25], #0x8\n"
+      "ldr d25, [x23], #0x8\n"
+      "ldr d29, [x21], #0x8\n"
+      "tbz x16, #0, 182f\n"
+      "ld1 { v9.s }[2], [x13]\n"
+      "ld1 { v13.s }[2], [x9]\n"
+      "ld1 { v17.s }[2], [x27]\n"
+      "ld1 { v21.s }[2], [x25]\n"
+      "ld1 { v25.s }[2], [x23]\n"
+      "ld1 { v29.s }[2], [x21]\n"
+      "b 182f\n"
+      "179:"  // Height 6: Partial accumulate: partial_1_4
+      "mov x19, #0x10\n"
+      "tbz x16, #0, 182f\n"
+      "ldr s9, [x13, #0x0]\n"
+      "ldr s13, [x9, #0x0]\n"
+      "ldr s17, [x27, #0x0]\n"
+      "ldr s21, [x25, #0x0]\n"
+      "ldr s25, [x23, #0x0]\n"
+      "ldr s29, [x21, #0x0]\n"
+      "b 182f\n"
+      "180:"  // Height 6: Partial accumulate: partial_2_0
+      "tbz x16, #1, 181f\n"
+      "ldr d8, [x13], #0x8\n"
+      "ldr d12, [x9], #0x8\n"
+      "ldr d16, [x27], #0x8\n"
+      "ldr d20, [x25], #0x8\n"
+      "ldr d24, [x23], #0x8\n"
+      "ldr d28, [x21], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x16, #0, 182f\n"
+      "ld1 { v8.s }[2], [x13]\n"
+      "ld1 { v12.s }[2], [x9]\n"
+      "ld1 { v16.s }[2], [x27]\n"
+      "ld1 { v20.s }[2], [x25]\n"
+      "ld1 { v24.s }[2], [x23]\n"
+      "ld1 { v28.s }[2], [x21]\n"
+      "b 182f\n"
+      "181:"  // Height 6: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s8, [x13, #0x0]\n"
+      "ldr s12, [x9, #0x0]\n"
+      "ldr s16, [x27, #0x0]\n"
+      "ldr s20, [x25, #0x0]\n"
+      "ldr s24, [x23, #0x0]\n"
+      "ldr s28, [x21, #0x0]\n"
+      "182:"  // Height 6: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "sub x9, x9, x19\n"
+      "sub x27, x27, x19\n"
+      "sub x25, x25, x19\n"
+      "sub x23, x23, x19\n"
+      "sub x21, x21, x19\n"
+      "b 185f\n"
+      "183:"  // Height 6: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "ldr q12, [x9, #0x0]\n"
+      "ldr q13, [x9, #0x10]\n"
+      "ldr q14, [x9, #0x20]\n"
+      "ldr q15, [x9, #0x30]\n"
+      "ldr q16, [x27, #0x0]\n"
+      "ldr q17, [x27, #0x10]\n"
+      "ldr q18, [x27, #0x20]\n"
+      "ldr q19, [x27, #0x30]\n"
+      "ldr q20, [x25, #0x0]\n"
+      "ldr q21, [x25, #0x10]\n"
+      "ldr q22, [x25, #0x20]\n"
+      "ldr q23, [x25, #0x30]\n"
+      "ldr q24, [x23, #0x0]\n"
+      "ldr q25, [x23, #0x10]\n"
+      "ldr q26, [x23, #0x20]\n"
+      "ldr q27, [x23, #0x30]\n"
+      "ldr q28, [x21, #0x0]\n"
+      "ldr q29, [x21, #0x10]\n"
+      "ldr q30, [x21, #0x20]\n"
+      "ldr q31, [x21, #0x30]\n"
+      "b 185f\n"
+      "184:"  // Height 6: no accumulate
+      "movi v8.16b, #0x0\n"
+      "movi v9.16b, #0x0\n"
+      "movi v10.16b, #0x0\n"
+      "movi v11.16b, #0x0\n"
+      "movi v12.16b, #0x0\n"
+      "movi v13.16b, #0x0\n"
+      "movi v14.16b, #0x0\n"
+      "movi v15.16b, #0x0\n"
+      "movi v16.16b, #0x0\n"
+      "movi v17.16b, #0x0\n"
+      "movi v18.16b, #0x0\n"
+      "movi v19.16b, #0x0\n"
+      "movi v20.16b, #0x0\n"
+      "movi v21.16b, #0x0\n"
+      "movi v22.16b, #0x0\n"
+      "movi v23.16b, #0x0\n"
+      "movi v24.16b, #0x0\n"
+      "movi v25.16b, #0x0\n"
+      "movi v26.16b, #0x0\n"
+      "movi v27.16b, #0x0\n"
+      "movi v28.16b, #0x0\n"
+      "movi v29.16b, #0x0\n"
+      "movi v30.16b, #0x0\n"
+      "movi v31.16b, #0x0\n"
+      "185:"  // Height 6: setup done
+      "mov x12, #0x0\n"
+      "186:"  // Height 6: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 187f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x20, [x20, #0x28]\n"
+      "cbnz x12, 188f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #2\n"
+      "add x28, x28, x19, LSL #2\n"
+      "add x26, x26, x19, LSL #2\n"
+      "add x24, x24, x19, LSL #2\n"
+      "add x22, x22, x19, LSL #2\n"
+      "add x20, x20, x19, LSL #2\n"
+      "b 188f\n"
+      "187:"  // Height 6: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #2\n"
+      "add x26, x28, x19, LSL #2\n"
+      "add x24, x26, x19, LSL #2\n"
+      "add x22, x24, x19, LSL #2\n"
+      "add x20, x22, x19, LSL #2\n"
+      "188:"  // Height 6: input setup done
+      "cmp x11, #0x4\n"
+      "blt 191f\n"
+      "cmp x11, #0x8\n"
+      "blt 190f\n"
+      "189:"  // Height 6: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q4, [x22, #0x0]\n"
+      "ldr q5, [x20, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.4s, v6.4s, v0.s[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.4s, v6.4s, v1.s[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla v16.4s, v6.4s, v2.s[0]\n"
+      "add x28, x28, #0x10\n"
+      "fmla v20.4s, v6.4s, v3.s[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      "fmla v24.4s, v6.4s, v4.s[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "add x24, x24, #0x10\n"
+      "fmla v28.4s, v6.4s, v5.s[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.4s, v7.4s, v0.s[0]\n"
+      "add x22, x22, #0x10\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "fmla v13.4s, v7.4s, v1.s[0]\n"
+      "add x20, x20, #0x10\n"
+      "fmla v17.4s, v7.4s, v2.s[0]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "sub x11, x11, #0x4\n"
+      "fmla v21.4s, v7.4s, v3.s[0]\n"
+      "cmp x11, #0x8\n"
+      "fmla v25.4s, v7.4s, v4.s[0]\n"
+      "fmla v29.4s, v7.4s, v5.s[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.4s, v6.4s, v0.s[0]\n"
+      "fmla v14.4s, v6.4s, v1.s[0]\n"
+      "fmla v18.4s, v6.4s, v2.s[0]\n"
+      "fmla v22.4s, v6.4s, v3.s[0]\n"
+      "fmla v26.4s, v6.4s, v4.s[0]\n"
+      "fmla v30.4s, v6.4s, v5.s[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      "fmla v11.4s, v7.4s, v0.s[0]\n"
+      "fmla v15.4s, v7.4s, v1.s[0]\n"
+      "fmla v19.4s, v7.4s, v2.s[0]\n"
+      "fmla v23.4s, v7.4s, v3.s[0]\n"
+      "fmla v27.4s, v7.4s, v4.s[0]\n"
+      "fmla v31.4s, v7.4s, v5.s[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      "fmla v8.4s, v6.4s, v0.s[1]\n"
+      "fmla v12.4s, v6.4s, v1.s[1]\n"
+      "fmla v16.4s, v6.4s, v2.s[1]\n"
+      "fmla v20.4s, v6.4s, v3.s[1]\n"
+      "fmla v24.4s, v6.4s, v4.s[1]\n"
+      "fmla v28.4s, v6.4s, v5.s[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      "fmla v9.4s, v7.4s, v0.s[1]\n"
+      "fmla v13.4s, v7.4s, v1.s[1]\n"
+      "fmla v17.4s, v7.4s, v2.s[1]\n"
+      "fmla v21.4s, v7.4s, v3.s[1]\n"
+      "fmla v25.4s, v7.4s, v4.s[1]\n"
+      "fmla v29.4s, v7.4s, v5.s[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      "fmla v10.4s, v6.4s, v0.s[1]\n"
+      "fmla v14.4s, v6.4s, v1.s[1]\n"
+      "fmla v18.4s, v6.4s, v2.s[1]\n"
+      "fmla v22.4s, v6.4s, v3.s[1]\n"
+      "fmla v26.4s, v6.4s, v4.s[1]\n"
+      "fmla v30.4s, v6.4s, v5.s[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      "fmla v11.4s, v7.4s, v0.s[1]\n"
+      "fmla v15.4s, v7.4s, v1.s[1]\n"
+      "fmla v19.4s, v7.4s, v2.s[1]\n"
+      "fmla v23.4s, v7.4s, v3.s[1]\n"
+      "fmla v27.4s, v7.4s, v4.s[1]\n"
+      "fmla v31.4s, v7.4s, v5.s[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      "fmla v8.4s, v6.4s, v0.s[2]\n"
+      "fmla v12.4s, v6.4s, v1.s[2]\n"
+      "fmla v16.4s, v6.4s, v2.s[2]\n"
+      "fmla v20.4s, v6.4s, v3.s[2]\n"
+      "fmla v24.4s, v6.4s, v4.s[2]\n"
+      "fmla v28.4s, v6.4s, v5.s[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      "fmla v9.4s, v7.4s, v0.s[2]\n"
+      "fmla v13.4s, v7.4s, v1.s[2]\n"
+      "fmla v17.4s, v7.4s, v2.s[2]\n"
+      "fmla v21.4s, v7.4s, v3.s[2]\n"
+      "fmla v25.4s, v7.4s, v4.s[2]\n"
+      "fmla v29.4s, v7.4s, v5.s[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      "fmla v10.4s, v6.4s, v0.s[2]\n"
+      "fmla v14.4s, v6.4s, v1.s[2]\n"
+      "fmla v18.4s, v6.4s, v2.s[2]\n"
+      "fmla v22.4s, v6.4s, v3.s[2]\n"
+      "fmla v26.4s, v6.4s, v4.s[2]\n"
+      "fmla v30.4s, v6.4s, v5.s[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      "fmla v11.4s, v7.4s, v0.s[2]\n"
+      "fmla v15.4s, v7.4s, v1.s[2]\n"
+      "fmla v19.4s, v7.4s, v2.s[2]\n"
+      "fmla v23.4s, v7.4s, v3.s[2]\n"
+      "fmla v27.4s, v7.4s, v4.s[2]\n"
+      "fmla v31.4s, v7.4s, v5.s[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      "fmla v8.4s, v6.4s, v0.s[3]\n"
+      "fmla v12.4s, v6.4s, v1.s[3]\n"
+      "fmla v16.4s, v6.4s, v2.s[3]\n"
+      "fmla v20.4s, v6.4s, v3.s[3]\n"
+      "fmla v24.4s, v6.4s, v4.s[3]\n"
+      "fmla v28.4s, v6.4s, v5.s[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      "fmla v9.4s, v7.4s, v0.s[3]\n"
+      "fmla v13.4s, v7.4s, v1.s[3]\n"
+      "fmla v17.4s, v7.4s, v2.s[3]\n"
+      "fmla v21.4s, v7.4s, v3.s[3]\n"
+      "fmla v25.4s, v7.4s, v4.s[3]\n"
+      "fmla v29.4s, v7.4s, v5.s[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "fmla v10.4s, v6.4s, v0.s[3]\n"
+      "add x15, x15, #0x100\n"
+      "fmla v14.4s, v6.4s, v1.s[3]\n"
+      "fmla v18.4s, v6.4s, v2.s[3]\n"
+      "fmla v22.4s, v6.4s, v3.s[3]\n"
+      "fmla v26.4s, v6.4s, v4.s[3]\n"
+      "fmla v30.4s, v6.4s, v5.s[3]\n"
+      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "fmla v15.4s, v7.4s, v1.s[3]\n"
+      "fmla v19.4s, v7.4s, v2.s[3]\n"
+      "fmla v23.4s, v7.4s, v3.s[3]\n"
+      "fmla v27.4s, v7.4s, v4.s[3]\n"
+      "fmla v31.4s, v7.4s, v5.s[3]\n"
+      "bge 189b\n"
+      "190:"  // Height 6: Multiply loop: Single iteration only
+      "sub x11, x11, #0x4\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q4, [x22, #0x0]\n"
+      "ldr q5, [x20, #0x0]\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.4s, v6.4s, v0.s[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.4s, v6.4s, v1.s[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla v16.4s, v6.4s, v2.s[0]\n"
+      "add x28, x28, #0x10\n"
+      "fmla v20.4s, v6.4s, v3.s[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      "fmla v24.4s, v6.4s, v4.s[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "add x24, x24, #0x10\n"
+      "fmla v28.4s, v6.4s, v5.s[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.4s, v7.4s, v0.s[0]\n"
+      "add x22, x22, #0x10\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "fmla v13.4s, v7.4s, v1.s[0]\n"
+      "add x20, x20, #0x10\n"
+      "fmla v17.4s, v7.4s, v2.s[0]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "fmla v21.4s, v7.4s, v3.s[0]\n"
+      "fmla v25.4s, v7.4s, v4.s[0]\n"
+      "fmla v29.4s, v7.4s, v5.s[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.4s, v6.4s, v0.s[0]\n"
+      "fmla v14.4s, v6.4s, v1.s[0]\n"
+      "fmla v18.4s, v6.4s, v2.s[0]\n"
+      "fmla v22.4s, v6.4s, v3.s[0]\n"
+      "fmla v26.4s, v6.4s, v4.s[0]\n"
+      "fmla v30.4s, v6.4s, v5.s[0]\n"
+      "ldr q6, [x15, #0x40]\n"
+      "fmla v11.4s, v7.4s, v0.s[0]\n"
+      "fmla v15.4s, v7.4s, v1.s[0]\n"
+      "fmla v19.4s, v7.4s, v2.s[0]\n"
+      "fmla v23.4s, v7.4s, v3.s[0]\n"
+      "fmla v27.4s, v7.4s, v4.s[0]\n"
+      "fmla v31.4s, v7.4s, v5.s[0]\n"
+      "ldr q7, [x15, #0x50]\n"
+      "fmla v8.4s, v6.4s, v0.s[1]\n"
+      "fmla v12.4s, v6.4s, v1.s[1]\n"
+      "fmla v16.4s, v6.4s, v2.s[1]\n"
+      "fmla v20.4s, v6.4s, v3.s[1]\n"
+      "fmla v24.4s, v6.4s, v4.s[1]\n"
+      "fmla v28.4s, v6.4s, v5.s[1]\n"
+      "ldr q6, [x15, #0x60]\n"
+      "fmla v9.4s, v7.4s, v0.s[1]\n"
+      "fmla v13.4s, v7.4s, v1.s[1]\n"
+      "fmla v17.4s, v7.4s, v2.s[1]\n"
+      "fmla v21.4s, v7.4s, v3.s[1]\n"
+      "fmla v25.4s, v7.4s, v4.s[1]\n"
+      "fmla v29.4s, v7.4s, v5.s[1]\n"
+      "ldr q7, [x15, #0x70]\n"
+      "fmla v10.4s, v6.4s, v0.s[1]\n"
+      "fmla v14.4s, v6.4s, v1.s[1]\n"
+      "fmla v18.4s, v6.4s, v2.s[1]\n"
+      "fmla v22.4s, v6.4s, v3.s[1]\n"
+      "fmla v26.4s, v6.4s, v4.s[1]\n"
+      "fmla v30.4s, v6.4s, v5.s[1]\n"
+      "ldr q6, [x15, #0x80]\n"
+      "fmla v11.4s, v7.4s, v0.s[1]\n"
+      "fmla v15.4s, v7.4s, v1.s[1]\n"
+      "fmla v19.4s, v7.4s, v2.s[1]\n"
+      "fmla v23.4s, v7.4s, v3.s[1]\n"
+      "fmla v27.4s, v7.4s, v4.s[1]\n"
+      "fmla v31.4s, v7.4s, v5.s[1]\n"
+      "ldr q7, [x15, #0x90]\n"
+      "fmla v8.4s, v6.4s, v0.s[2]\n"
+      "fmla v12.4s, v6.4s, v1.s[2]\n"
+      "fmla v16.4s, v6.4s, v2.s[2]\n"
+      "fmla v20.4s, v6.4s, v3.s[2]\n"
+      "fmla v24.4s, v6.4s, v4.s[2]\n"
+      "fmla v28.4s, v6.4s, v5.s[2]\n"
+      "ldr q6, [x15, #0xa0]\n"
+      "fmla v9.4s, v7.4s, v0.s[2]\n"
+      "fmla v13.4s, v7.4s, v1.s[2]\n"
+      "fmla v17.4s, v7.4s, v2.s[2]\n"
+      "fmla v21.4s, v7.4s, v3.s[2]\n"
+      "fmla v25.4s, v7.4s, v4.s[2]\n"
+      "fmla v29.4s, v7.4s, v5.s[2]\n"
+      "ldr q7, [x15, #0xb0]\n"
+      "fmla v10.4s, v6.4s, v0.s[2]\n"
+      "fmla v14.4s, v6.4s, v1.s[2]\n"
+      "fmla v18.4s, v6.4s, v2.s[2]\n"
+      "fmla v22.4s, v6.4s, v3.s[2]\n"
+      "fmla v26.4s, v6.4s, v4.s[2]\n"
+      "fmla v30.4s, v6.4s, v5.s[2]\n"
+      "ldr q6, [x15, #0xc0]\n"
+      "fmla v11.4s, v7.4s, v0.s[2]\n"
+      "fmla v15.4s, v7.4s, v1.s[2]\n"
+      "fmla v19.4s, v7.4s, v2.s[2]\n"
+      "fmla v23.4s, v7.4s, v3.s[2]\n"
+      "fmla v27.4s, v7.4s, v4.s[2]\n"
+      "fmla v31.4s, v7.4s, v5.s[2]\n"
+      "ldr q7, [x15, #0xd0]\n"
+      "fmla v8.4s, v6.4s, v0.s[3]\n"
+      "fmla v12.4s, v6.4s, v1.s[3]\n"
+      "fmla v16.4s, v6.4s, v2.s[3]\n"
+      "fmla v20.4s, v6.4s, v3.s[3]\n"
+      "fmla v24.4s, v6.4s, v4.s[3]\n"
+      "fmla v28.4s, v6.4s, v5.s[3]\n"
+      "ldr q6, [x15, #0xe0]\n"
+      "fmla v9.4s, v7.4s, v0.s[3]\n"
+      "fmla v13.4s, v7.4s, v1.s[3]\n"
+      "fmla v17.4s, v7.4s, v2.s[3]\n"
+      "fmla v21.4s, v7.4s, v3.s[3]\n"
+      "fmla v25.4s, v7.4s, v4.s[3]\n"
+      "fmla v29.4s, v7.4s, v5.s[3]\n"
+      "ldr q7, [x15, #0xf0]\n"
+      "fmla v10.4s, v6.4s, v0.s[3]\n"
+      "add x15, x15, #0x100\n"
+      "fmla v14.4s, v6.4s, v1.s[3]\n"
+      "fmla v18.4s, v6.4s, v2.s[3]\n"
+      "fmla v22.4s, v6.4s, v3.s[3]\n"
+      "fmla v26.4s, v6.4s, v4.s[3]\n"
+      "fmla v30.4s, v6.4s, v5.s[3]\n"
+      "fmla v11.4s, v7.4s, v0.s[3]\n"
+      "fmla v15.4s, v7.4s, v1.s[3]\n"
+      "fmla v19.4s, v7.4s, v2.s[3]\n"
+      "fmla v23.4s, v7.4s, v3.s[3]\n"
+      "fmla v27.4s, v7.4s, v4.s[3]\n"
+      "fmla v31.4s, v7.4s, v5.s[3]\n"
+      "191:"  // Height 6: Multiply loop: Main loop skip
+      "cbz x11, 193f\n"
+      "192:"  // Height 6: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
+      "ldr s3, [x24], #0x4\n"
+      "ldr s4, [x22], #0x4\n"
+      "ldr s5, [x20], #0x4\n"
+      "ldr q6, [x15, #0x0]\n"
+      "fmla v8.4s, v6.4s, v0.s[0]\n"
+      "ldr q7, [x15, #0x10]\n"
+      "fmla v12.4s, v6.4s, v1.s[0]\n"
+      "sub x11, x11, #0x1\n"
+      "fmla v16.4s, v6.4s, v2.s[0]\n"
+      "fmla v20.4s, v6.4s, v3.s[0]\n"
+      "fmla v24.4s, v6.4s, v4.s[0]\n"
+      "fmla v28.4s, v6.4s, v5.s[0]\n"
+      "ldr q6, [x15, #0x20]\n"
+      "fmla v9.4s, v7.4s, v0.s[0]\n"
+      "fmla v13.4s, v7.4s, v1.s[0]\n"
+      "fmla v17.4s, v7.4s, v2.s[0]\n"
+      "fmla v21.4s, v7.4s, v3.s[0]\n"
+      "fmla v25.4s, v7.4s, v4.s[0]\n"
+      "fmla v29.4s, v7.4s, v5.s[0]\n"
+      "ldr q7, [x15, #0x30]\n"
+      "fmla v10.4s, v6.4s, v0.s[0]\n"
+      "add x15, x15, #0x40\n"
+      "fmla v14.4s, v6.4s, v1.s[0]\n"
+      "fmla v18.4s, v6.4s, v2.s[0]\n"
+      "fmla v22.4s, v6.4s, v3.s[0]\n"
+      "fmla v26.4s, v6.4s, v4.s[0]\n"
+      "fmla v30.4s, v6.4s, v5.s[0]\n"
+      "fmla v11.4s, v7.4s, v0.s[0]\n"
+      "fmla v15.4s, v7.4s, v1.s[0]\n"
+      "fmla v19.4s, v7.4s, v2.s[0]\n"
+      "fmla v23.4s, v7.4s, v3.s[0]\n"
+      "fmla v27.4s, v7.4s, v4.s[0]\n"
+      "fmla v31.4s, v7.4s, v5.s[0]\n"
+      "cbnz x11, 192b\n"
+      "193:"  // Height 6: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 186b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "prfm pstl1keep, [x21, #0x0]\n"
+      "tbz %x[flags], #1, 194f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v1.4s }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v0.4s }, [x19]\n"
+      "fmin v8.4s, v8.4s, v0.4s\n"
+      "fmin v9.4s, v9.4s, v0.4s\n"
+      "fmin v10.4s, v10.4s, v0.4s\n"
+      "fmin v11.4s, v11.4s, v0.4s\n"
+      "fmax v8.4s, v8.4s, v1.4s\n"
+      "fmax v9.4s, v9.4s, v1.4s\n"
+      "fmax v10.4s, v10.4s, v1.4s\n"
+      "fmax v11.4s, v11.4s, v1.4s\n"
+      "fmin v12.4s, v12.4s, v0.4s\n"
+      "fmin v13.4s, v13.4s, v0.4s\n"
+      "fmin v14.4s, v14.4s, v0.4s\n"
+      "fmax v12.4s, v12.4s, v1.4s\n"
+      "fmax v13.4s, v13.4s, v1.4s\n"
+      "fmax v14.4s, v14.4s, v1.4s\n"
+      "fmin v15.4s, v15.4s, v0.4s\n"
+      "fmin v16.4s, v16.4s, v0.4s\n"
+      "fmin v17.4s, v17.4s, v0.4s\n"
+      "fmax v15.4s, v15.4s, v1.4s\n"
+      "fmax v16.4s, v16.4s, v1.4s\n"
+      "fmax v17.4s, v17.4s, v1.4s\n"
+      "fmin v18.4s, v18.4s, v0.4s\n"
+      "fmin v19.4s, v19.4s, v0.4s\n"
+      "fmin v20.4s, v20.4s, v0.4s\n"
+      "fmax v18.4s, v18.4s, v1.4s\n"
+      "fmax v19.4s, v19.4s, v1.4s\n"
+      "fmax v20.4s, v20.4s, v1.4s\n"
+      "fmin v21.4s, v21.4s, v0.4s\n"
+      "fmin v22.4s, v22.4s, v0.4s\n"
+      "fmin v23.4s, v23.4s, v0.4s\n"
+      "fmax v21.4s, v21.4s, v1.4s\n"
+      "fmax v22.4s, v22.4s, v1.4s\n"
+      "fmax v23.4s, v23.4s, v1.4s\n"
+      "fmin v24.4s, v24.4s, v0.4s\n"
+      "fmin v25.4s, v25.4s, v0.4s\n"
+      "fmin v26.4s, v26.4s, v0.4s\n"
+      "fmax v24.4s, v24.4s, v1.4s\n"
+      "fmax v25.4s, v25.4s, v1.4s\n"
+      "fmax v26.4s, v26.4s, v1.4s\n"
+      "fmin v27.4s, v27.4s, v0.4s\n"
+      "fmin v28.4s, v28.4s, v0.4s\n"
+      "fmin v29.4s, v29.4s, v0.4s\n"
+      "fmax v27.4s, v27.4s, v1.4s\n"
+      "fmax v28.4s, v28.4s, v1.4s\n"
+      "fmax v29.4s, v29.4s, v1.4s\n"
+      "fmin v30.4s, v30.4s, v0.4s\n"
+      "fmin v31.4s, v31.4s, v0.4s\n"
+      "fmax v30.4s, v30.4s, v1.4s\n"
+      "fmax v31.4s, v31.4s, v1.4s\n"
+      "194:"  // Height 6: No activation
+      "cmp x16, #0x10\n"
+      "bge 203f\n"
+      "tbz x16, #3, 198f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v9.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v13.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "st1 { v17.4s }, [x27], #0x10\n"
+      "st1 { v20.4s }, [x25], #0x10\n"
+      "st1 { v21.4s }, [x25], #0x10\n"
+      "st1 { v24.4s }, [x23], #0x10\n"
+      "st1 { v25.4s }, [x23], #0x10\n"
+      "st1 { v28.4s }, [x21], #0x10\n"
+      "st1 { v29.4s }, [x21], #0x10\n"
+      "tbz x16, #2, 196f\n"
+      "st1 { v10.4s }, [x13], #0x10\n"
+      "st1 { v14.4s }, [x9], #0x10\n"
+      "st1 { v18.4s }, [x27], #0x10\n"
+      "st1 { v22.4s }, [x25], #0x10\n"
+      "st1 { v26.4s }, [x23], #0x10\n"
+      "st1 { v30.4s }, [x21], #0x10\n"
+      "tbz x16, #1, 195f\n"
+      "str d11, [x13], #0x8\n"
+      "str d15, [x9], #0x8\n"
+      "str d19, [x27], #0x8\n"
+      "str d23, [x25], #0x8\n"
+      "str d27, [x23], #0x8\n"
+      "str d31, [x21], #0x8\n"
+      "tbz x16, #0, 202f\n"
+      "st1 { v11.s }[2], [x13]\n"
+      "st1 { v15.s }[2], [x9]\n"
+      "st1 { v19.s }[2], [x27]\n"
+      "st1 { v23.s }[2], [x25]\n"
+      "st1 { v27.s }[2], [x23]\n"
+      "st1 { v31.s }[2], [x21]\n"
+      "b 202f\n"
+      "195:"  // Height 6: Partial direct writeback: partial_1_12
+      "tbz x16, #0, 202f\n"
+      "str s11, [x13, #0x0]\n"
+      "str s15, [x9, #0x0]\n"
+      "str s19, [x27, #0x0]\n"
+      "str s23, [x25, #0x0]\n"
+      "str s27, [x23, #0x0]\n"
+      "str s31, [x21, #0x0]\n"
+      "b 202f\n"
+      "196:"  // Height 6: Partial direct writeback: partial_2_8
+      "tbz x16, #1, 197f\n"
+      "str d10, [x13], #0x8\n"
+      "str d14, [x9], #0x8\n"
+      "str d18, [x27], #0x8\n"
+      "str d22, [x25], #0x8\n"
+      "str d26, [x23], #0x8\n"
+      "str d30, [x21], #0x8\n"
+      "tbz x16, #0, 202f\n"
+      "st1 { v10.s }[2], [x13]\n"
+      "st1 { v14.s }[2], [x9]\n"
+      "st1 { v18.s }[2], [x27]\n"
+      "st1 { v22.s }[2], [x25]\n"
+      "st1 { v26.s }[2], [x23]\n"
+      "st1 { v30.s }[2], [x21]\n"
+      "b 202f\n"
+      "197:"  // Height 6: Partial direct writeback: partial_1_8
+      "tbz x16, #0, 202f\n"
+      "str s10, [x13, #0x0]\n"
+      "str s14, [x9, #0x0]\n"
+      "str s18, [x27, #0x0]\n"
+      "str s22, [x25, #0x0]\n"
+      "str s26, [x23, #0x0]\n"
+      "str s30, [x21, #0x0]\n"
+      "b 202f\n"
+      "198:"  // Height 6: Partial direct writeback: partial_4_0
+      "tbz x16, #2, 200f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "st1 { v20.4s }, [x25], #0x10\n"
+      "st1 { v24.4s }, [x23], #0x10\n"
+      "st1 { v28.4s }, [x21], #0x10\n"
+      "tbz x16, #1, 199f\n"
+      "str d9, [x13], #0x8\n"
+      "str d13, [x9], #0x8\n"
+      "str d17, [x27], #0x8\n"
+      "str d21, [x25], #0x8\n"
+      "str d25, [x23], #0x8\n"
+      "str d29, [x21], #0x8\n"
+      "tbz x16, #0, 202f\n"
+      "st1 { v9.s }[2], [x13]\n"
+      "st1 { v13.s }[2], [x9]\n"
+      "st1 { v17.s }[2], [x27]\n"
+      "st1 { v21.s }[2], [x25]\n"
+      "st1 { v25.s }[2], [x23]\n"
+      "st1 { v29.s }[2], [x21]\n"
+      "b 202f\n"
+      "199:"  // Height 6: Partial direct writeback: partial_1_4
+      "tbz x16, #0, 202f\n"
+      "str s9, [x13, #0x0]\n"
+      "str s13, [x9, #0x0]\n"
+      "str s17, [x27, #0x0]\n"
+      "str s21, [x25, #0x0]\n"
+      "str s25, [x23, #0x0]\n"
+      "str s29, [x21, #0x0]\n"
+      "b 202f\n"
+      "200:"  // Height 6: Partial direct writeback: partial_2_0
+      "tbz x16, #1, 201f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "str d16, [x27], #0x8\n"
+      "str d20, [x25], #0x8\n"
+      "str d24, [x23], #0x8\n"
+      "str d28, [x21], #0x8\n"
+      "tbz x16, #0, 202f\n"
+      "st1 { v8.s }[2], [x13]\n"
+      "st1 { v12.s }[2], [x9]\n"
+      "st1 { v16.s }[2], [x27]\n"
+      "st1 { v20.s }[2], [x25]\n"
+      "st1 { v24.s }[2], [x23]\n"
+      "st1 { v28.s }[2], [x21]\n"
+      "b 202f\n"
+      "201:"  // Height 6: Partial direct writeback: partial_1_0
+      "str s8, [x13, #0x0]\n"
+      "str s12, [x9, #0x0]\n"
+      "str s16, [x27, #0x0]\n"
+      "str s20, [x25, #0x0]\n"
+      "str s24, [x23, #0x0]\n"
+      "str s28, [x21, #0x0]\n"
+      "202:"  // Height 6: Partial direct writeback: Done
+      "b 204f\n"
+      "203:"  // Height 6: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "str q12, [x9, #0x0]\n"
+      "str q13, [x9, #0x10]\n"
+      "str q14, [x9, #0x20]\n"
+      "str q15, [x9, #0x30]\n"
+      "str q16, [x27, #0x0]\n"
+      "str q17, [x27, #0x10]\n"
+      "str q18, [x27, #0x20]\n"
+      "str q19, [x27, #0x30]\n"
+      "str q20, [x25, #0x0]\n"
+      "str q21, [x25, #0x10]\n"
+      "str q22, [x25, #0x20]\n"
+      "str q23, [x25, #0x30]\n"
+      "str q24, [x23, #0x0]\n"
+      "str q25, [x23, #0x10]\n"
+      "str q26, [x23, #0x20]\n"
+      "str q27, [x23, #0x30]\n"
+      "str q28, [x21, #0x0]\n"
+      "str q29, [x21, #0x10]\n"
+      "str q30, [x21, #0x20]\n"
+      "str q31, [x21, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "add x9, x9, #0x40\n"
+      "add x27, x27, #0x40\n"
+      "add x25, x25, #0x40\n"
+      "add x23, x23, #0x40\n"
+      "add x21, x21, #0x40\n"
+      "204:"  // Height 6: Writeback done
+      "subs x16, x16, #0x10\n"
+      "bgt 173b\n"
+      "subs %x[M], %x[M], #0x6\n"
+      "beq 206f\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "tbz %x[flags], #3, 205f\n"
+      "add x20, x20, #0x6\n"
+      "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "b 1b\n"
+      "205:"  // Update direct input
+      "mov x19, #0x18\n"
+      "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+      "b 1b\n"
+      "206:"  // Exit
+
+      : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+      : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+    );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp
new file mode 100644
index 0000000000..043d0643f0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+// ARGLIST: kernel parameter types, in order: num_strings, string_lengths, A_arg, M, N, B_ptr, output_arg, bias, act, accumulate.
+#define ARGLIST  \
+   unsigned int, const unsigned int *, \
+   IndirectInputArg<float>, \
+   size_t, size_t, \
+   const float *, \
+   IndirectOutputArg<float>, \
+   const float *, Activation, bool
+
+namespace arm_gemm
+{
+
+// Kernel entry point; the implementation lives in a64_hybrid_fp32_mla_8x4/generic.cpp.
+void a64_hybrid_fp32_mla_8x4( ARGLIST );
+
+class cls_a64_hybrid_fp32_mla_8x4 // Describes the a64_hybrid_fp32_mla_8x4 kernel: element types, blocking parameters and entry point.
+{
+public:
+    typedef float operand_type;
+    typedef float result_type;
+
+    typedef void (*kern_type)( ARGLIST ); // Pointer to a kernel function taking the ARGLIST parameters.
+
+    /* Kernel blocking parameters (same values as the <8, 4, 1> passed to StdTransformsFixed below) */
+    static constexpr unsigned int out_height()
+    {
+        return 8;
+    }
+    // constexpr like the other blocking parameters, so it is usable in constant expressions too.
+    static constexpr unsigned int out_width()
+    {
+        return 4;
+    }
+
+    static constexpr unsigned int k_unroll()
+    {
+        return 1;
+    }
+
+    static constexpr bool supports_accumulate()
+    {
+        return true;
+    }
+
+    StdTransformsFixed<operand_type, result_type, 8, 4, 1> transforms = {};
+
+    // Default to the generic kernel
+    kern_type kernel=a64_hybrid_fp32_mla_8x4;
+    // The CPUInfo argument is ignored: the constructor leaves the default kernel above in place.
+    cls_a64_hybrid_fp32_mla_8x4(const CPUInfo *)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp
new file mode 100644
index 0000000000..3ab6cad368
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp
@@ -0,0 +1,2195 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_hybrid_fp32_mla_8x4 (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+    size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg,
+    const float *bias, Activation act, bool accumulate
+)
+{
+    struct KernelArgs {
+        float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+        float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+        unsigned int num_strings = {};
+        const unsigned int *string_lengths = {};
+        size_t N = {};
+        const float *B_ptr = {};
+        size_t output_offset = {};
+        size_t input_initial_col = {};
+        size_t input_offset = {};
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    if (accumulate) {
+        flags |= 0x1;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    switch(act.type) {
+        default:
+        case Activation::Type::None:
+            break;
+        case Activation::Type::BoundedReLU:
+            ka.maxval = static_cast<float>(act.param1);
+            /* fall through */
+        case Activation::Type::ReLU:
+            ka.minval = 0;
+            flags |= 0x2;
+            break;
+    }
+    __asm__ __volatile__(
+
+      "1:"  // Row loop
+      "cmp %x[M], #0x8\n"
+      "bge 155f\n"
+      "cmp %x[M], #0x6\n"
+      "bgt 133f\n"
+      "beq 111f\n"
+      "cmp %x[M], #0x4\n"
+      "bgt 89f\n"
+      "beq 67f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 45f\n"
+      "beq 23f\n"
+      "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x8, %x[bias]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 2f\n"
+      "ldr x17, [%x[output_ptr], #0x0]\n"
+      "add x17, x17, x19, LSL #2\n"
+      "b 3f\n"
+      "2:"  // Height 1: setup direct output
+      "mov x17, %x[output_ptr]\n"
+      "3:"  // Height 1: Column loop
+      "cbz x8, 4f\n"
+      "ldr q24, [x8, #0x0]\n"
+      "add x8, x8, #0x10\n"
+      "b 9f\n"
+      "4:"  // Height 1: no bias
+      "tbz %x[flags], #0, 8f\n"
+      "cmp x6, #0x4\n"
+      "bge 7f\n"
+      "tbz x6, #1, 5f\n"
+      "ldr d24, [x17], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x6, #0, 6f\n"
+      "ld1 { v24.s }[2], [x17]\n"
+      "b 6f\n"
+      "5:"  // Height 1: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s24, [x17, #0x0]\n"
+      "6:"  // Height 1: Partial accumulate: Done
+      "sub x17, x17, x19\n"
+      "b 9f\n"
+      "7:"  // Height 1: full accumulate
+      "ldr q24, [x17, #0x0]\n"
+      "b 9f\n"
+      "8:"  // Height 1: no accumulate
+      "movi v24.16b, #0x0\n"
+      "9:"  // Height 1: setup done
+      "mov x16, #0x0\n"
+      "10:"  // Height 1: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w15, [x20, x16, LSL #0x2]\n"
+      "tbz %x[flags], #3, 11f\n"
+      "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x14, [x20, #0x0]\n"
+      "cbnz x16, 12f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x14, x14, x19, LSL #2\n"
+      "b 12f\n"
+      "11:"  // Height 1: setup direct input
+      "mov x14, %x[input_ptr]\n"
+      "12:"  // Height 1: input setup done
+      "cmp x15, #0x4\n"
+      "blt 15f\n"
+      "cmp x15, #0x8\n"
+      "blt 14f\n"
+      "13:"  // Height 1: Multiply loop: Main loop head
+      "ldr q0, [x14, #0x0]\n"
+      "ldr q8, [x7, #0x0]\n"
+      "fmla v24.4s, v8.4s, v0.s[0]\n"
+      "ldr q9, [x7, #0x10]\n"
+      "ldr q10, [x7, #0x20]\n"
+      "fmla v24.4s, v9.4s, v0.s[1]\n"
+      "ldr q11, [x7, #0x30]\n"
+      "add x14, x14, #0x10\n"
+      "fmla v24.4s, v10.4s, v0.s[2]\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "sub x15, x15, #0x4\n"
+      "fmla v24.4s, v11.4s, v0.s[3]\n"
+      "cmp x15, #0x8\n"
+      "add x7, x7, #0x40\n"
+      "bge 13b\n"
+      "14:"  // Height 1: Multiply loop: Single iteration only
+      "sub x15, x15, #0x4\n"
+      "ldr q0, [x14, #0x0]\n"
+      "ldr q12, [x7, #0x0]\n"
+      "fmla v24.4s, v12.4s, v0.s[0]\n"
+      "ldr q13, [x7, #0x10]\n"
+      "ldr q14, [x7, #0x20]\n"
+      "fmla v24.4s, v13.4s, v0.s[1]\n"
+      "ldr q15, [x7, #0x30]\n"
+      "add x14, x14, #0x10\n"
+      "fmla v24.4s, v14.4s, v0.s[2]\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "add x7, x7, #0x40\n"
+      "fmla v24.4s, v15.4s, v0.s[3]\n"
+      "15:"  // Height 1: Multiply loop: Main loop skip
+      "cbz x15, 17f\n"
+      "16:"  // Height 1: Multiply loop: Odd block loop
+      "ldr s0, [x14], #0x4\n"
+      "ldr q16, [x7, #0x0]\n"
+      "fmla v24.4s, v16.4s, v0.s[0]\n"
+      "sub x15, x15, #0x1\n"
+      "add x7, x7, #0x10\n"
+      "cbnz x15, 16b\n"
+      "17:"  // Height 1: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x16, x16, #0x1\n"
+      "cmp x16, x19\n"
+      "bne 10b\n"
+      "prfm pstl1keep, [x17, #0x0]\n"
+      "tbz %x[flags], #1, 18f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v17.4s }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v16.4s }, [x19]\n"
+      "fmin v24.4s, v24.4s, v16.4s\n"
+      "fmax v24.4s, v24.4s, v17.4s\n"
+      "18:"  // Height 1: No activation
+      "cmp x6, #0x4\n"
+      "bge 21f\n"
+      "tbz x6, #1, 19f\n"
+      "str d24, [x17], #0x8\n"
+      "tbz x6, #0, 20f\n"
+      "st1 { v24.s }[2], [x17]\n"
+      "b 20f\n"
+      "19:"  // Height 1: Partial direct writeback: partial_1_0
+      "str s24, [x17, #0x0]\n"
+      "20:"  // Height 1: Partial direct writeback: Done
+      "b 22f\n"
+      "21:"  // Height 1: Full writeback
+      "str q24, [x17, #0x0]\n"
+      "add x17, x17, #0x10\n"
+      "22:"  // Height 1: Writeback done
+      "subs x6, x6, #0x4\n"
+      "bgt 3b\n"
+      "b 178f\n"
+      "23:"  // Height 2
+      "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x8, %x[bias]\n"
+      "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 24f\n"
+      "ldr x17, [%x[output_ptr], #0x0]\n"
+      "add x17, x17, x19, LSL #2\n"
+      "ldr x13, [%x[output_ptr], #0x8]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "b 25f\n"
+      "24:"  // Height 2: setup direct output
+      "mov x17, %x[output_ptr]\n"
+      "add x13, x17, x19, LSL #2\n"
+      "25:"  // Height 2: Column loop
+      "cbz x8, 26f\n"
+      "ldr q24, [x8, #0x0]\n"
+      "mov v25.16b, v24.16b\n"
+      "add x8, x8, #0x10\n"
+      "b 31f\n"
+      "26:"  // Height 2: no bias
+      "tbz %x[flags], #0, 30f\n"
+      "cmp x6, #0x4\n"
+      "bge 29f\n"
+      "tbz x6, #1, 27f\n"
+      "ldr d24, [x17], #0x8\n"
+      "ldr d25, [x13], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x6, #0, 28f\n"
+      "ld1 { v24.s }[2], [x17]\n"
+      "ld1 { v25.s }[2], [x13]\n"
+      "b 28f\n"
+      "27:"  // Height 2: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s24, [x17, #0x0]\n"
+      "ldr s25, [x13, #0x0]\n"
+      "28:"  // Height 2: Partial accumulate: Done
+      "sub x17, x17, x19\n"
+      "sub x13, x13, x19\n"
+      "b 31f\n"
+      "29:"  // Height 2: full accumulate
+      "ldr q24, [x17, #0x0]\n"
+      "ldr q25, [x13, #0x0]\n"
+      "b 31f\n"
+      "30:"  // Height 2: no accumulate
+      "movi v24.16b, #0x0\n"
+      "movi v25.16b, #0x0\n"
+      "31:"  // Height 2: setup done
+      "mov x16, #0x0\n"
+      "32:"  // Height 2: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w15, [x20, x16, LSL #0x2]\n"
+      "tbz %x[flags], #3, 33f\n"
+      "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x14, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "cbnz x16, 34f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x14, x14, x19, LSL #2\n"
+      "add x12, x12, x19, LSL #2\n"
+      "b 34f\n"
+      "33:"  // Height 2: setup direct input
+      "mov x14, %x[input_ptr]\n"
+      "add x12, x14, x19, LSL #2\n"
+      "34:"  // Height 2: input setup done
+      "cmp x15, #0x4\n"
+      "blt 37f\n"
+      "cmp x15, #0x8\n"
+      "blt 36f\n"
+      "35:"  // Height 2: Multiply loop: Main loop head
+      "ldr q0, [x14, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q8, [x7, #0x0]\n"
+      "fmla v24.4s, v8.4s, v0.s[0]\n"
+      "ldr q9, [x7, #0x10]\n"
+      "fmla v25.4s, v8.4s, v1.s[0]\n"
+      "ldr q10, [x7, #0x20]\n"
+      "ldr q11, [x7, #0x30]\n"
+      "fmla v24.4s, v9.4s, v0.s[1]\n"
+      "add x14, x14, #0x10\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "fmla v25.4s, v9.4s, v1.s[1]\n"
+      "add x12, x12, #0x10\n"
+      "fmla v24.4s, v10.4s, v0.s[2]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "sub x15, x15, #0x4\n"
+      "fmla v25.4s, v10.4s, v1.s[2]\n"
+      "cmp x15, #0x8\n"
+      "add x7, x7, #0x40\n"
+      "fmla v24.4s, v11.4s, v0.s[3]\n"
+      "fmla v25.4s, v11.4s, v1.s[3]\n"
+      "bge 35b\n"
+      "36:"  // Height 2: Multiply loop: Single iteration only
+      "sub x15, x15, #0x4\n"
+      "ldr q0, [x14, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q12, [x7, #0x0]\n"
+      "fmla v24.4s, v12.4s, v0.s[0]\n"
+      "ldr q13, [x7, #0x10]\n"
+      "fmla v25.4s, v12.4s, v1.s[0]\n"
+      "ldr q14, [x7, #0x20]\n"
+      "ldr q15, [x7, #0x30]\n"
+      "fmla v24.4s, v13.4s, v0.s[1]\n"
+      "add x14, x14, #0x10\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "fmla v25.4s, v13.4s, v1.s[1]\n"
+      "add x12, x12, #0x10\n"
+      "fmla v24.4s, v14.4s, v0.s[2]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "add x7, x7, #0x40\n"
+      "fmla v25.4s, v14.4s, v1.s[2]\n"
+      "fmla v24.4s, v15.4s, v0.s[3]\n"
+      "fmla v25.4s, v15.4s, v1.s[3]\n"
+      "37:"  // Height 2: Multiply loop: Main loop skip
+      "cbz x15, 39f\n"
+      "38:"  // Height 2: Multiply loop: Odd block loop
+      "ldr s0, [x14], #0x4\n"
+      "ldr s1, [x12], #0x4\n"
+      "ldr q16, [x7, #0x0]\n"
+      "fmla v24.4s, v16.4s, v0.s[0]\n"
+      "sub x15, x15, #0x1\n"
+      "fmla v25.4s, v16.4s, v1.s[0]\n"
+      "add x7, x7, #0x10\n"
+      "cbnz x15, 38b\n"
+      "39:"  // Height 2: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x16, x16, #0x1\n"
+      "cmp x16, x19\n"
+      "bne 32b\n"
+      "prfm pstl1keep, [x17, #0x0]\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "tbz %x[flags], #1, 40f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v17.4s }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v16.4s }, [x19]\n"
+      "fmin v24.4s, v24.4s, v16.4s\n"
+      "fmin v25.4s, v25.4s, v16.4s\n"
+      "fmax v24.4s, v24.4s, v17.4s\n"
+      "fmax v25.4s, v25.4s, v17.4s\n"
+      "40:"  // Height 2: No activation
+      "cmp x6, #0x4\n"
+      "bge 43f\n"
+      "tbz x6, #1, 41f\n"
+      "str d24, [x17], #0x8\n"
+      "str d25, [x13], #0x8\n"
+      "tbz x6, #0, 42f\n"
+      "st1 { v24.s }[2], [x17]\n"
+      "st1 { v25.s }[2], [x13]\n"
+      "b 42f\n"
+      "41:"  // Height 2: Partial direct writeback: partial_1_0
+      "str s24, [x17, #0x0]\n"
+      "str s25, [x13, #0x0]\n"
+      "42:"  // Height 2: Partial direct writeback: Done
+      "b 44f\n"
+      "43:"  // Height 2: Full writeback
+      "str q24, [x17, #0x0]\n"
+      "str q25, [x13, #0x0]\n"
+      "add x17, x17, #0x10\n"
+      "add x13, x13, #0x10\n"
+      "44:"  // Height 2: Writeback done
+      "subs x6, x6, #0x4\n"
+      "bgt 25b\n"
+      "b 178f\n"
+      "45:"  // Height 3
+      "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x8, %x[bias]\n"
+      "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 46f\n"
+      "ldr x17, [%x[output_ptr], #0x0]\n"
+      "add x17, x17, x19, LSL #2\n"
+      "ldr x13, [%x[output_ptr], #0x8]\n"
+      "ldr x11, [%x[output_ptr], #0x10]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "add x11, x11, x19, LSL #2\n"
+      "b 47f\n"
+      "46:"  // Height 3: setup direct output
+      "mov x17, %x[output_ptr]\n"
+      "add x13, x17, x19, LSL #2\n"
+      "add x11, x13, x19, LSL #2\n"
+      "47:"  // Height 3: Column loop
+      "cbz x8, 48f\n"
+      "ldr q24, [x8, #0x0]\n"
+      "mov v25.16b, v24.16b\n"
+      "add x8, x8, #0x10\n"
+      "mov v26.16b, v24.16b\n"
+      "b 53f\n"
+      "48:"  // Height 3: no bias
+      "tbz %x[flags], #0, 52f\n"
+      "cmp x6, #0x4\n"
+      "bge 51f\n"
+      "tbz x6, #1, 49f\n"
+      "ldr d24, [x17], #0x8\n"
+      "ldr d25, [x13], #0x8\n"
+      "ldr d26, [x11], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x6, #0, 50f\n"
+      "ld1 { v24.s }[2], [x17]\n"
+      "ld1 { v25.s }[2], [x13]\n"
+      "ld1 { v26.s }[2], [x11]\n"
+      "b 50f\n"
+      "49:"  // Height 3: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s24, [x17, #0x0]\n"
+      "ldr s25, [x13, #0x0]\n"
+      "ldr s26, [x11, #0x0]\n"
+      "50:"  // Height 3: Partial accumulate: Done
+      "sub x17, x17, x19\n"
+      "sub x13, x13, x19\n"
+      "sub x11, x11, x19\n"
+      "b 53f\n"
+      "51:"  // Height 3: full accumulate
+      "ldr q24, [x17, #0x0]\n"
+      "ldr q25, [x13, #0x0]\n"
+      "ldr q26, [x11, #0x0]\n"
+      "b 53f\n"
+      "52:"  // Height 3: no accumulate
+      "movi v24.16b, #0x0\n"
+      "movi v25.16b, #0x0\n"
+      "movi v26.16b, #0x0\n"
+      "53:"  // Height 3: setup done
+      "mov x16, #0x0\n"
+      "54:"  // Height 3: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w15, [x20, x16, LSL #0x2]\n"
+      "tbz %x[flags], #3, 55f\n"
+      "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x14, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "ldr x10, [x20, #0x10]\n"
+      "cbnz x16, 56f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x14, x14, x19, LSL #2\n"
+      "add x12, x12, x19, LSL #2\n"
+      "add x10, x10, x19, LSL #2\n"
+      "b 56f\n"
+      "55:"  // Height 3: setup direct input
+      "mov x14, %x[input_ptr]\n"
+      "add x12, x14, x19, LSL #2\n"
+      "add x10, x12, x19, LSL #2\n"
+      "56:"  // Height 3: input setup done
+      "cmp x15, #0x4\n"
+      "blt 59f\n"
+      "cmp x15, #0x8\n"
+      "blt 58f\n"
+      "57:"  // Height 3: Multiply loop: Main loop head
+      "ldr q0, [x14, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q2, [x10, #0x0]\n"
+      "ldr q8, [x7, #0x0]\n"
+      "fmla v24.4s, v8.4s, v0.s[0]\n"
+      "ldr q9, [x7, #0x10]\n"
+      "fmla v25.4s, v8.4s, v1.s[0]\n"
+      "ldr q10, [x7, #0x20]\n"
+      "fmla v26.4s, v8.4s, v2.s[0]\n"
+      "ldr q11, [x7, #0x30]\n"
+      "add x14, x14, #0x10\n"
+      "fmla v24.4s, v9.4s, v0.s[1]\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "add x12, x12, #0x10\n"
+      "fmla v25.4s, v9.4s, v1.s[1]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "add x10, x10, #0x10\n"
+      "fmla v26.4s, v9.4s, v2.s[1]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "sub x15, x15, #0x4\n"
+      "fmla v24.4s, v10.4s, v0.s[2]\n"
+      "cmp x15, #0x8\n"
+      "add x7, x7, #0x40\n"
+      "fmla v25.4s, v10.4s, v1.s[2]\n"
+      "fmla v26.4s, v10.4s, v2.s[2]\n"
+      "fmla v24.4s, v11.4s, v0.s[3]\n"
+      "fmla v25.4s, v11.4s, v1.s[3]\n"
+      "fmla v26.4s, v11.4s, v2.s[3]\n"
+      "bge 57b\n"
+      "58:"  // Height 3: Multiply loop: Single iteration only
+      "sub x15, x15, #0x4\n"
+      "ldr q0, [x14, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q2, [x10, #0x0]\n"
+      "ldr q12, [x7, #0x0]\n"
+      "fmla v24.4s, v12.4s, v0.s[0]\n"
+      "ldr q13, [x7, #0x10]\n"
+      "fmla v25.4s, v12.4s, v1.s[0]\n"
+      "ldr q14, [x7, #0x20]\n"
+      "fmla v26.4s, v12.4s, v2.s[0]\n"
+      "ldr q15, [x7, #0x30]\n"
+      "add x14, x14, #0x10\n"
+      "fmla v24.4s, v13.4s, v0.s[1]\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "add x12, x12, #0x10\n"
+      "fmla v25.4s, v13.4s, v1.s[1]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "add x10, x10, #0x10\n"
+      "fmla v26.4s, v13.4s, v2.s[1]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x7, x7, #0x40\n"
+      "fmla v24.4s, v14.4s, v0.s[2]\n"
+      "fmla v25.4s, v14.4s, v1.s[2]\n"
+      "fmla v26.4s, v14.4s, v2.s[2]\n"
+      "fmla v24.4s, v15.4s, v0.s[3]\n"
+      "fmla v25.4s, v15.4s, v1.s[3]\n"
+      "fmla v26.4s, v15.4s, v2.s[3]\n"
+      "59:"  // Height 3: Multiply loop: Main loop skip
+      "cbz x15, 61f\n"
+      "60:"  // Height 3: Multiply loop: Odd block loop
+      "ldr s0, [x14], #0x4\n"
+      "ldr s1, [x12], #0x4\n"
+      "ldr s2, [x10], #0x4\n"
+      "ldr q16, [x7, #0x0]\n"
+      "fmla v24.4s, v16.4s, v0.s[0]\n"
+      "sub x15, x15, #0x1\n"
+      "fmla v25.4s, v16.4s, v1.s[0]\n"
+      "add x7, x7, #0x10\n"
+      "fmla v26.4s, v16.4s, v2.s[0]\n"
+      "cbnz x15, 60b\n"
+      "61:"  // Height 3: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x16, x16, #0x1\n"
+      "cmp x16, x19\n"
+      "bne 54b\n"
+      "prfm pstl1keep, [x17, #0x0]\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x11, #0x0]\n"
+      "tbz %x[flags], #1, 62f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v17.4s }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v16.4s }, [x19]\n"
+      "fmin v24.4s, v24.4s, v16.4s\n"
+      "fmin v25.4s, v25.4s, v16.4s\n"
+      "fmin v26.4s, v26.4s, v16.4s\n"
+      "fmax v24.4s, v24.4s, v17.4s\n"
+      "fmax v25.4s, v25.4s, v17.4s\n"
+      "fmax v26.4s, v26.4s, v17.4s\n"
+      "62:"  // Height 3: No activation
+      "cmp x6, #0x4\n"
+      "bge 65f\n"
+      "tbz x6, #1, 63f\n"
+      "str d24, [x17], #0x8\n"
+      "str d25, [x13], #0x8\n"
+      "str d26, [x11], #0x8\n"
+      "tbz x6, #0, 64f\n"
+      "st1 { v24.s }[2], [x17]\n"
+      "st1 { v25.s }[2], [x13]\n"
+      "st1 { v26.s }[2], [x11]\n"
+      "b 64f\n"
+      "63:"  // Height 3: Partial direct writeback: partial_1_0
+      "str s24, [x17, #0x0]\n"
+      "str s25, [x13, #0x0]\n"
+      "str s26, [x11, #0x0]\n"
+      "64:"  // Height 3: Partial direct writeback: Done
+      "b 66f\n"
+      "65:"  // Height 3: Full writeback
+      "str q24, [x17, #0x0]\n"
+      "str q25, [x13, #0x0]\n"
+      "str q26, [x11, #0x0]\n"
+      "add x17, x17, #0x10\n"
+      "add x13, x13, #0x10\n"
+      "add x11, x11, #0x10\n"
+      "66:"  // Height 3: Writeback done
+      "subs x6, x6, #0x4\n"
+      "bgt 47b\n"
+      "b 178f\n"
+      "67:"  // Height 4
+      "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x8, %x[bias]\n"
+      "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 68f\n"
+      "ldr x17, [%x[output_ptr], #0x0]\n"
+      "add x17, x17, x19, LSL #2\n"
+      "ldr x13, [%x[output_ptr], #0x8]\n"
+      "ldr x11, [%x[output_ptr], #0x10]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x18]\n"
+      "add x11, x11, x19, LSL #2\n"
+      "add x9, x9, x19, LSL #2\n"
+      "b 69f\n"
+      "68:"  // Height 4: setup direct output
+      "mov x17, %x[output_ptr]\n"
+      "add x13, x17, x19, LSL #2\n"
+      "add x11, x13, x19, LSL #2\n"
+      "add x9, x11, x19, LSL #2\n"
+      "69:"  // Height 4: Column loop
+      "cbz x8, 70f\n"
+      "ldr q24, [x8, #0x0]\n"
+      "mov v25.16b, v24.16b\n"
+      "add x8, x8, #0x10\n"
+      "mov v26.16b, v24.16b\n"
+      "mov v27.16b, v24.16b\n"
+      "b 75f\n"
+      "70:"  // Height 4: no bias
+      "tbz %x[flags], #0, 74f\n"
+      "cmp x6, #0x4\n"
+      "bge 73f\n"
+      "tbz x6, #1, 71f\n"
+      "ldr d24, [x17], #0x8\n"
+      "ldr d25, [x13], #0x8\n"
+      "ldr d26, [x11], #0x8\n"
+      "ldr d27, [x9], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x6, #0, 72f\n"
+      "ld1 { v24.s }[2], [x17]\n"
+      "ld1 { v25.s }[2], [x13]\n"
+      "ld1 { v26.s }[2], [x11]\n"
+      "ld1 { v27.s }[2], [x9]\n"
+      "b 72f\n"
+      "71:"  // Height 4: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s24, [x17, #0x0]\n"
+      "ldr s25, [x13, #0x0]\n"
+      "ldr s26, [x11, #0x0]\n"
+      "ldr s27, [x9, #0x0]\n"
+      "72:"  // Height 4: Partial accumulate: Done
+      "sub x17, x17, x19\n"
+      "sub x13, x13, x19\n"
+      "sub x11, x11, x19\n"
+      "sub x9, x9, x19\n"
+      "b 75f\n"
+      "73:"  // Height 4: full accumulate
+      "ldr q24, [x17, #0x0]\n"
+      "ldr q25, [x13, #0x0]\n"
+      "ldr q26, [x11, #0x0]\n"
+      "ldr q27, [x9, #0x0]\n"
+      "b 75f\n"
+      "74:"  // Height 4: no accumulate
+      "movi v24.16b, #0x0\n"
+      "movi v25.16b, #0x0\n"
+      "movi v26.16b, #0x0\n"
+      "movi v27.16b, #0x0\n"
+      "75:"  // Height 4: setup done
+      "mov x16, #0x0\n"
+      "76:"  // Height 4: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w15, [x20, x16, LSL #0x2]\n"
+      "tbz %x[flags], #3, 77f\n"
+      "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x14, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "ldr x10, [x20, #0x10]\n"
+      "ldr x28, [x20, #0x18]\n"
+      "cbnz x16, 78f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x14, x14, x19, LSL #2\n"
+      "add x12, x12, x19, LSL #2\n"
+      "add x10, x10, x19, LSL #2\n"
+      "add x28, x28, x19, LSL #2\n"
+      "b 78f\n"
+      "77:"  // Height 4: setup direct input
+      "mov x14, %x[input_ptr]\n"
+      "add x12, x14, x19, LSL #2\n"
+      "add x10, x12, x19, LSL #2\n"
+      "add x28, x10, x19, LSL #2\n"
+      "78:"  // Height 4: input setup done
+      "cmp x15, #0x4\n"
+      "blt 81f\n"
+      "cmp x15, #0x8\n"
+      "blt 80f\n"
+      "79:"  // Height 4: Multiply loop: Main loop head
+      "ldr q0, [x14, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q2, [x10, #0x0]\n"
+      "ldr q3, [x28, #0x0]\n"
+      "ldr q8, [x7, #0x0]\n"
+      "fmla v24.4s, v8.4s, v0.s[0]\n"
+      "ldr q9, [x7, #0x10]\n"
+      "fmla v25.4s, v8.4s, v1.s[0]\n"
+      "ldr q10, [x7, #0x20]\n"
+      "fmla v26.4s, v8.4s, v2.s[0]\n"
+      "ldr q11, [x7, #0x30]\n"
+      "fmla v27.4s, v8.4s, v3.s[0]\n"
+      "add x14, x14, #0x10\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "fmla v24.4s, v9.4s, v0.s[1]\n"
+      "add x12, x12, #0x10\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "fmla v25.4s, v9.4s, v1.s[1]\n"
+      "add x10, x10, #0x10\n"
+      "fmla v26.4s, v9.4s, v2.s[1]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x28, x28, #0x10\n"
+      "fmla v27.4s, v9.4s, v3.s[1]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "sub x15, x15, #0x4\n"
+      "fmla v24.4s, v10.4s, v0.s[2]\n"
+      "cmp x15, #0x8\n"
+      "add x7, x7, #0x40\n"
+      "fmla v25.4s, v10.4s, v1.s[2]\n"
+      "fmla v26.4s, v10.4s, v2.s[2]\n"
+      "fmla v27.4s, v10.4s, v3.s[2]\n"
+      "fmla v24.4s, v11.4s, v0.s[3]\n"
+      "fmla v25.4s, v11.4s, v1.s[3]\n"
+      "fmla v26.4s, v11.4s, v2.s[3]\n"
+      "fmla v27.4s, v11.4s, v3.s[3]\n"
+      "bge 79b\n"
+      "80:"  // Height 4: Multiply loop: Single iteration only
+      "sub x15, x15, #0x4\n"
+      "ldr q0, [x14, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q2, [x10, #0x0]\n"
+      "ldr q3, [x28, #0x0]\n"
+      "ldr q12, [x7, #0x0]\n"
+      "fmla v24.4s, v12.4s, v0.s[0]\n"
+      "ldr q13, [x7, #0x10]\n"
+      "fmla v25.4s, v12.4s, v1.s[0]\n"
+      "ldr q14, [x7, #0x20]\n"
+      "fmla v26.4s, v12.4s, v2.s[0]\n"
+      "ldr q15, [x7, #0x30]\n"
+      "fmla v27.4s, v12.4s, v3.s[0]\n"
+      "add x14, x14, #0x10\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "fmla v24.4s, v13.4s, v0.s[1]\n"
+      "add x12, x12, #0x10\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "fmla v25.4s, v13.4s, v1.s[1]\n"
+      "add x10, x10, #0x10\n"
+      "fmla v26.4s, v13.4s, v2.s[1]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x28, x28, #0x10\n"
+      "fmla v27.4s, v13.4s, v3.s[1]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x7, x7, #0x40\n"
+      "fmla v24.4s, v14.4s, v0.s[2]\n"
+      "fmla v25.4s, v14.4s, v1.s[2]\n"
+      "fmla v26.4s, v14.4s, v2.s[2]\n"
+      "fmla v27.4s, v14.4s, v3.s[2]\n"
+      "fmla v24.4s, v15.4s, v0.s[3]\n"
+      "fmla v25.4s, v15.4s, v1.s[3]\n"
+      "fmla v26.4s, v15.4s, v2.s[3]\n"
+      "fmla v27.4s, v15.4s, v3.s[3]\n"
+      "81:"  // Height 4: Multiply loop: Main loop skip
+      "cbz x15, 83f\n"
+      "82:"  // Height 4: Multiply loop: Odd block loop
+      "ldr s0, [x14], #0x4\n"
+      "ldr s1, [x12], #0x4\n"
+      "ldr s2, [x10], #0x4\n"
+      "ldr s3, [x28], #0x4\n"
+      "ldr q16, [x7, #0x0]\n"
+      "fmla v24.4s, v16.4s, v0.s[0]\n"
+      "sub x15, x15, #0x1\n"
+      "fmla v25.4s, v16.4s, v1.s[0]\n"
+      "add x7, x7, #0x10\n"
+      "fmla v26.4s, v16.4s, v2.s[0]\n"
+      "fmla v27.4s, v16.4s, v3.s[0]\n"
+      "cbnz x15, 82b\n"
+      "83:"  // Height 4: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x16, x16, #0x1\n"
+      "cmp x16, x19\n"
+      "bne 76b\n"
+      "prfm pstl1keep, [x17, #0x0]\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x11, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "tbz %x[flags], #1, 84f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v17.4s }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v16.4s }, [x19]\n"
+      "fmin v24.4s, v24.4s, v16.4s\n"
+      "fmin v25.4s, v25.4s, v16.4s\n"
+      "fmin v26.4s, v26.4s, v16.4s\n"
+      "fmin v27.4s, v27.4s, v16.4s\n"
+      "fmax v24.4s, v24.4s, v17.4s\n"
+      "fmax v25.4s, v25.4s, v17.4s\n"
+      "fmax v26.4s, v26.4s, v17.4s\n"
+      "fmax v27.4s, v27.4s, v17.4s\n"
+      "84:"  // Height 4: No activation
+      "cmp x6, #0x4\n"
+      "bge 87f\n"
+      "tbz x6, #1, 85f\n"
+      "str d24, [x17], #0x8\n"
+      "str d25, [x13], #0x8\n"
+      "str d26, [x11], #0x8\n"
+      "str d27, [x9], #0x8\n"
+      "tbz x6, #0, 86f\n"
+      "st1 { v24.s }[2], [x17]\n"
+      "st1 { v25.s }[2], [x13]\n"
+      "st1 { v26.s }[2], [x11]\n"
+      "st1 { v27.s }[2], [x9]\n"
+      "b 86f\n"
+      "85:"  // Height 4: Partial direct writeback: partial_1_0
+      "str s24, [x17, #0x0]\n"
+      "str s25, [x13, #0x0]\n"
+      "str s26, [x11, #0x0]\n"
+      "str s27, [x9, #0x0]\n"
+      "86:"  // Height 4: Partial direct writeback: Done
+      "b 88f\n"
+      "87:"  // Height 4: Full writeback
+      "str q24, [x17, #0x0]\n"
+      "str q25, [x13, #0x0]\n"
+      "str q26, [x11, #0x0]\n"
+      "str q27, [x9, #0x0]\n"
+      "add x17, x17, #0x10\n"
+      "add x13, x13, #0x10\n"
+      "add x11, x11, #0x10\n"
+      "add x9, x9, #0x10\n"
+      "88:"  // Height 4: Writeback done
+      "subs x6, x6, #0x4\n"
+      "bgt 69b\n"
+      "b 178f\n"
+      "89:"  // Height 5
+      "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x8, %x[bias]\n"
+      "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 90f\n"
+      "ldr x17, [%x[output_ptr], #0x0]\n"
+      "add x17, x17, x19, LSL #2\n"
+      "ldr x13, [%x[output_ptr], #0x8]\n"
+      "ldr x11, [%x[output_ptr], #0x10]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x18]\n"
+      "ldr x27, [%x[output_ptr], #0x20]\n"
+      "add x11, x11, x19, LSL #2\n"
+      "add x9, x9, x19, LSL #2\n"
+      "add x27, x27, x19, LSL #2\n"
+      "b 91f\n"
+      "90:"  // Height 5: setup direct output
+      "mov x17, %x[output_ptr]\n"
+      "add x13, x17, x19, LSL #2\n"
+      "add x11, x13, x19, LSL #2\n"
+      "add x9, x11, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "91:"  // Height 5: Column loop
+      "cbz x8, 92f\n"
+      "ldr q24, [x8, #0x0]\n"
+      "mov v25.16b, v24.16b\n"
+      "add x8, x8, #0x10\n"
+      "mov v26.16b, v24.16b\n"
+      "mov v27.16b, v24.16b\n"
+      "mov v28.16b, v24.16b\n"
+      "b 97f\n"
+      "92:"  // Height 5: no bias
+      "tbz %x[flags], #0, 96f\n"
+      "cmp x6, #0x4\n"
+      "bge 95f\n"
+      "tbz x6, #1, 93f\n"
+      "ldr d24, [x17], #0x8\n"
+      "ldr d25, [x13], #0x8\n"
+      "ldr d26, [x11], #0x8\n"
+      "ldr d27, [x9], #0x8\n"
+      "ldr d28, [x27], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x6, #0, 94f\n"
+      "ld1 { v24.s }[2], [x17]\n"
+      "ld1 { v25.s }[2], [x13]\n"
+      "ld1 { v26.s }[2], [x11]\n"
+      "ld1 { v27.s }[2], [x9]\n"
+      "ld1 { v28.s }[2], [x27]\n"
+      "b 94f\n"
+      "93:"  // Height 5: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s24, [x17, #0x0]\n"
+      "ldr s25, [x13, #0x0]\n"
+      "ldr s26, [x11, #0x0]\n"
+      "ldr s27, [x9, #0x0]\n"
+      "ldr s28, [x27, #0x0]\n"
+      "94:"  // Height 5: Partial accumulate: Done
+      "sub x17, x17, x19\n"
+      "sub x13, x13, x19\n"
+      "sub x11, x11, x19\n"
+      "sub x9, x9, x19\n"
+      "sub x27, x27, x19\n"
+      "b 97f\n"
+      "95:"  // Height 5: full accumulate
+      "ldr q24, [x17, #0x0]\n"
+      "ldr q25, [x13, #0x0]\n"
+      "ldr q26, [x11, #0x0]\n"
+      "ldr q27, [x9, #0x0]\n"
+      "ldr q28, [x27, #0x0]\n"
+      "b 97f\n"
+      "96:"  // Height 5: no accumulate
+      "movi v24.16b, #0x0\n"
+      "movi v25.16b, #0x0\n"
+      "movi v26.16b, #0x0\n"
+      "movi v27.16b, #0x0\n"
+      "movi v28.16b, #0x0\n"
+      "97:"  // Height 5: setup done
+      "mov x16, #0x0\n"
+      "98:"  // Height 5: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w15, [x20, x16, LSL #0x2]\n"
+      "tbz %x[flags], #3, 99f\n"
+      "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x14, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "ldr x10, [x20, #0x10]\n"
+      "ldr x28, [x20, #0x18]\n"
+      "ldr x26, [x20, #0x20]\n"
+      "cbnz x16, 100f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x14, x14, x19, LSL #2\n"
+      "add x12, x12, x19, LSL #2\n"
+      "add x10, x10, x19, LSL #2\n"
+      "add x28, x28, x19, LSL #2\n"
+      "add x26, x26, x19, LSL #2\n"
+      "b 100f\n"
+      "99:"  // Height 5: setup direct input
+      "mov x14, %x[input_ptr]\n"
+      "add x12, x14, x19, LSL #2\n"
+      "add x10, x12, x19, LSL #2\n"
+      "add x28, x10, x19, LSL #2\n"
+      "add x26, x28, x19, LSL #2\n"
+      "100:"  // Height 5: input setup done
+      "cmp x15, #0x4\n"
+      "blt 103f\n"
+      "cmp x15, #0x8\n"
+      "blt 102f\n"
+      "101:"  // Height 5: Multiply loop: Main loop head
+      "ldr q0, [x14, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q2, [x10, #0x0]\n"
+      "ldr q3, [x28, #0x0]\n"
+      "ldr q4, [x26, #0x0]\n"
+      "ldr q8, [x7, #0x0]\n"
+      "fmla v24.4s, v8.4s, v0.s[0]\n"
+      "ldr q9, [x7, #0x10]\n"
+      "fmla v25.4s, v8.4s, v1.s[0]\n"
+      "ldr q10, [x7, #0x20]\n"
+      "fmla v26.4s, v8.4s, v2.s[0]\n"
+      "ldr q11, [x7, #0x30]\n"
+      "fmla v27.4s, v8.4s, v3.s[0]\n"
+      "add x14, x14, #0x10\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "fmla v28.4s, v8.4s, v4.s[0]\n"
+      "add x12, x12, #0x10\n"
+      "fmla v24.4s, v9.4s, v0.s[1]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "add x10, x10, #0x10\n"
+      "fmla v25.4s, v9.4s, v1.s[1]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x28, x28, #0x10\n"
+      "fmla v26.4s, v9.4s, v2.s[1]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      "fmla v27.4s, v9.4s, v3.s[1]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "sub x15, x15, #0x4\n"
+      "fmla v28.4s, v9.4s, v4.s[1]\n"
+      "cmp x15, #0x8\n"
+      "fmla v24.4s, v10.4s, v0.s[2]\n"
+      "add x7, x7, #0x40\n"
+      "fmla v25.4s, v10.4s, v1.s[2]\n"
+      "fmla v26.4s, v10.4s, v2.s[2]\n"
+      "fmla v27.4s, v10.4s, v3.s[2]\n"
+      "fmla v28.4s, v10.4s, v4.s[2]\n"
+      "fmla v24.4s, v11.4s, v0.s[3]\n"
+      "fmla v25.4s, v11.4s, v1.s[3]\n"
+      "fmla v26.4s, v11.4s, v2.s[3]\n"
+      "fmla v27.4s, v11.4s, v3.s[3]\n"
+      "fmla v28.4s, v11.4s, v4.s[3]\n"
+      "bge 101b\n"
+      "102:"  // Height 5: Multiply loop: Single iteration only
+      "sub x15, x15, #0x4\n"
+      "ldr q0, [x14, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q2, [x10, #0x0]\n"
+      "ldr q3, [x28, #0x0]\n"
+      "ldr q4, [x26, #0x0]\n"
+      "ldr q12, [x7, #0x0]\n"
+      "fmla v24.4s, v12.4s, v0.s[0]\n"
+      "ldr q13, [x7, #0x10]\n"
+      "fmla v25.4s, v12.4s, v1.s[0]\n"
+      "ldr q14, [x7, #0x20]\n"
+      "fmla v26.4s, v12.4s, v2.s[0]\n"
+      "ldr q15, [x7, #0x30]\n"
+      "fmla v27.4s, v12.4s, v3.s[0]\n"
+      "add x14, x14, #0x10\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "fmla v28.4s, v12.4s, v4.s[0]\n"
+      "add x12, x12, #0x10\n"
+      "fmla v24.4s, v13.4s, v0.s[1]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "add x10, x10, #0x10\n"
+      "fmla v25.4s, v13.4s, v1.s[1]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x28, x28, #0x10\n"
+      "fmla v26.4s, v13.4s, v2.s[1]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      "fmla v27.4s, v13.4s, v3.s[1]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "add x7, x7, #0x40\n"
+      "fmla v28.4s, v13.4s, v4.s[1]\n"
+      "fmla v24.4s, v14.4s, v0.s[2]\n"
+      "fmla v25.4s, v14.4s, v1.s[2]\n"
+      "fmla v26.4s, v14.4s, v2.s[2]\n"
+      "fmla v27.4s, v14.4s, v3.s[2]\n"
+      "fmla v28.4s, v14.4s, v4.s[2]\n"
+      "fmla v24.4s, v15.4s, v0.s[3]\n"
+      "fmla v25.4s, v15.4s, v1.s[3]\n"
+      "fmla v26.4s, v15.4s, v2.s[3]\n"
+      "fmla v27.4s, v15.4s, v3.s[3]\n"
+      "fmla v28.4s, v15.4s, v4.s[3]\n"
+      "103:"  // Height 5: Multiply loop: Main loop skip
+      "cbz x15, 105f\n"
+      "104:"  // Height 5: Multiply loop: Odd block loop
+      "ldr s0, [x14], #0x4\n"
+      "ldr s1, [x12], #0x4\n"
+      "ldr s2, [x10], #0x4\n"
+      "ldr s3, [x28], #0x4\n"
+      "ldr s4, [x26], #0x4\n"
+      "ldr q16, [x7, #0x0]\n"
+      "fmla v24.4s, v16.4s, v0.s[0]\n"
+      "sub x15, x15, #0x1\n"
+      "fmla v25.4s, v16.4s, v1.s[0]\n"
+      "add x7, x7, #0x10\n"
+      "fmla v26.4s, v16.4s, v2.s[0]\n"
+      "fmla v27.4s, v16.4s, v3.s[0]\n"
+      "fmla v28.4s, v16.4s, v4.s[0]\n"
+      "cbnz x15, 104b\n"
+      "105:"  // Height 5: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x16, x16, #0x1\n"
+      "cmp x16, x19\n"
+      "bne 98b\n"
+      "prfm pstl1keep, [x17, #0x0]\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x11, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "tbz %x[flags], #1, 106f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v17.4s }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v16.4s }, [x19]\n"
+      "fmin v24.4s, v24.4s, v16.4s\n"
+      "fmin v25.4s, v25.4s, v16.4s\n"
+      "fmin v26.4s, v26.4s, v16.4s\n"
+      "fmin v27.4s, v27.4s, v16.4s\n"
+      "fmax v24.4s, v24.4s, v17.4s\n"
+      "fmax v25.4s, v25.4s, v17.4s\n"
+      "fmax v26.4s, v26.4s, v17.4s\n"
+      "fmax v27.4s, v27.4s, v17.4s\n"
+      "fmin v28.4s, v28.4s, v16.4s\n"
+      "fmax v28.4s, v28.4s, v17.4s\n"
+      "106:"  // Height 5: No activation
+      "cmp x6, #0x4\n"
+      "bge 109f\n"
+      "tbz x6, #1, 107f\n"
+      "str d24, [x17], #0x8\n"
+      "str d25, [x13], #0x8\n"
+      "str d26, [x11], #0x8\n"
+      "str d27, [x9], #0x8\n"
+      "str d28, [x27], #0x8\n"
+      "tbz x6, #0, 108f\n"
+      "st1 { v24.s }[2], [x17]\n"
+      "st1 { v25.s }[2], [x13]\n"
+      "st1 { v26.s }[2], [x11]\n"
+      "st1 { v27.s }[2], [x9]\n"
+      "st1 { v28.s }[2], [x27]\n"
+      "b 108f\n"
+      "107:"  // Height 5: Partial direct writeback: partial_1_0
+      "str s24, [x17, #0x0]\n"
+      "str s25, [x13, #0x0]\n"
+      "str s26, [x11, #0x0]\n"
+      "str s27, [x9, #0x0]\n"
+      "str s28, [x27, #0x0]\n"
+      "108:"  // Height 5: Partial direct writeback: Done
+      "b 110f\n"
+      "109:"  // Height 5: Full writeback
+      "str q24, [x17, #0x0]\n"
+      "str q25, [x13, #0x0]\n"
+      "str q26, [x11, #0x0]\n"
+      "str q27, [x9, #0x0]\n"
+      "str q28, [x27, #0x0]\n"
+      "add x17, x17, #0x10\n"
+      "add x13, x13, #0x10\n"
+      "add x11, x11, #0x10\n"
+      "add x9, x9, #0x10\n"
+      "add x27, x27, #0x10\n"
+      "110:"  // Height 5: Writeback done
+      "subs x6, x6, #0x4\n"
+      "bgt 91b\n"
+      "b 178f\n"
+      "111:"  // Height 6
+      "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x8, %x[bias]\n"
+      "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 112f\n"
+      "ldr x17, [%x[output_ptr], #0x0]\n"
+      "add x17, x17, x19, LSL #2\n"
+      "ldr x13, [%x[output_ptr], #0x8]\n"
+      "ldr x11, [%x[output_ptr], #0x10]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x18]\n"
+      "ldr x27, [%x[output_ptr], #0x20]\n"
+      "add x11, x11, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x28]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "add x27, x27, x19, LSL #2\n"
+      "add x25, x25, x19, LSL #2\n"
+      "b 113f\n"
+      "112:"  // Height 6: setup direct output
+      "mov x17, %x[output_ptr]\n"
+      "add x13, x17, x19, LSL #2\n"
+      "add x11, x13, x19, LSL #2\n"
+      "add x9, x11, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "113:"  // Height 6: Column loop
+      "cbz x8, 114f\n"
+      "ldr q24, [x8, #0x0]\n"
+      "mov v25.16b, v24.16b\n"
+      "add x8, x8, #0x10\n"
+      "mov v26.16b, v24.16b\n"
+      "mov v27.16b, v24.16b\n"
+      "mov v28.16b, v24.16b\n"
+      "mov v29.16b, v24.16b\n"
+      "b 119f\n"
+      "114:"  // Height 6: no bias
+      "tbz %x[flags], #0, 118f\n"
+      "cmp x6, #0x4\n"
+      "bge 117f\n"
+      "tbz x6, #1, 115f\n"
+      "ldr d24, [x17], #0x8\n"
+      "ldr d25, [x13], #0x8\n"
+      "ldr d26, [x11], #0x8\n"
+      "ldr d27, [x9], #0x8\n"
+      "ldr d28, [x27], #0x8\n"
+      "ldr d29, [x25], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x6, #0, 116f\n"
+      "ld1 { v24.s }[2], [x17]\n"
+      "ld1 { v25.s }[2], [x13]\n"
+      "ld1 { v26.s }[2], [x11]\n"
+      "ld1 { v27.s }[2], [x9]\n"
+      "ld1 { v28.s }[2], [x27]\n"
+      "ld1 { v29.s }[2], [x25]\n"
+      "b 116f\n"
+      "115:"  // Height 6: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s24, [x17, #0x0]\n"
+      "ldr s25, [x13, #0x0]\n"
+      "ldr s26, [x11, #0x0]\n"
+      "ldr s27, [x9, #0x0]\n"
+      "ldr s28, [x27, #0x0]\n"
+      "ldr s29, [x25, #0x0]\n"
+      "116:"  // Height 6: Partial accumulate: Done
+      "sub x17, x17, x19\n"
+      "sub x13, x13, x19\n"
+      "sub x11, x11, x19\n"
+      "sub x9, x9, x19\n"
+      "sub x27, x27, x19\n"
+      "sub x25, x25, x19\n"
+      "b 119f\n"
+      "117:"  // Height 6: full accumulate
+      "ldr q24, [x17, #0x0]\n"
+      "ldr q25, [x13, #0x0]\n"
+      "ldr q26, [x11, #0x0]\n"
+      "ldr q27, [x9, #0x0]\n"
+      "ldr q28, [x27, #0x0]\n"
+      "ldr q29, [x25, #0x0]\n"
+      "b 119f\n"
+      "118:"  // Height 6: no accumulate
+      "movi v24.16b, #0x0\n"
+      "movi v25.16b, #0x0\n"
+      "movi v26.16b, #0x0\n"
+      "movi v27.16b, #0x0\n"
+      "movi v28.16b, #0x0\n"
+      "movi v29.16b, #0x0\n"
+      "119:"  // Height 6: setup done
+      "mov x16, #0x0\n"
+      "120:"  // Height 6: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w15, [x20, x16, LSL #0x2]\n"
+      "tbz %x[flags], #3, 121f\n"
+      "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x14, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "ldr x10, [x20, #0x10]\n"
+      "ldr x28, [x20, #0x18]\n"
+      "ldr x26, [x20, #0x20]\n"
+      "ldr x24, [x20, #0x28]\n"
+      "cbnz x16, 122f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x14, x14, x19, LSL #2\n"
+      "add x12, x12, x19, LSL #2\n"
+      "add x10, x10, x19, LSL #2\n"
+      "add x28, x28, x19, LSL #2\n"
+      "add x26, x26, x19, LSL #2\n"
+      "add x24, x24, x19, LSL #2\n"
+      "b 122f\n"
+      "121:"  // Height 6: setup direct input
+      "mov x14, %x[input_ptr]\n"
+      "add x12, x14, x19, LSL #2\n"
+      "add x10, x12, x19, LSL #2\n"
+      "add x28, x10, x19, LSL #2\n"
+      "add x26, x28, x19, LSL #2\n"
+      "add x24, x26, x19, LSL #2\n"
+      "122:"  // Height 6: input setup done
+      "cmp x15, #0x4\n"
+      "blt 125f\n"
+      "cmp x15, #0x8\n"
+      "blt 124f\n"
+      "123:"  // Height 6: Multiply loop: Main loop head
+      "ldr q0, [x14, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q2, [x10, #0x0]\n"
+      "ldr q3, [x28, #0x0]\n"
+      "ldr q4, [x26, #0x0]\n"
+      "ldr q5, [x24, #0x0]\n"
+      "ldr q8, [x7, #0x0]\n"
+      "fmla v24.4s, v8.4s, v0.s[0]\n"
+      "ldr q9, [x7, #0x10]\n"
+      "fmla v25.4s, v8.4s, v1.s[0]\n"
+      "ldr q10, [x7, #0x20]\n"
+      "fmla v26.4s, v8.4s, v2.s[0]\n"
+      "ldr q11, [x7, #0x30]\n"
+      "fmla v27.4s, v8.4s, v3.s[0]\n"
+      "add x14, x14, #0x10\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "fmla v28.4s, v8.4s, v4.s[0]\n"
+      "add x12, x12, #0x10\n"
+      "fmla v29.4s, v8.4s, v5.s[0]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "add x10, x10, #0x10\n"
+      "fmla v24.4s, v9.4s, v0.s[1]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x28, x28, #0x10\n"
+      "fmla v25.4s, v9.4s, v1.s[1]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      "fmla v26.4s, v9.4s, v2.s[1]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "add x24, x24, #0x10\n"
+      "fmla v27.4s, v9.4s, v3.s[1]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "sub x15, x15, #0x4\n"
+      "fmla v28.4s, v9.4s, v4.s[1]\n"
+      "cmp x15, #0x8\n"
+      "fmla v29.4s, v9.4s, v5.s[1]\n"
+      "add x7, x7, #0x40\n"
+      "fmla v24.4s, v10.4s, v0.s[2]\n"
+      "fmla v25.4s, v10.4s, v1.s[2]\n"
+      "fmla v26.4s, v10.4s, v2.s[2]\n"
+      "fmla v27.4s, v10.4s, v3.s[2]\n"
+      "fmla v28.4s, v10.4s, v4.s[2]\n"
+      "fmla v29.4s, v10.4s, v5.s[2]\n"
+      "fmla v24.4s, v11.4s, v0.s[3]\n"
+      "fmla v25.4s, v11.4s, v1.s[3]\n"
+      "fmla v26.4s, v11.4s, v2.s[3]\n"
+      "fmla v27.4s, v11.4s, v3.s[3]\n"
+      "fmla v28.4s, v11.4s, v4.s[3]\n"
+      "fmla v29.4s, v11.4s, v5.s[3]\n"
+      "bge 123b\n"
+      "124:"  // Height 6: Multiply loop: Single iteration only
+      "sub x15, x15, #0x4\n"
+      "ldr q0, [x14, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q2, [x10, #0x0]\n"
+      "ldr q3, [x28, #0x0]\n"
+      "ldr q4, [x26, #0x0]\n"
+      "ldr q5, [x24, #0x0]\n"
+      "ldr q12, [x7, #0x0]\n"
+      "fmla v24.4s, v12.4s, v0.s[0]\n"
+      "ldr q13, [x7, #0x10]\n"
+      "fmla v25.4s, v12.4s, v1.s[0]\n"
+      "ldr q14, [x7, #0x20]\n"
+      "fmla v26.4s, v12.4s, v2.s[0]\n"
+      "ldr q15, [x7, #0x30]\n"
+      "fmla v27.4s, v12.4s, v3.s[0]\n"
+      "add x14, x14, #0x10\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "fmla v28.4s, v12.4s, v4.s[0]\n"
+      "add x12, x12, #0x10\n"
+      "fmla v29.4s, v12.4s, v5.s[0]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "add x10, x10, #0x10\n"
+      "fmla v24.4s, v13.4s, v0.s[1]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x28, x28, #0x10\n"
+      "fmla v25.4s, v13.4s, v1.s[1]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      "fmla v26.4s, v13.4s, v2.s[1]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "add x24, x24, #0x10\n"
+      "fmla v27.4s, v13.4s, v3.s[1]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "add x7, x7, #0x40\n"
+      "fmla v28.4s, v13.4s, v4.s[1]\n"
+      "fmla v29.4s, v13.4s, v5.s[1]\n"
+      "fmla v24.4s, v14.4s, v0.s[2]\n"
+      "fmla v25.4s, v14.4s, v1.s[2]\n"
+      "fmla v26.4s, v14.4s, v2.s[2]\n"
+      "fmla v27.4s, v14.4s, v3.s[2]\n"
+      "fmla v28.4s, v14.4s, v4.s[2]\n"
+      "fmla v29.4s, v14.4s, v5.s[2]\n"
+      "fmla v24.4s, v15.4s, v0.s[3]\n"
+      "fmla v25.4s, v15.4s, v1.s[3]\n"
+      "fmla v26.4s, v15.4s, v2.s[3]\n"
+      "fmla v27.4s, v15.4s, v3.s[3]\n"
+      "fmla v28.4s, v15.4s, v4.s[3]\n"
+      "fmla v29.4s, v15.4s, v5.s[3]\n"
+      "125:"  // Height 6: Multiply loop: Main loop skip
+      "cbz x15, 127f\n"
+      "126:"  // Height 6: Multiply loop: Odd block loop
+      "ldr s0, [x14], #0x4\n"
+      "ldr s1, [x12], #0x4\n"
+      "ldr s2, [x10], #0x4\n"
+      "ldr s3, [x28], #0x4\n"
+      "ldr s4, [x26], #0x4\n"
+      "ldr s5, [x24], #0x4\n"
+      "ldr q16, [x7, #0x0]\n"
+      "fmla v24.4s, v16.4s, v0.s[0]\n"
+      "sub x15, x15, #0x1\n"
+      "fmla v25.4s, v16.4s, v1.s[0]\n"
+      "add x7, x7, #0x10\n"
+      "fmla v26.4s, v16.4s, v2.s[0]\n"
+      "fmla v27.4s, v16.4s, v3.s[0]\n"
+      "fmla v28.4s, v16.4s, v4.s[0]\n"
+      "fmla v29.4s, v16.4s, v5.s[0]\n"
+      "cbnz x15, 126b\n"
+      "127:"  // Height 6: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x16, x16, #0x1\n"
+      "cmp x16, x19\n"
+      "bne 120b\n"
+      "prfm pstl1keep, [x17, #0x0]\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x11, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "tbz %x[flags], #1, 128f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v17.4s }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v16.4s }, [x19]\n"
+      "fmin v24.4s, v24.4s, v16.4s\n"
+      "fmin v25.4s, v25.4s, v16.4s\n"
+      "fmin v26.4s, v26.4s, v16.4s\n"
+      "fmin v27.4s, v27.4s, v16.4s\n"
+      "fmax v24.4s, v24.4s, v17.4s\n"
+      "fmax v25.4s, v25.4s, v17.4s\n"
+      "fmax v26.4s, v26.4s, v17.4s\n"
+      "fmax v27.4s, v27.4s, v17.4s\n"
+      "fmin v28.4s, v28.4s, v16.4s\n"
+      "fmin v29.4s, v29.4s, v16.4s\n"
+      "fmax v28.4s, v28.4s, v17.4s\n"
+      "fmax v29.4s, v29.4s, v17.4s\n"
+      "128:"  // Height 6: No activation
+      "cmp x6, #0x4\n"
+      "bge 131f\n"
+      "tbz x6, #1, 129f\n"
+      "str d24, [x17], #0x8\n"
+      "str d25, [x13], #0x8\n"
+      "str d26, [x11], #0x8\n"
+      "str d27, [x9], #0x8\n"
+      "str d28, [x27], #0x8\n"
+      "str d29, [x25], #0x8\n"
+      "tbz x6, #0, 130f\n"
+      "st1 { v24.s }[2], [x17]\n"
+      "st1 { v25.s }[2], [x13]\n"
+      "st1 { v26.s }[2], [x11]\n"
+      "st1 { v27.s }[2], [x9]\n"
+      "st1 { v28.s }[2], [x27]\n"
+      "st1 { v29.s }[2], [x25]\n"
+      "b 130f\n"
+      "129:"  // Height 6: Partial direct writeback: partial_1_0
+      "str s24, [x17, #0x0]\n"
+      "str s25, [x13, #0x0]\n"
+      "str s26, [x11, #0x0]\n"
+      "str s27, [x9, #0x0]\n"
+      "str s28, [x27, #0x0]\n"
+      "str s29, [x25, #0x0]\n"
+      "130:"  // Height 6: Partial direct writeback: Done
+      "b 132f\n"
+      "131:"  // Height 6: Full writeback
+      "str q24, [x17, #0x0]\n"
+      "str q25, [x13, #0x0]\n"
+      "str q26, [x11, #0x0]\n"
+      "str q27, [x9, #0x0]\n"
+      "str q28, [x27, #0x0]\n"
+      "str q29, [x25, #0x0]\n"
+      "add x17, x17, #0x10\n"
+      "add x13, x13, #0x10\n"
+      "add x11, x11, #0x10\n"
+      "add x9, x9, #0x10\n"
+      "add x27, x27, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "132:"  // Height 6: Writeback done
+      "subs x6, x6, #0x4\n"
+      "bgt 113b\n"
+      "b 178f\n"
+      "133:"  // Height 7
+      "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x8, %x[bias]\n"
+      "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 134f\n"
+      "ldr x17, [%x[output_ptr], #0x0]\n"
+      "add x17, x17, x19, LSL #2\n"
+      "ldr x13, [%x[output_ptr], #0x8]\n"
+      "ldr x11, [%x[output_ptr], #0x10]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x18]\n"
+      "ldr x27, [%x[output_ptr], #0x20]\n"
+      "add x11, x11, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x28]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "ldr x23, [%x[output_ptr], #0x30]\n"
+      "add x27, x27, x19, LSL #2\n"
+      "add x25, x25, x19, LSL #2\n"
+      "add x23, x23, x19, LSL #2\n"
+      "b 135f\n"
+      "134:"  // Height 7: setup direct output
+      "mov x17, %x[output_ptr]\n"
+      "add x13, x17, x19, LSL #2\n"
+      "add x11, x13, x19, LSL #2\n"
+      "add x9, x11, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "add x23, x25, x19, LSL #2\n"
+      "135:"  // Height 7: Column loop
+      "cbz x8, 136f\n"
+      "ldr q24, [x8, #0x0]\n"
+      "mov v25.16b, v24.16b\n"
+      "add x8, x8, #0x10\n"
+      "mov v26.16b, v24.16b\n"
+      "mov v27.16b, v24.16b\n"
+      "mov v28.16b, v24.16b\n"
+      "mov v29.16b, v24.16b\n"
+      "mov v30.16b, v24.16b\n"
+      "b 141f\n"
+      "136:"  // Height 7: no bias
+      "tbz %x[flags], #0, 140f\n"
+      "cmp x6, #0x4\n"
+      "bge 139f\n"
+      "tbz x6, #1, 137f\n"
+      "ldr d24, [x17], #0x8\n"
+      "ldr d25, [x13], #0x8\n"
+      "ldr d26, [x11], #0x8\n"
+      "ldr d27, [x9], #0x8\n"
+      "ldr d28, [x27], #0x8\n"
+      "ldr d29, [x25], #0x8\n"
+      "ldr d30, [x23], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x6, #0, 138f\n"
+      "ld1 { v24.s }[2], [x17]\n"
+      "ld1 { v25.s }[2], [x13]\n"
+      "ld1 { v26.s }[2], [x11]\n"
+      "ld1 { v27.s }[2], [x9]\n"
+      "ld1 { v28.s }[2], [x27]\n"
+      "ld1 { v29.s }[2], [x25]\n"
+      "ld1 { v30.s }[2], [x23]\n"
+      "b 138f\n"
+      "137:"  // Height 7: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s24, [x17, #0x0]\n"
+      "ldr s25, [x13, #0x0]\n"
+      "ldr s26, [x11, #0x0]\n"
+      "ldr s27, [x9, #0x0]\n"
+      "ldr s28, [x27, #0x0]\n"
+      "ldr s29, [x25, #0x0]\n"
+      "ldr s30, [x23, #0x0]\n"
+      "138:"  // Height 7: Partial accumulate: Done
+      "sub x17, x17, x19\n"
+      "sub x13, x13, x19\n"
+      "sub x11, x11, x19\n"
+      "sub x9, x9, x19\n"
+      "sub x27, x27, x19\n"
+      "sub x25, x25, x19\n"
+      "sub x23, x23, x19\n"
+      "b 141f\n"
+      "139:"  // Height 7: full accumulate
+      "ldr q24, [x17, #0x0]\n"
+      "ldr q25, [x13, #0x0]\n"
+      "ldr q26, [x11, #0x0]\n"
+      "ldr q27, [x9, #0x0]\n"
+      "ldr q28, [x27, #0x0]\n"
+      "ldr q29, [x25, #0x0]\n"
+      "ldr q30, [x23, #0x0]\n"
+      "b 141f\n"
+      "140:"  // Height 7: no accumulate
+      "movi v24.16b, #0x0\n"
+      "movi v25.16b, #0x0\n"
+      "movi v26.16b, #0x0\n"
+      "movi v27.16b, #0x0\n"
+      "movi v28.16b, #0x0\n"
+      "movi v29.16b, #0x0\n"
+      "movi v30.16b, #0x0\n"
+      "141:"  // Height 7: setup done
+      "mov x16, #0x0\n"
+      "142:"  // Height 7: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w15, [x20, x16, LSL #0x2]\n"
+      "tbz %x[flags], #3, 143f\n"
+      "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x14, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "ldr x10, [x20, #0x10]\n"
+      "ldr x28, [x20, #0x18]\n"
+      "ldr x26, [x20, #0x20]\n"
+      "ldr x24, [x20, #0x28]\n"
+      "ldr x22, [x20, #0x30]\n"
+      "cbnz x16, 144f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x14, x14, x19, LSL #2\n"
+      "add x12, x12, x19, LSL #2\n"
+      "add x10, x10, x19, LSL #2\n"
+      "add x28, x28, x19, LSL #2\n"
+      "add x26, x26, x19, LSL #2\n"
+      "add x24, x24, x19, LSL #2\n"
+      "add x22, x22, x19, LSL #2\n"
+      "b 144f\n"
+      "143:"  // Height 7: setup direct input
+      "mov x14, %x[input_ptr]\n"
+      "add x12, x14, x19, LSL #2\n"
+      "add x10, x12, x19, LSL #2\n"
+      "add x28, x10, x19, LSL #2\n"
+      "add x26, x28, x19, LSL #2\n"
+      "add x24, x26, x19, LSL #2\n"
+      "add x22, x24, x19, LSL #2\n"
+      "144:"  // Height 7: input setup done
+      "cmp x15, #0x4\n"
+      "blt 147f\n"
+      "cmp x15, #0x8\n"
+      "blt 146f\n"
+      "145:"  // Height 7: Multiply loop: Main loop head
+      "ldr q0, [x14, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q2, [x10, #0x0]\n"
+      "ldr q3, [x28, #0x0]\n"
+      "ldr q4, [x26, #0x0]\n"
+      "ldr q5, [x24, #0x0]\n"
+      "ldr q6, [x22, #0x0]\n"
+      "ldr q8, [x7, #0x0]\n"
+      "fmla v24.4s, v8.4s, v0.s[0]\n"
+      "ldr q9, [x7, #0x10]\n"
+      "fmla v25.4s, v8.4s, v1.s[0]\n"
+      "ldr q10, [x7, #0x20]\n"
+      "fmla v26.4s, v8.4s, v2.s[0]\n"
+      "ldr q11, [x7, #0x30]\n"
+      "fmla v27.4s, v8.4s, v3.s[0]\n"
+      "add x14, x14, #0x10\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "fmla v28.4s, v8.4s, v4.s[0]\n"
+      "add x12, x12, #0x10\n"
+      "fmla v29.4s, v8.4s, v5.s[0]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "add x10, x10, #0x10\n"
+      "fmla v30.4s, v8.4s, v6.s[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x28, x28, #0x10\n"
+      "fmla v24.4s, v9.4s, v0.s[1]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      "fmla v25.4s, v9.4s, v1.s[1]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "add x24, x24, #0x10\n"
+      "fmla v26.4s, v9.4s, v2.s[1]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "add x22, x22, #0x10\n"
+      "fmla v27.4s, v9.4s, v3.s[1]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "sub x15, x15, #0x4\n"
+      "fmla v28.4s, v9.4s, v4.s[1]\n"
+      "cmp x15, #0x8\n"
+      "fmla v29.4s, v9.4s, v5.s[1]\n"
+      "add x7, x7, #0x40\n"
+      "fmla v30.4s, v9.4s, v6.s[1]\n"
+      "fmla v24.4s, v10.4s, v0.s[2]\n"
+      "fmla v25.4s, v10.4s, v1.s[2]\n"
+      "fmla v26.4s, v10.4s, v2.s[2]\n"
+      "fmla v27.4s, v10.4s, v3.s[2]\n"
+      "fmla v28.4s, v10.4s, v4.s[2]\n"
+      "fmla v29.4s, v10.4s, v5.s[2]\n"
+      "fmla v30.4s, v10.4s, v6.s[2]\n"
+      "fmla v24.4s, v11.4s, v0.s[3]\n"
+      "fmla v25.4s, v11.4s, v1.s[3]\n"
+      "fmla v26.4s, v11.4s, v2.s[3]\n"
+      "fmla v27.4s, v11.4s, v3.s[3]\n"
+      "fmla v28.4s, v11.4s, v4.s[3]\n"
+      "fmla v29.4s, v11.4s, v5.s[3]\n"
+      "fmla v30.4s, v11.4s, v6.s[3]\n"
+      "bge 145b\n"
+      "146:"  // Height 7: Multiply loop: Single iteration only
+      "sub x15, x15, #0x4\n"
+      "ldr q0, [x14, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q2, [x10, #0x0]\n"
+      "ldr q3, [x28, #0x0]\n"
+      "ldr q4, [x26, #0x0]\n"
+      "ldr q5, [x24, #0x0]\n"
+      "ldr q6, [x22, #0x0]\n"
+      "ldr q12, [x7, #0x0]\n"
+      "fmla v24.4s, v12.4s, v0.s[0]\n"
+      "ldr q13, [x7, #0x10]\n"
+      "fmla v25.4s, v12.4s, v1.s[0]\n"
+      "ldr q14, [x7, #0x20]\n"
+      "fmla v26.4s, v12.4s, v2.s[0]\n"
+      "ldr q15, [x7, #0x30]\n"
+      "fmla v27.4s, v12.4s, v3.s[0]\n"
+      "add x14, x14, #0x10\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "fmla v28.4s, v12.4s, v4.s[0]\n"
+      "add x12, x12, #0x10\n"
+      "fmla v29.4s, v12.4s, v5.s[0]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "add x10, x10, #0x10\n"
+      "fmla v30.4s, v12.4s, v6.s[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x28, x28, #0x10\n"
+      "fmla v24.4s, v13.4s, v0.s[1]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      "fmla v25.4s, v13.4s, v1.s[1]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "add x24, x24, #0x10\n"
+      "fmla v26.4s, v13.4s, v2.s[1]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "add x22, x22, #0x10\n"
+      "fmla v27.4s, v13.4s, v3.s[1]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "add x7, x7, #0x40\n"
+      "fmla v28.4s, v13.4s, v4.s[1]\n"
+      "fmla v29.4s, v13.4s, v5.s[1]\n"
+      "fmla v30.4s, v13.4s, v6.s[1]\n"
+      "fmla v24.4s, v14.4s, v0.s[2]\n"
+      "fmla v25.4s, v14.4s, v1.s[2]\n"
+      "fmla v26.4s, v14.4s, v2.s[2]\n"
+      "fmla v27.4s, v14.4s, v3.s[2]\n"
+      "fmla v28.4s, v14.4s, v4.s[2]\n"
+      "fmla v29.4s, v14.4s, v5.s[2]\n"
+      "fmla v30.4s, v14.4s, v6.s[2]\n"
+      "fmla v24.4s, v15.4s, v0.s[3]\n"
+      "fmla v25.4s, v15.4s, v1.s[3]\n"
+      "fmla v26.4s, v15.4s, v2.s[3]\n"
+      "fmla v27.4s, v15.4s, v3.s[3]\n"
+      "fmla v28.4s, v15.4s, v4.s[3]\n"
+      "fmla v29.4s, v15.4s, v5.s[3]\n"
+      "fmla v30.4s, v15.4s, v6.s[3]\n"
+      "147:"  // Height 7: Multiply loop: Main loop skip
+      "cbz x15, 149f\n"
+      "148:"  // Height 7: Multiply loop: Odd block loop
+      "ldr s0, [x14], #0x4\n"
+      "ldr s1, [x12], #0x4\n"
+      "ldr s2, [x10], #0x4\n"
+      "ldr s3, [x28], #0x4\n"
+      "ldr s4, [x26], #0x4\n"
+      "ldr s5, [x24], #0x4\n"
+      "ldr s6, [x22], #0x4\n"
+      "ldr q16, [x7, #0x0]\n"
+      "fmla v24.4s, v16.4s, v0.s[0]\n"
+      "sub x15, x15, #0x1\n"
+      "fmla v25.4s, v16.4s, v1.s[0]\n"
+      "add x7, x7, #0x10\n"
+      "fmla v26.4s, v16.4s, v2.s[0]\n"
+      "fmla v27.4s, v16.4s, v3.s[0]\n"
+      "fmla v28.4s, v16.4s, v4.s[0]\n"
+      "fmla v29.4s, v16.4s, v5.s[0]\n"
+      "fmla v30.4s, v16.4s, v6.s[0]\n"
+      "cbnz x15, 148b\n"
+      "149:"  // Height 7: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x16, x16, #0x1\n"
+      "cmp x16, x19\n"
+      "bne 142b\n"
+      "prfm pstl1keep, [x17, #0x0]\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x11, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "tbz %x[flags], #1, 150f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v17.4s }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v16.4s }, [x19]\n"
+      "fmin v24.4s, v24.4s, v16.4s\n"
+      "fmin v25.4s, v25.4s, v16.4s\n"
+      "fmin v26.4s, v26.4s, v16.4s\n"
+      "fmin v27.4s, v27.4s, v16.4s\n"
+      "fmax v24.4s, v24.4s, v17.4s\n"
+      "fmax v25.4s, v25.4s, v17.4s\n"
+      "fmax v26.4s, v26.4s, v17.4s\n"
+      "fmax v27.4s, v27.4s, v17.4s\n"
+      "fmin v28.4s, v28.4s, v16.4s\n"
+      "fmin v29.4s, v29.4s, v16.4s\n"
+      "fmin v30.4s, v30.4s, v16.4s\n"
+      "fmax v28.4s, v28.4s, v17.4s\n"
+      "fmax v29.4s, v29.4s, v17.4s\n"
+      "fmax v30.4s, v30.4s, v17.4s\n"
+      "150:"  // Height 7: No activation
+      "cmp x6, #0x4\n"
+      "bge 153f\n"
+      "tbz x6, #1, 151f\n"
+      "str d24, [x17], #0x8\n"
+      "str d25, [x13], #0x8\n"
+      "str d26, [x11], #0x8\n"
+      "str d27, [x9], #0x8\n"
+      "str d28, [x27], #0x8\n"
+      "str d29, [x25], #0x8\n"
+      "str d30, [x23], #0x8\n"
+      "tbz x6, #0, 152f\n"
+      "st1 { v24.s }[2], [x17]\n"
+      "st1 { v25.s }[2], [x13]\n"
+      "st1 { v26.s }[2], [x11]\n"
+      "st1 { v27.s }[2], [x9]\n"
+      "st1 { v28.s }[2], [x27]\n"
+      "st1 { v29.s }[2], [x25]\n"
+      "st1 { v30.s }[2], [x23]\n"
+      "b 152f\n"
+      "151:"  // Height 7: Partial direct writeback: partial_1_0
+      "str s24, [x17, #0x0]\n"
+      "str s25, [x13, #0x0]\n"
+      "str s26, [x11, #0x0]\n"
+      "str s27, [x9, #0x0]\n"
+      "str s28, [x27, #0x0]\n"
+      "str s29, [x25, #0x0]\n"
+      "str s30, [x23, #0x0]\n"
+      "152:"  // Height 7: Partial direct writeback: Done
+      "b 154f\n"
+      "153:"  // Height 7: Full writeback
+      "str q24, [x17, #0x0]\n"
+      "str q25, [x13, #0x0]\n"
+      "str q26, [x11, #0x0]\n"
+      "str q27, [x9, #0x0]\n"
+      "str q28, [x27, #0x0]\n"
+      "str q29, [x25, #0x0]\n"
+      "str q30, [x23, #0x0]\n"
+      "add x17, x17, #0x10\n"
+      "add x13, x13, #0x10\n"
+      "add x11, x11, #0x10\n"
+      "add x9, x9, #0x10\n"
+      "add x27, x27, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "add x23, x23, #0x10\n"
+      "154:"  // Height 7: Writeback done
+      "subs x6, x6, #0x4\n"
+      "bgt 135b\n"
+      "b 178f\n"
+      "155:"  // Height 8
+      "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x8, %x[bias]\n"
+      "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 156f\n"
+      "ldr x17, [%x[output_ptr], #0x0]\n"
+      "add x17, x17, x19, LSL #2\n"
+      "ldr x13, [%x[output_ptr], #0x8]\n"
+      "ldr x11, [%x[output_ptr], #0x10]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x18]\n"
+      "ldr x27, [%x[output_ptr], #0x20]\n"
+      "add x11, x11, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x28]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "ldr x23, [%x[output_ptr], #0x30]\n"
+      "ldr x21, [%x[output_ptr], #0x38]\n"
+      "add x27, x27, x19, LSL #2\n"
+      "add x25, x25, x19, LSL #2\n"
+      "add %x[output_ptr], %x[output_ptr], #0x40\n"
+      "add x23, x23, x19, LSL #2\n"
+      "add x21, x21, x19, LSL #2\n"
+      "b 157f\n"
+      "156:"  // Height 8: setup direct output
+      "mov x17, %x[output_ptr]\n"
+      "add x13, x17, x19, LSL #2\n"
+      "add x11, x13, x19, LSL #2\n"
+      "add x9, x11, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "add x23, x25, x19, LSL #2\n"
+      "add x21, x23, x19, LSL #2\n"
+      "add %x[output_ptr], x21, x19, LSL #2\n"
+      "157:"  // Height 8: Column loop
+      "cbz x8, 158f\n"
+      "ldr q24, [x8, #0x0]\n"
+      "mov v25.16b, v24.16b\n"
+      "add x8, x8, #0x10\n"
+      "mov v26.16b, v24.16b\n"
+      "mov v27.16b, v24.16b\n"
+      "mov v28.16b, v24.16b\n"
+      "mov v29.16b, v24.16b\n"
+      "mov v30.16b, v24.16b\n"
+      "mov v31.16b, v24.16b\n"
+      "b 163f\n"
+      "158:"  // Height 8: no bias
+      "tbz %x[flags], #0, 162f\n"
+      "cmp x6, #0x4\n"
+      "bge 161f\n"
+      "tbz x6, #1, 159f\n"
+      "ldr d24, [x17], #0x8\n"
+      "ldr d25, [x13], #0x8\n"
+      "ldr d26, [x11], #0x8\n"
+      "ldr d27, [x9], #0x8\n"
+      "ldr d28, [x27], #0x8\n"
+      "ldr d29, [x25], #0x8\n"
+      "ldr d30, [x23], #0x8\n"
+      "ldr d31, [x21], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x6, #0, 160f\n"
+      "ld1 { v24.s }[2], [x17]\n"
+      "ld1 { v25.s }[2], [x13]\n"
+      "ld1 { v26.s }[2], [x11]\n"
+      "ld1 { v27.s }[2], [x9]\n"
+      "ld1 { v28.s }[2], [x27]\n"
+      "ld1 { v29.s }[2], [x25]\n"
+      "ld1 { v30.s }[2], [x23]\n"
+      "ld1 { v31.s }[2], [x21]\n"
+      "b 160f\n"
+      "159:"  // Height 8: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s24, [x17, #0x0]\n"
+      "ldr s25, [x13, #0x0]\n"
+      "ldr s26, [x11, #0x0]\n"
+      "ldr s27, [x9, #0x0]\n"
+      "ldr s28, [x27, #0x0]\n"
+      "ldr s29, [x25, #0x0]\n"
+      "ldr s30, [x23, #0x0]\n"
+      "ldr s31, [x21, #0x0]\n"
+      "160:"  // Height 8: Partial accumulate: Done
+      "sub x17, x17, x19\n"
+      "sub x13, x13, x19\n"
+      "sub x11, x11, x19\n"
+      "sub x9, x9, x19\n"
+      "sub x27, x27, x19\n"
+      "sub x25, x25, x19\n"
+      "sub x23, x23, x19\n"
+      "sub x21, x21, x19\n"
+      "b 163f\n"
+      "161:"  // Height 8: full accumulate
+      "ldr q24, [x17, #0x0]\n"
+      "ldr q25, [x13, #0x0]\n"
+      "ldr q26, [x11, #0x0]\n"
+      "ldr q27, [x9, #0x0]\n"
+      "ldr q28, [x27, #0x0]\n"
+      "ldr q29, [x25, #0x0]\n"
+      "ldr q30, [x23, #0x0]\n"
+      "ldr q31, [x21, #0x0]\n"
+      "b 163f\n"
+      "162:"  // Height 8: no accumulate
+      "movi v24.16b, #0x0\n"
+      "movi v25.16b, #0x0\n"
+      "movi v26.16b, #0x0\n"
+      "movi v27.16b, #0x0\n"
+      "movi v28.16b, #0x0\n"
+      "movi v29.16b, #0x0\n"
+      "movi v30.16b, #0x0\n"
+      "movi v31.16b, #0x0\n"
+      "163:"  // Height 8: setup done
+      "mov x16, #0x0\n"
+      "164:"  // Height 8: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w15, [x20, x16, LSL #0x2]\n"
+      "tbz %x[flags], #3, 165f\n"
+      "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x14, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "ldr x10, [x20, #0x10]\n"
+      "ldr x28, [x20, #0x18]\n"
+      "ldr x26, [x20, #0x20]\n"
+      "ldr x24, [x20, #0x28]\n"
+      "ldr x22, [x20, #0x30]\n"
+      "ldr x20, [x20, #0x38]\n"
+      "cbnz x16, 166f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x14, x14, x19, LSL #2\n"
+      "add x12, x12, x19, LSL #2\n"
+      "add x10, x10, x19, LSL #2\n"
+      "add x28, x28, x19, LSL #2\n"
+      "add x26, x26, x19, LSL #2\n"
+      "add x24, x24, x19, LSL #2\n"
+      "add x22, x22, x19, LSL #2\n"
+      "add x20, x20, x19, LSL #2\n"
+      "b 166f\n"
+      "165:"  // Height 8: setup direct input
+      "mov x14, %x[input_ptr]\n"
+      "add x12, x14, x19, LSL #2\n"
+      "add x10, x12, x19, LSL #2\n"
+      "add x28, x10, x19, LSL #2\n"
+      "add x26, x28, x19, LSL #2\n"
+      "add x24, x26, x19, LSL #2\n"
+      "add x22, x24, x19, LSL #2\n"
+      "add x20, x22, x19, LSL #2\n"
+      "166:"  // Height 8: input setup done
+      "cmp x15, #0x4\n"
+      "blt 169f\n"
+      "cmp x15, #0x8\n"
+      "blt 168f\n"
+      "167:"  // Height 8: Multiply loop: Main loop head
+      "ldr q0, [x14, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q2, [x10, #0x0]\n"
+      "ldr q3, [x28, #0x0]\n"
+      "ldr q4, [x26, #0x0]\n"
+      "ldr q5, [x24, #0x0]\n"
+      "ldr q6, [x22, #0x0]\n"
+      "ldr q7, [x20, #0x0]\n"
+      "ldr q8, [x7, #0x0]\n"
+      "fmla v24.4s, v8.4s, v0.s[0]\n"
+      "ldr q9, [x7, #0x10]\n"
+      "fmla v25.4s, v8.4s, v1.s[0]\n"
+      "ldr q10, [x7, #0x20]\n"
+      "fmla v26.4s, v8.4s, v2.s[0]\n"
+      "ldr q11, [x7, #0x30]\n"
+      "fmla v27.4s, v8.4s, v3.s[0]\n"
+      "add x14, x14, #0x10\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "fmla v28.4s, v8.4s, v4.s[0]\n"
+      "add x12, x12, #0x10\n"
+      "fmla v29.4s, v8.4s, v5.s[0]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "add x10, x10, #0x10\n"
+      "fmla v30.4s, v8.4s, v6.s[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x28, x28, #0x10\n"
+      "fmla v31.4s, v8.4s, v7.s[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      "fmla v24.4s, v9.4s, v0.s[1]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "add x24, x24, #0x10\n"
+      "fmla v25.4s, v9.4s, v1.s[1]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "add x22, x22, #0x10\n"
+      "fmla v26.4s, v9.4s, v2.s[1]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "add x20, x20, #0x10\n"
+      "fmla v27.4s, v9.4s, v3.s[1]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "sub x15, x15, #0x4\n"
+      "fmla v28.4s, v9.4s, v4.s[1]\n"
+      "cmp x15, #0x8\n"
+      "fmla v29.4s, v9.4s, v5.s[1]\n"
+      "add x7, x7, #0x40\n"
+      "fmla v30.4s, v9.4s, v6.s[1]\n"
+      "fmla v31.4s, v9.4s, v7.s[1]\n"
+      "fmla v24.4s, v10.4s, v0.s[2]\n"
+      "fmla v25.4s, v10.4s, v1.s[2]\n"
+      "fmla v26.4s, v10.4s, v2.s[2]\n"
+      "fmla v27.4s, v10.4s, v3.s[2]\n"
+      "fmla v28.4s, v10.4s, v4.s[2]\n"
+      "fmla v29.4s, v10.4s, v5.s[2]\n"
+      "fmla v30.4s, v10.4s, v6.s[2]\n"
+      "fmla v31.4s, v10.4s, v7.s[2]\n"
+      "fmla v24.4s, v11.4s, v0.s[3]\n"
+      "fmla v25.4s, v11.4s, v1.s[3]\n"
+      "fmla v26.4s, v11.4s, v2.s[3]\n"
+      "fmla v27.4s, v11.4s, v3.s[3]\n"
+      "fmla v28.4s, v11.4s, v4.s[3]\n"
+      "fmla v29.4s, v11.4s, v5.s[3]\n"
+      "fmla v30.4s, v11.4s, v6.s[3]\n"
+      "fmla v31.4s, v11.4s, v7.s[3]\n"
+      "bge 167b\n"
+      "168:"  // Height 8: Multiply loop: Single iteration only
+      "sub x15, x15, #0x4\n"
+      "ldr q0, [x14, #0x0]\n"
+      "ldr q1, [x12, #0x0]\n"
+      "ldr q2, [x10, #0x0]\n"
+      "ldr q3, [x28, #0x0]\n"
+      "ldr q4, [x26, #0x0]\n"
+      "ldr q5, [x24, #0x0]\n"
+      "ldr q6, [x22, #0x0]\n"
+      "ldr q7, [x20, #0x0]\n"
+      "ldr q12, [x7, #0x0]\n"
+      "fmla v24.4s, v12.4s, v0.s[0]\n"
+      "ldr q13, [x7, #0x10]\n"
+      "fmla v25.4s, v12.4s, v1.s[0]\n"
+      "ldr q14, [x7, #0x20]\n"
+      "fmla v26.4s, v12.4s, v2.s[0]\n"
+      "ldr q15, [x7, #0x30]\n"
+      "fmla v27.4s, v12.4s, v3.s[0]\n"
+      "add x14, x14, #0x10\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "fmla v28.4s, v12.4s, v4.s[0]\n"
+      "add x12, x12, #0x10\n"
+      "fmla v29.4s, v12.4s, v5.s[0]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "add x10, x10, #0x10\n"
+      "fmla v30.4s, v12.4s, v6.s[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x28, x28, #0x10\n"
+      "fmla v31.4s, v12.4s, v7.s[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      "fmla v24.4s, v13.4s, v0.s[1]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "add x24, x24, #0x10\n"
+      "fmla v25.4s, v13.4s, v1.s[1]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "add x22, x22, #0x10\n"
+      "fmla v26.4s, v13.4s, v2.s[1]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "add x20, x20, #0x10\n"
+      "fmla v27.4s, v13.4s, v3.s[1]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "add x7, x7, #0x40\n"
+      "fmla v28.4s, v13.4s, v4.s[1]\n"
+      "fmla v29.4s, v13.4s, v5.s[1]\n"
+      "fmla v30.4s, v13.4s, v6.s[1]\n"
+      "fmla v31.4s, v13.4s, v7.s[1]\n"
+      "fmla v24.4s, v14.4s, v0.s[2]\n"
+      "fmla v25.4s, v14.4s, v1.s[2]\n"
+      "fmla v26.4s, v14.4s, v2.s[2]\n"
+      "fmla v27.4s, v14.4s, v3.s[2]\n"
+      "fmla v28.4s, v14.4s, v4.s[2]\n"
+      "fmla v29.4s, v14.4s, v5.s[2]\n"
+      "fmla v30.4s, v14.4s, v6.s[2]\n"
+      "fmla v31.4s, v14.4s, v7.s[2]\n"
+      "fmla v24.4s, v15.4s, v0.s[3]\n"
+      "fmla v25.4s, v15.4s, v1.s[3]\n"
+      "fmla v26.4s, v15.4s, v2.s[3]\n"
+      "fmla v27.4s, v15.4s, v3.s[3]\n"
+      "fmla v28.4s, v15.4s, v4.s[3]\n"
+      "fmla v29.4s, v15.4s, v5.s[3]\n"
+      "fmla v30.4s, v15.4s, v6.s[3]\n"
+      "fmla v31.4s, v15.4s, v7.s[3]\n"
+      "169:"  // Height 8: Multiply loop: Main loop skip
+      "cbz x15, 171f\n"
+      "170:"  // Height 8: Multiply loop: Odd block loop
+      "ldr s0, [x14], #0x4\n"
+      "ldr s1, [x12], #0x4\n"
+      "ldr s2, [x10], #0x4\n"
+      "ldr s3, [x28], #0x4\n"
+      "ldr s4, [x26], #0x4\n"
+      "ldr s5, [x24], #0x4\n"
+      "ldr s6, [x22], #0x4\n"
+      "ldr s7, [x20], #0x4\n"
+      "ldr q16, [x7, #0x0]\n"
+      "fmla v24.4s, v16.4s, v0.s[0]\n"
+      "sub x15, x15, #0x1\n"
+      "fmla v25.4s, v16.4s, v1.s[0]\n"
+      "add x7, x7, #0x10\n"
+      "fmla v26.4s, v16.4s, v2.s[0]\n"
+      "fmla v27.4s, v16.4s, v3.s[0]\n"
+      "fmla v28.4s, v16.4s, v4.s[0]\n"
+      "fmla v29.4s, v16.4s, v5.s[0]\n"
+      "fmla v30.4s, v16.4s, v6.s[0]\n"
+      "fmla v31.4s, v16.4s, v7.s[0]\n"
+      "cbnz x15, 170b\n"
+      "171:"  // Height 8: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x16, x16, #0x1\n"
+      "cmp x16, x19\n"
+      "bne 164b\n"
+      "prfm pstl1keep, [x17, #0x0]\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x11, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "prfm pstl1keep, [x21, #0x0]\n"
+      "tbz %x[flags], #1, 172f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1r { v17.4s }, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1r { v16.4s }, [x19]\n"
+      "fmin v24.4s, v24.4s, v16.4s\n"
+      "fmin v25.4s, v25.4s, v16.4s\n"
+      "fmin v26.4s, v26.4s, v16.4s\n"
+      "fmin v27.4s, v27.4s, v16.4s\n"
+      "fmax v24.4s, v24.4s, v17.4s\n"
+      "fmax v25.4s, v25.4s, v17.4s\n"
+      "fmax v26.4s, v26.4s, v17.4s\n"
+      "fmax v27.4s, v27.4s, v17.4s\n"
+      "fmin v28.4s, v28.4s, v16.4s\n"
+      "fmin v29.4s, v29.4s, v16.4s\n"
+      "fmin v30.4s, v30.4s, v16.4s\n"
+      "fmax v28.4s, v28.4s, v17.4s\n"
+      "fmax v29.4s, v29.4s, v17.4s\n"
+      "fmax v30.4s, v30.4s, v17.4s\n"
+      "fmin v31.4s, v31.4s, v16.4s\n"
+      "fmax v31.4s, v31.4s, v17.4s\n"
+      "172:"  // Height 8: No activation
+      "cmp x6, #0x4\n"
+      "bge 175f\n"
+      "tbz x6, #1, 173f\n"
+      "str d24, [x17], #0x8\n"
+      "str d25, [x13], #0x8\n"
+      "str d26, [x11], #0x8\n"
+      "str d27, [x9], #0x8\n"
+      "str d28, [x27], #0x8\n"
+      "str d29, [x25], #0x8\n"
+      "str d30, [x23], #0x8\n"
+      "str d31, [x21], #0x8\n"
+      "tbz x6, #0, 174f\n"
+      "st1 { v24.s }[2], [x17]\n"
+      "st1 { v25.s }[2], [x13]\n"
+      "st1 { v26.s }[2], [x11]\n"
+      "st1 { v27.s }[2], [x9]\n"
+      "st1 { v28.s }[2], [x27]\n"
+      "st1 { v29.s }[2], [x25]\n"
+      "st1 { v30.s }[2], [x23]\n"
+      "st1 { v31.s }[2], [x21]\n"
+      "b 174f\n"
+      "173:"  // Height 8: Partial direct writeback: partial_1_0
+      "str s24, [x17, #0x0]\n"
+      "str s25, [x13, #0x0]\n"
+      "str s26, [x11, #0x0]\n"
+      "str s27, [x9, #0x0]\n"
+      "str s28, [x27, #0x0]\n"
+      "str s29, [x25, #0x0]\n"
+      "str s30, [x23, #0x0]\n"
+      "str s31, [x21, #0x0]\n"
+      "174:"  // Height 8: Partial direct writeback: Done
+      "b 176f\n"
+      "175:"  // Height 8: Full writeback
+      "str q24, [x17, #0x0]\n"
+      "str q25, [x13, #0x0]\n"
+      "str q26, [x11, #0x0]\n"
+      "str q27, [x9, #0x0]\n"
+      "str q28, [x27, #0x0]\n"
+      "str q29, [x25, #0x0]\n"
+      "str q30, [x23, #0x0]\n"
+      "str q31, [x21, #0x0]\n"
+      "add x17, x17, #0x10\n"
+      "add x13, x13, #0x10\n"
+      "add x11, x11, #0x10\n"
+      "add x9, x9, #0x10\n"
+      "add x27, x27, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "add x23, x23, #0x10\n"
+      "add x21, x21, #0x10\n"
+      "176:"  // Height 8: Writeback done
+      "subs x6, x6, #0x4\n"
+      "bgt 157b\n"
+      "subs %x[M], %x[M], #0x8\n"
+      "beq 178f\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "tbz %x[flags], #3, 177f\n"
+      "add x20, x20, #0x8\n"
+      "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "b 1b\n"
+      "177:"  // Update direct input
+      "mov x19, #0x20\n"
+      "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+      "b 1b\n"
+      "178:"  // Exit
+
+      : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+      : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+    );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
index a23101a7ce..4bb7a1e0eb 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,38 +10,43 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
  */
 #pragma once
-
 #ifdef __aarch64__
 
-#include <cstdint>
 #include "../std_transforms_fixed.hpp"
 
+#define ARGLIST  /* parameter types shared by the kernel prototype and the kern_type typedef */ \
+   unsigned int, const unsigned int *, /* num_strings, string_lengths */ \
+   IndirectInputArg<int8_t>, /* A_arg */ \
+   size_t, size_t, /* M, N */ \
+   const int8_t *, /* B_ptr */ \
+   IndirectOutputArg<int8_t>, /* output_arg */ \
+   const Requantize32 *, const int32_t *, unsigned int /* qp, col_bias, trailing unnamed argument */
+
 namespace arm_gemm
 {
 
 // Actual kernel implementations
-void a64_hybrid_s8s32_dot_16x4(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
-void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+void a64_hybrid_s8qa_dot_4x16( ARGLIST );
 
-class hybrid_s8s32_dot_16x4
+class cls_a64_hybrid_s8qa_dot_4x16
 {
 public:
     typedef int8_t operand_type;
-    typedef int32_t result_type;
+    typedef int8_t result_type;
 
-    typedef void (*kern_type)(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+    typedef void (*kern_type)( ARGLIST );
 
     /* Kernel blocking parameters */
     static constexpr unsigned int out_height()
@@ -61,32 +66,20 @@ public:
 
     static constexpr bool supports_accumulate()
     {
-        return true;
-    }
-
-    static constexpr bool supports_bias()
-    {
-        return false;
-    }
-
-    static constexpr bool supports_activation()
-    {
         return false;
     }
 
     StdTransformsFixed<operand_type, result_type, 4, 16, 4> transforms = {};
 
     // Default to the generic kernel
-    kern_type kernel=a64_hybrid_s8s32_dot_16x4;
+    kern_type kernel=a64_hybrid_s8qa_dot_4x16;
 
-    hybrid_s8s32_dot_16x4(const CPUInfo *ci)
+    cls_a64_hybrid_s8qa_dot_4x16(const CPUInfo *)
     {
-        if (ci->get_cpu_model() == CPUModel::A55r1) {
-            kernel = a64_hybrid_s8s32_dot_16x4_a55;
-        }
     }
 };
 
 } // namespace arm_gemm
 
+#undef ARGLIST
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp
new file mode 100644
index 0000000000..3fb365bc1e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp
@@ -0,0 +1,2072 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_hybrid_s8qa_dot_4x16 (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+    size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg,
+    const Requantize32 *qp, const int32_t *col_bias, unsigned int
+)
+{
+    struct KernelArgs { // Argument block handed to the assembly as args_ptr; fields are read via offsetof() immediates.
+        unsigned int num_strings = {}; // Copy of the num_strings parameter; the string loop runs until its counter equals this.
+        const unsigned int *string_lengths = {}; // Copy of the string_lengths parameter; indexed by the string counter to get that string's length.
+        size_t N = {}; // Copy of the N parameter; loaded as the column counter, reduced by 0x10 per column-loop pass.
+        const int8_t *B_ptr = {}; // Copy of the B_ptr parameter; loaded as the B operand read pointer.
+        size_t output_offset = {}; // output_arg.indirect.offset when output is indirect, otherwise output_arg.direct.stride.
+        size_t input_initial_col = {}; // A_arg.indirect.start_col; only assigned for indirect input, added to the row pointer of string 0 only.
+        size_t input_offset = {}; // A_arg.indirect.start_row when input is indirect, otherwise A_arg.direct.stride.
+    } ka; // Single instance; populated below before the asm block runs.
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    if (qp->c_offset > qp->minval) {
+        flags |= 0x20;
+    }
+    __asm__ __volatile__(
+
+      "1:"  // Row loop
+      "cmp %x[M], #0x4\n"
+      "bge 94f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 63f\n"
+      "beq 32f\n"
+      "movi v11.4s, #0x0\n"
+      "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+      "movi v12.4s, #0x0\n"
+      "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x10, %x[col_bias]\n"
+      "movi v13.4s, #0x0\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "bic %x[flags], %x[flags], #0x80000000\n"
+      "movi v14.4s, #0x0\n"
+      "movi v15.16b, #0x1\n"
+      "tbz %x[flags], #2, 2f\n"
+      "ldr x9, [%x[output_ptr], #0x0]\n"
+      "add x9, x9, x19\n"
+      "b 3f\n"
+      "2:"  // Height 1: setup direct output
+      "mov x9, %x[output_ptr]\n"
+      "3:"  // Height 1: Column loop
+      "movi v16.4s, #0x0\n"
+      "movi v17.4s, #0x0\n"
+      "movi v18.4s, #0x0\n"
+      "movi v19.4s, #0x0\n"
+      "4:"  // Height 1: setup done
+      "mov x28, #0x0\n"
+      "5:"  // Height 1: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w27, [x20, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 6f\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "cbnz x28, 7f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "b 7f\n"
+      "6:"  // Height 1: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "7:"  // Height 1: input setup done
+      "cmp x27, #0x10\n"
+      "blt 12f\n"
+      "cmp x27, #0x20\n"
+      "blt 10f\n"
+      "8:"  // Height 1: Multiply loop: Main loop head
+      "ldr q0, [x26, #0x0]\n"
+      "ldr q4, [x11, #0x0]\n"
+      ".inst 0x4f80e090  // sdot v16.4s, v4.16b, v0.4b[0]\n"
+      "ldr q5, [x11, #0x10]\n"
+      "ldr q6, [x11, #0x20]\n"
+      ".inst 0x4f80e0b1  // sdot v17.4s, v5.16b, v0.4b[0]\n"
+      "ldr q7, [x11, #0x30]\n"
+      ".inst 0x4f80e0d2  // sdot v18.4s, v6.16b, v0.4b[0]\n"
+      "ldr q8, [x11, #0x40]\n"
+      "ldr q9, [x11, #0x50]\n"
+      ".inst 0x4f80e0f3  // sdot v19.4s, v7.16b, v0.4b[0]\n"
+      "ldr q10, [x11, #0x60]\n"
+      "ldr q4, [x11, #0x70]\n"
+      ".inst 0x4fa0e110  // sdot v16.4s, v8.16b, v0.4b[1]\n"
+      ".inst 0x4fa0e131  // sdot v17.4s, v9.16b, v0.4b[1]\n"
+      "ldr q5, [x11, #0x80]\n"
+      "ldr q6, [x11, #0x90]\n"
+      ".inst 0x4fa0e152  // sdot v18.4s, v10.16b, v0.4b[1]\n"
+      "ldr q7, [x11, #0xa0]\n"
+      ".inst 0x4fa0e093  // sdot v19.4s, v4.16b, v0.4b[1]\n"
+      "ldr q8, [x11, #0xb0]\n"
+      "ldr q9, [x11, #0xc0]\n"
+      ".inst 0x4f80e8b0  // sdot v16.4s, v5.16b, v0.4b[2]\n"
+      ".inst 0x4f80e8d1  // sdot v17.4s, v6.16b, v0.4b[2]\n"
+      "ldr q10, [x11, #0xd0]\n"
+      "ldr q4, [x11, #0xe0]\n"
+      ".inst 0x4f80e8f2  // sdot v18.4s, v7.16b, v0.4b[2]\n"
+      "ldr q5, [x11, #0xf0]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x4f80e913  // sdot v19.4s, v8.16b, v0.4b[2]\n"
+      ".inst 0x4fa0e930  // sdot v16.4s, v9.16b, v0.4b[3]\n"
+      "add x11, x11, #0x100\n"
+      ".inst 0x4fa0e951  // sdot v17.4s, v10.16b, v0.4b[3]\n"
+      ".inst 0x4fa0e892  // sdot v18.4s, v4.16b, v0.4b[3]\n"
+      ".inst 0x4fa0e8b3  // sdot v19.4s, v5.16b, v0.4b[3]\n"
+      "tbnz %x[flags], #31, 9f\n"
+      ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
+      "9:"  // Height 1: Multiply loop: unique 1: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "sub x27, x27, #0x10\n"
+      "cmp x27, #0x20\n"
+      "bge 8b\n"
+      "10:"  // Height 1: Multiply loop: Single iteration only
+      "sub x27, x27, #0x10\n"
+      "ldr q0, [x26, #0x0]\n"
+      "ldr q6, [x11, #0x0]\n"
+      ".inst 0x4f80e0d0  // sdot v16.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x11, #0x10]\n"
+      "ldr q8, [x11, #0x20]\n"
+      ".inst 0x4f80e0f1  // sdot v17.4s, v7.16b, v0.4b[0]\n"
+      "ldr q9, [x11, #0x30]\n"
+      ".inst 0x4f80e112  // sdot v18.4s, v8.16b, v0.4b[0]\n"
+      "ldr q10, [x11, #0x40]\n"
+      "ldr q4, [x11, #0x50]\n"
+      ".inst 0x4f80e133  // sdot v19.4s, v9.16b, v0.4b[0]\n"
+      "ldr q5, [x11, #0x60]\n"
+      "ldr q6, [x11, #0x70]\n"
+      ".inst 0x4fa0e150  // sdot v16.4s, v10.16b, v0.4b[1]\n"
+      ".inst 0x4fa0e091  // sdot v17.4s, v4.16b, v0.4b[1]\n"
+      "ldr q7, [x11, #0x80]\n"
+      "ldr q8, [x11, #0x90]\n"
+      ".inst 0x4fa0e0b2  // sdot v18.4s, v5.16b, v0.4b[1]\n"
+      "ldr q9, [x11, #0xa0]\n"
+      ".inst 0x4fa0e0d3  // sdot v19.4s, v6.16b, v0.4b[1]\n"
+      "ldr q10, [x11, #0xb0]\n"
+      "ldr q4, [x11, #0xc0]\n"
+      ".inst 0x4f80e8f0  // sdot v16.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f80e911  // sdot v17.4s, v8.16b, v0.4b[2]\n"
+      "ldr q5, [x11, #0xd0]\n"
+      "ldr q6, [x11, #0xe0]\n"
+      ".inst 0x4f80e932  // sdot v18.4s, v9.16b, v0.4b[2]\n"
+      "ldr q7, [x11, #0xf0]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x4f80e953  // sdot v19.4s, v10.16b, v0.4b[2]\n"
+      ".inst 0x4fa0e890  // sdot v16.4s, v4.16b, v0.4b[3]\n"
+      "add x11, x11, #0x100\n"
+      ".inst 0x4fa0e8b1  // sdot v17.4s, v5.16b, v0.4b[3]\n"
+      ".inst 0x4fa0e8d2  // sdot v18.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa0e8f3  // sdot v19.4s, v7.16b, v0.4b[3]\n"
+      "tbnz %x[flags], #31, 11f\n"
+      ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
+      "11:"  // Height 1: Multiply loop: unique 2: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "12:"  // Height 1: Multiply loop: Main loop skip
+      "cbz x27, 19f\n"
+      "cmp x27, #0x4\n"
+      "blt 15f\n"
+      "13:"  // Height 1: Multiply loop: Odd block loop
+      "ldr s0, [x26], #0x4\n"
+      "tbnz %x[flags], #31, 14f\n"
+      ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
+      "14:"  // Height 1: Multiply loop: unique 3: skip row sum
+      "ldr q8, [x11, #0x0]\n"
+      ".inst 0x4f80e110  // sdot v16.4s, v8.16b, v0.4b[0]\n"
+      "ldr q9, [x11, #0x10]\n"
+      "ldr q10, [x11, #0x20]\n"
+      ".inst 0x4f80e131  // sdot v17.4s, v9.16b, v0.4b[0]\n"
+      "ldr q4, [x11, #0x30]\n"
+      ".inst 0x4f80e152  // sdot v18.4s, v10.16b, v0.4b[0]\n"
+      "sub x27, x27, #0x4\n"
+      "add x11, x11, #0x40\n"
+      ".inst 0x4f80e093  // sdot v19.4s, v4.16b, v0.4b[0]\n"
+      "cmp x27, #0x4\n"
+      "bge 13b\n"
+      "cbz x27, 19f\n"
+      "15:"  // Height 1: Multiply loop: Skip odd blocks
+      "tbz x27, #1, 16f\n"
+      "ldr h0, [x26], #0x2\n"
+      "tbz x27, #0, 17f\n"
+      "ld1 { v0.b }[2], [x26]\n"
+      "b 17f\n"
+      "16:"  // Height 1: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b0, [x26, #0x0]\n"
+      "17:"  // Height 1: Multiply loop: Ragged operand read: Done
+      "tbnz %x[flags], #31, 18f\n"
+      ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
+      "18:"  // Height 1: Multiply loop: unique 4: skip row sum
+      "ldr q5, [x11, #0x0]\n"
+      ".inst 0x4f80e0b0  // sdot v16.4s, v5.16b, v0.4b[0]\n"
+      "ldr q6, [x11, #0x10]\n"
+      "ldr q7, [x11, #0x20]\n"
+      ".inst 0x4f80e0d1  // sdot v17.4s, v6.16b, v0.4b[0]\n"
+      "ldr q8, [x11, #0x30]\n"
+      ".inst 0x4f80e0f2  // sdot v18.4s, v7.16b, v0.4b[0]\n"
+      "add x11, x11, #0x40\n"
+      ".inst 0x4f80e113  // sdot v19.4s, v8.16b, v0.4b[0]\n"
+      "19:"  // Height 1: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x28, x28, #0x1\n"
+      "cmp x28, x19\n"
+      "bne 5b\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "tbnz %x[flags], #31, 20f\n"
+      "addp v11.4s, v11.4s, v11.4s\n"
+      "add x19, %x[qp], %[b_offset]\n"
+      "addp v11.4s, v11.4s, v11.4s\n"
+      "ld1r { v1.4s }, [x19]\n"
+      "neg v1.4s, v1.4s\n"
+      "mul v11.4s, v11.4s, v1.4s\n"
+      "20:"  // Height 1: skip row sum fixup
+      "add v16.4s, v16.4s, v11.4s\n"
+      "orr %x[flags], %x[flags], #0x80000000\n"
+      "add v17.4s, v17.4s, v11.4s\n"
+      "ldr q0, [x10, #0x0]\n"
+      "add v18.4s, v18.4s, v11.4s\n"
+      "ldr q1, [x10, #0x10]\n"
+      "add v19.4s, v19.4s, v11.4s\n"
+      "ldr q2, [x10, #0x20]\n"
+      "ldr q3, [x10, #0x30]\n"
+      "add v16.4s, v16.4s, v0.4s\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add v17.4s, v17.4s, v1.4s\n"
+      "add x19, %x[qp], %[per_layer_mul]\n"
+      "add v18.4s, v18.4s, v2.4s\n"
+      "ld1r { v4.4s }, [x19]\n"
+      "add x10, x10, #0x40\n"
+      "add v19.4s, v19.4s, v3.4s\n"
+      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+      "tbz %x[flags], #5, 21f\n"
+      "and v4.16b, v16.16b, v0.16b\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "and v5.16b, v17.16b, v0.16b\n"
+      "and v6.16b, v18.16b, v0.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "and v7.16b, v19.16b, v0.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v4.4s\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sqadd v17.4s, v17.4s, v5.4s\n"
+      "sqadd v18.4s, v18.4s, v6.4s\n"
+      "sqadd v19.4s, v19.4s, v7.4s\n"
+      "21:"  // Height 1: no shift correction
+      "srshl v16.4s, v16.4s, v0.4s\n"
+      "add x19, %x[qp], %[c_offset]\n"
+      "ld1r { v4.4s }, [x19]\n"
+      "srshl v17.4s, v17.4s, v0.4s\n"
+      "add x19, %x[qp], %[minval]\n"
+      "srshl v18.4s, v18.4s, v0.4s\n"
+      "ld1r { v5.4s }, [x19]\n"
+      "add x19, %x[qp], %[maxval]\n"
+      "srshl v19.4s, v19.4s, v0.4s\n"
+      "ld1r { v6.4s }, [x19]\n"
+      "cmp x12, #0x10\n"
+      "add v16.4s, v16.4s, v4.4s\n"
+      "add v17.4s, v17.4s, v4.4s\n"
+      "add v18.4s, v18.4s, v4.4s\n"
+      "add v19.4s, v19.4s, v4.4s\n"
+      "smin v16.4s, v16.4s, v6.4s\n"
+      "smin v17.4s, v17.4s, v6.4s\n"
+      "smin v18.4s, v18.4s, v6.4s\n"
+      "smax v16.4s, v16.4s, v5.4s\n"
+      "smax v17.4s, v17.4s, v5.4s\n"
+      "smax v18.4s, v18.4s, v5.4s\n"
+      "smin v19.4s, v19.4s, v6.4s\n"
+      "uzp1 v16.8h, v16.8h, v17.8h\n"
+      "smax v19.4s, v19.4s, v5.4s\n"
+      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v16.16b, v16.16b, v17.16b\n"
+      "bge 30f\n"
+      "tbz x12, #3, 25f\n"
+      "str d16, [x9], #0x8\n"
+      "tbz x12, #2, 23f\n"
+      "st1 { v16.s }[2], [x9], #0x4\n"
+      "tbz x12, #1, 22f\n"
+      "st1 { v16.h }[6], [x9], #0x2\n"
+      "tbz x12, #0, 29f\n"
+      "st1 { v16.b }[14], [x9]\n"
+      "b 29f\n"
+      "22:"  // Height 1: Partial direct writeback: partial_1_12
+      "tbz x12, #0, 29f\n"
+      "st1 { v16.b }[12], [x9]\n"
+      "b 29f\n"
+      "23:"  // Height 1: Partial direct writeback: partial_2_8
+      "tbz x12, #1, 24f\n"
+      "st1 { v16.h }[4], [x9], #0x2\n"
+      "tbz x12, #0, 29f\n"
+      "st1 { v16.b }[10], [x9]\n"
+      "b 29f\n"
+      "24:"  // Height 1: Partial direct writeback: partial_1_8
+      "tbz x12, #0, 29f\n"
+      "st1 { v16.b }[8], [x9]\n"
+      "b 29f\n"
+      "25:"  // Height 1: Partial direct writeback: partial_4_0
+      "tbz x12, #2, 27f\n"
+      "str s16, [x9], #0x4\n"
+      "tbz x12, #1, 26f\n"
+      "st1 { v16.h }[2], [x9], #0x2\n"
+      "tbz x12, #0, 29f\n"
+      "st1 { v16.b }[6], [x9]\n"
+      "b 29f\n"
+      "26:"  // Height 1: Partial direct writeback: partial_1_4
+      "tbz x12, #0, 29f\n"
+      "st1 { v16.b }[4], [x9]\n"
+      "b 29f\n"
+      "27:"  // Height 1: Partial direct writeback: partial_2_0
+      "tbz x12, #1, 28f\n"
+      "str h16, [x9], #0x2\n"
+      "tbz x12, #0, 29f\n"
+      "st1 { v16.b }[2], [x9]\n"
+      "b 29f\n"
+      "28:"  // Height 1: Partial direct writeback: partial_1_0
+      "str b16, [x9, #0x0]\n"
+      "29:"  // Height 1: Partial direct writeback: Done
+      "b 31f\n"
+      "30:"  // Height 1: Full writeback
+      "str q16, [x9, #0x0]\n"
+      "add x9, x9, #0x10\n"
+      "31:"  // Height 1: Writeback done
+      "subs x12, x12, #0x10\n"
+      "bgt 3b\n"
+      "b 126f\n"
+      "32:"  // Height 2
+      "movi v11.4s, #0x0\n"
+      "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x10, %x[col_bias]\n"
+      "movi v12.4s, #0x0\n"
+      "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "bic %x[flags], %x[flags], #0x80000000\n"
+      "movi v13.4s, #0x0\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "movi v14.4s, #0x0\n"
+      "movi v15.16b, #0x1\n"
+      "tbz %x[flags], #2, 33f\n"
+      "ldr x9, [%x[output_ptr], #0x0]\n"
+      "ldr x25, [%x[output_ptr], #0x8]\n"
+      "add x9, x9, x19\n"
+      "add x25, x25, x19\n"
+      "b 34f\n"
+      "33:"  // Height 2: setup direct output
+      "mov x9, %x[output_ptr]\n"
+      "add x25, x9, x19\n"
+      "34:"  // Height 2: Column loop
+      "movi v16.4s, #0x0\n"
+      "movi v17.4s, #0x0\n"
+      "movi v18.4s, #0x0\n"
+      "movi v19.4s, #0x0\n"
+      "movi v20.4s, #0x0\n"
+      "movi v21.4s, #0x0\n"
+      "movi v22.4s, #0x0\n"
+      "movi v23.4s, #0x0\n"
+      "35:"  // Height 2: setup done
+      "mov x28, #0x0\n"
+      "36:"  // Height 2: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w27, [x20, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 37f\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x24, [x20, #0x8]\n"
+      "cbnz x28, 38f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "b 38f\n"
+      "37:"  // Height 2: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "add x24, x26, x19\n"
+      "38:"  // Height 2: input setup done
+      "cmp x27, #0x10\n"
+      "blt 43f\n"
+      "cmp x27, #0x20\n"
+      "blt 41f\n"
+      "39:"  // Height 2: Multiply loop: Main loop head
+      "ldr q0, [x26, #0x0]\n"
+      "ldr q1, [x24, #0x0]\n"
+      "ldr q4, [x11, #0x0]\n"
+      ".inst 0x4f80e090  // sdot v16.4s, v4.16b, v0.4b[0]\n"
+      "ldr q5, [x11, #0x10]\n"
+      ".inst 0x4f81e094  // sdot v20.4s, v4.16b, v1.4b[0]\n"
+      "ldr q6, [x11, #0x20]\n"
+      "ldr q7, [x11, #0x30]\n"
+      ".inst 0x4f80e0b1  // sdot v17.4s, v5.16b, v0.4b[0]\n"
+      "ldr q8, [x11, #0x40]\n"
+      ".inst 0x4f81e0b5  // sdot v21.4s, v5.16b, v1.4b[0]\n"
+      "ldr q9, [x11, #0x50]\n"
+      ".inst 0x4f80e0d2  // sdot v18.4s, v6.16b, v0.4b[0]\n"
+      "ldr q10, [x11, #0x60]\n"
+      ".inst 0x4f81e0d6  // sdot v22.4s, v6.16b, v1.4b[0]\n"
+      "ldr q4, [x11, #0x70]\n"
+      ".inst 0x4f80e0f3  // sdot v19.4s, v7.16b, v0.4b[0]\n"
+      "ldr q5, [x11, #0x80]\n"
+      ".inst 0x4f81e0f7  // sdot v23.4s, v7.16b, v1.4b[0]\n"
+      "ldr q6, [x11, #0x90]\n"
+      ".inst 0x4fa0e110  // sdot v16.4s, v8.16b, v0.4b[1]\n"
+      "ldr q7, [x11, #0xa0]\n"
+      ".inst 0x4fa1e114  // sdot v20.4s, v8.16b, v1.4b[1]\n"
+      "ldr q8, [x11, #0xb0]\n"
+      ".inst 0x4fa0e131  // sdot v17.4s, v9.16b, v0.4b[1]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x4fa1e135  // sdot v21.4s, v9.16b, v1.4b[1]\n"
+      "ldr q9, [x11, #0xc0]\n"
+      ".inst 0x4fa0e152  // sdot v18.4s, v10.16b, v0.4b[1]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x4fa1e156  // sdot v22.4s, v10.16b, v1.4b[1]\n"
+      "ldr q10, [x11, #0xd0]\n"
+      ".inst 0x4fa0e093  // sdot v19.4s, v4.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e097  // sdot v23.4s, v4.16b, v1.4b[1]\n"
+      "ldr q4, [x11, #0xe0]\n"
+      ".inst 0x4f80e8b0  // sdot v16.4s, v5.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8b4  // sdot v20.4s, v5.16b, v1.4b[2]\n"
+      "ldr q5, [x11, #0xf0]\n"
+      ".inst 0x4f80e8d1  // sdot v17.4s, v6.16b, v0.4b[2]\n"
+      "add x11, x11, #0x100\n"
+      ".inst 0x4f81e8d5  // sdot v21.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f80e8f2  // sdot v18.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8f6  // sdot v22.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f80e913  // sdot v19.4s, v8.16b, v0.4b[2]\n"
+      ".inst 0x4f81e917  // sdot v23.4s, v8.16b, v1.4b[2]\n"
+      ".inst 0x4fa0e930  // sdot v16.4s, v9.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e934  // sdot v20.4s, v9.16b, v1.4b[3]\n"
+      ".inst 0x4fa0e951  // sdot v17.4s, v10.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e955  // sdot v21.4s, v10.16b, v1.4b[3]\n"
+      ".inst 0x4fa0e892  // sdot v18.4s, v4.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e896  // sdot v22.4s, v4.16b, v1.4b[3]\n"
+      ".inst 0x4fa0e8b3  // sdot v19.4s, v5.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8b7  // sdot v23.4s, v5.16b, v1.4b[3]\n"
+      "tbnz %x[flags], #31, 40f\n"
+      ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
+      ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
+      "40:"  // Height 2: Multiply loop: unique 5: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "sub x27, x27, #0x10\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "cmp x27, #0x20\n"
+      "bge 39b\n"
+      "41:"  // Height 2: Multiply loop: Single iteration only
+      "sub x27, x27, #0x10\n"
+      "ldr q0, [x26, #0x0]\n"
+      "ldr q1, [x24, #0x0]\n"
+      "ldr q6, [x11, #0x0]\n"
+      ".inst 0x4f80e0d0  // sdot v16.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x11, #0x10]\n"
+      ".inst 0x4f81e0d4  // sdot v20.4s, v6.16b, v1.4b[0]\n"
+      "ldr q8, [x11, #0x20]\n"
+      "ldr q9, [x11, #0x30]\n"
+      ".inst 0x4f80e0f1  // sdot v17.4s, v7.16b, v0.4b[0]\n"
+      "ldr q10, [x11, #0x40]\n"
+      ".inst 0x4f81e0f5  // sdot v21.4s, v7.16b, v1.4b[0]\n"
+      "ldr q4, [x11, #0x50]\n"
+      ".inst 0x4f80e112  // sdot v18.4s, v8.16b, v0.4b[0]\n"
+      "ldr q5, [x11, #0x60]\n"
+      ".inst 0x4f81e116  // sdot v22.4s, v8.16b, v1.4b[0]\n"
+      "ldr q6, [x11, #0x70]\n"
+      ".inst 0x4f80e133  // sdot v19.4s, v9.16b, v0.4b[0]\n"
+      "ldr q7, [x11, #0x80]\n"
+      ".inst 0x4f81e137  // sdot v23.4s, v9.16b, v1.4b[0]\n"
+      "ldr q8, [x11, #0x90]\n"
+      ".inst 0x4fa0e150  // sdot v16.4s, v10.16b, v0.4b[1]\n"
+      "ldr q9, [x11, #0xa0]\n"
+      ".inst 0x4fa1e154  // sdot v20.4s, v10.16b, v1.4b[1]\n"
+      "ldr q10, [x11, #0xb0]\n"
+      ".inst 0x4fa0e091  // sdot v17.4s, v4.16b, v0.4b[1]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x4fa1e095  // sdot v21.4s, v4.16b, v1.4b[1]\n"
+      "ldr q4, [x11, #0xc0]\n"
+      ".inst 0x4fa0e0b2  // sdot v18.4s, v5.16b, v0.4b[1]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x4fa1e0b6  // sdot v22.4s, v5.16b, v1.4b[1]\n"
+      "ldr q5, [x11, #0xd0]\n"
+      ".inst 0x4fa0e0d3  // sdot v19.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0d7  // sdot v23.4s, v6.16b, v1.4b[1]\n"
+      "ldr q6, [x11, #0xe0]\n"
+      ".inst 0x4f80e8f0  // sdot v16.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8f4  // sdot v20.4s, v7.16b, v1.4b[2]\n"
+      "ldr q7, [x11, #0xf0]\n"
+      ".inst 0x4f80e911  // sdot v17.4s, v8.16b, v0.4b[2]\n"
+      "add x11, x11, #0x100\n"
+      ".inst 0x4f81e915  // sdot v21.4s, v8.16b, v1.4b[2]\n"
+      ".inst 0x4f80e932  // sdot v18.4s, v9.16b, v0.4b[2]\n"
+      ".inst 0x4f81e936  // sdot v22.4s, v9.16b, v1.4b[2]\n"
+      ".inst 0x4f80e953  // sdot v19.4s, v10.16b, v0.4b[2]\n"
+      ".inst 0x4f81e957  // sdot v23.4s, v10.16b, v1.4b[2]\n"
+      ".inst 0x4fa0e890  // sdot v16.4s, v4.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e894  // sdot v20.4s, v4.16b, v1.4b[3]\n"
+      ".inst 0x4fa0e8b1  // sdot v17.4s, v5.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8b5  // sdot v21.4s, v5.16b, v1.4b[3]\n"
+      ".inst 0x4fa0e8d2  // sdot v18.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8d6  // sdot v22.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa0e8f3  // sdot v19.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8f7  // sdot v23.4s, v7.16b, v1.4b[3]\n"
+      "tbnz %x[flags], #31, 42f\n"
+      ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
+      ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
+      "42:"  // Height 2: Multiply loop: unique 6: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "43:"  // Height 2: Multiply loop: Main loop skip
+      "cbz x27, 50f\n"
+      "cmp x27, #0x4\n"
+      "blt 46f\n"
+      "44:"  // Height 2: Multiply loop: Odd block loop
+      "ldr s0, [x26], #0x4\n"
+      "ldr s1, [x24], #0x4\n"
+      "tbnz %x[flags], #31, 45f\n"
+      ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
+      ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
+      "45:"  // Height 2: Multiply loop: unique 7: skip row sum
+      "ldr q8, [x11, #0x0]\n"
+      ".inst 0x4f80e110  // sdot v16.4s, v8.16b, v0.4b[0]\n"
+      "ldr q9, [x11, #0x10]\n"
+      ".inst 0x4f81e114  // sdot v20.4s, v8.16b, v1.4b[0]\n"
+      "ldr q10, [x11, #0x20]\n"
+      "ldr q4, [x11, #0x30]\n"
+      ".inst 0x4f80e131  // sdot v17.4s, v9.16b, v0.4b[0]\n"
+      "sub x27, x27, #0x4\n"
+      ".inst 0x4f81e135  // sdot v21.4s, v9.16b, v1.4b[0]\n"
+      "cmp x27, #0x4\n"
+      ".inst 0x4f80e152  // sdot v18.4s, v10.16b, v0.4b[0]\n"
+      "add x11, x11, #0x40\n"
+      ".inst 0x4f81e156  // sdot v22.4s, v10.16b, v1.4b[0]\n"
+      ".inst 0x4f80e093  // sdot v19.4s, v4.16b, v0.4b[0]\n"
+      ".inst 0x4f81e097  // sdot v23.4s, v4.16b, v1.4b[0]\n"
+      "bge 44b\n"
+      "cbz x27, 50f\n"
+      "46:"  // Height 2: Multiply loop: Skip odd blocks
+      "tbz x27, #1, 47f\n"
+      "ldr h0, [x26], #0x2\n"
+      "ldr h1, [x24], #0x2\n"
+      "tbz x27, #0, 48f\n"
+      "ld1 { v0.b }[2], [x26]\n"
+      "ld1 { v1.b }[2], [x24]\n"
+      "b 48f\n"
+      "47:"  // Height 2: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b0, [x26, #0x0]\n"
+      "ldr b1, [x24, #0x0]\n"
+      "48:"  // Height 2: Multiply loop: Ragged operand read: Done
+      "tbnz %x[flags], #31, 49f\n"
+      ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
+      ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
+      "49:"  // Height 2: Multiply loop: unique 8: skip row sum
+      "ldr q5, [x11, #0x0]\n"
+      ".inst 0x4f80e0b0  // sdot v16.4s, v5.16b, v0.4b[0]\n"
+      "ldr q6, [x11, #0x10]\n"
+      ".inst 0x4f81e0b4  // sdot v20.4s, v5.16b, v1.4b[0]\n"
+      "ldr q7, [x11, #0x20]\n"
+      "ldr q8, [x11, #0x30]\n"
+      ".inst 0x4f80e0d1  // sdot v17.4s, v6.16b, v0.4b[0]\n"
+      "add x11, x11, #0x40\n"
+      ".inst 0x4f81e0d5  // sdot v21.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f80e0f2  // sdot v18.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0f6  // sdot v22.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f80e113  // sdot v19.4s, v8.16b, v0.4b[0]\n"
+      ".inst 0x4f81e117  // sdot v23.4s, v8.16b, v1.4b[0]\n"
+      "50:"  // Height 2: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x28, x28, #0x1\n"
+      "cmp x28, x19\n"
+      "bne 36b\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "tbnz %x[flags], #31, 51f\n"
+      "addp v11.4s, v11.4s, v11.4s\n"
+      "add x19, %x[qp], %[b_offset]\n"
+      "ld1r { v2.4s }, [x19]\n"
+      "addp v12.4s, v12.4s, v12.4s\n"
+      "addp v11.4s, v11.4s, v11.4s\n"
+      "addp v12.4s, v12.4s, v12.4s\n"
+      "neg v2.4s, v2.4s\n"
+      "mul v11.4s, v11.4s, v2.4s\n"
+      "mul v12.4s, v12.4s, v2.4s\n"
+      "51:"  // Height 2: skip row sum fixup
+      "add v16.4s, v16.4s, v11.4s\n"
+      "orr %x[flags], %x[flags], #0x80000000\n"
+      "add v17.4s, v17.4s, v11.4s\n"
+      "ldr q0, [x10, #0x0]\n"
+      "add v18.4s, v18.4s, v11.4s\n"
+      "ldr q1, [x10, #0x10]\n"
+      "add v19.4s, v19.4s, v11.4s\n"
+      "ldr q2, [x10, #0x20]\n"
+      "add v20.4s, v20.4s, v12.4s\n"
+      "ldr q3, [x10, #0x30]\n"
+      "add v21.4s, v21.4s, v12.4s\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "add v22.4s, v22.4s, v12.4s\n"
+      "add x19, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x19]\n"
+      "add v23.4s, v23.4s, v12.4s\n"
+      "add x10, x10, #0x40\n"
+      "add v16.4s, v16.4s, v0.4s\n"
+      "add v17.4s, v17.4s, v1.4s\n"
+      "add v18.4s, v18.4s, v2.4s\n"
+      "add v19.4s, v19.4s, v3.4s\n"
+      "add v20.4s, v20.4s, v0.4s\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add v21.4s, v21.4s, v1.4s\n"
+      "add v22.4s, v22.4s, v2.4s\n"
+      "add v23.4s, v23.4s, v3.4s\n"
+      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+      "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+      "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+      "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+      "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+      "tbz %x[flags], #5, 52f\n"
+      "and v4.16b, v16.16b, v0.16b\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "and v5.16b, v17.16b, v0.16b\n"
+      "and v6.16b, v18.16b, v0.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "and v7.16b, v19.16b, v0.16b\n"
+      "and v8.16b, v20.16b, v0.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "and v9.16b, v21.16b, v0.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v4.4s\n"
+      "and v10.16b, v22.16b, v0.16b\n"
+      "sshr v8.4s, v8.4s, #0x1f\n"
+      "and v4.16b, v23.16b, v0.16b\n"
+      "sshr v9.4s, v9.4s, #0x1f\n"
+      "sqadd v17.4s, v17.4s, v5.4s\n"
+      "sshr v10.4s, v10.4s, #0x1f\n"
+      "sqadd v18.4s, v18.4s, v6.4s\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sqadd v19.4s, v19.4s, v7.4s\n"
+      "sqadd v20.4s, v20.4s, v8.4s\n"
+      "sqadd v21.4s, v21.4s, v9.4s\n"
+      "sqadd v22.4s, v22.4s, v10.4s\n"
+      "sqadd v23.4s, v23.4s, v4.4s\n"
+      "52:"  // Height 2: no shift correction
+      "srshl v16.4s, v16.4s, v0.4s\n"
+      "add x19, %x[qp], %[c_offset]\n"
+      "ld1r { v4.4s }, [x19]\n"
+      "srshl v17.4s, v17.4s, v0.4s\n"
+      "add x19, %x[qp], %[minval]\n"
+      "srshl v18.4s, v18.4s, v0.4s\n"
+      "ld1r { v5.4s }, [x19]\n"
+      "add x19, %x[qp], %[maxval]\n"
+      "srshl v19.4s, v19.4s, v0.4s\n"
+      "ld1r { v6.4s }, [x19]\n"
+      "cmp x12, #0x10\n"
+      "srshl v20.4s, v20.4s, v0.4s\n"
+      "srshl v21.4s, v21.4s, v0.4s\n"
+      "srshl v22.4s, v22.4s, v0.4s\n"
+      "srshl v23.4s, v23.4s, v0.4s\n"
+      "add v16.4s, v16.4s, v4.4s\n"
+      "add v17.4s, v17.4s, v4.4s\n"
+      "add v18.4s, v18.4s, v4.4s\n"
+      "smin v16.4s, v16.4s, v6.4s\n"
+      "smin v17.4s, v17.4s, v6.4s\n"
+      "smin v18.4s, v18.4s, v6.4s\n"
+      "smax v16.4s, v16.4s, v5.4s\n"
+      "smax v17.4s, v17.4s, v5.4s\n"
+      "smax v18.4s, v18.4s, v5.4s\n"
+      "add v19.4s, v19.4s, v4.4s\n"
+      "add v20.4s, v20.4s, v4.4s\n"
+      "add v21.4s, v21.4s, v4.4s\n"
+      "smin v19.4s, v19.4s, v6.4s\n"
+      "smin v20.4s, v20.4s, v6.4s\n"
+      "smin v21.4s, v21.4s, v6.4s\n"
+      "smax v19.4s, v19.4s, v5.4s\n"
+      "smax v20.4s, v20.4s, v5.4s\n"
+      "smax v21.4s, v21.4s, v5.4s\n"
+      "add v22.4s, v22.4s, v4.4s\n"
+      "add v23.4s, v23.4s, v4.4s\n"
+      "uzp1 v16.8h, v16.8h, v17.8h\n"
+      "smin v22.4s, v22.4s, v6.4s\n"
+      "smin v23.4s, v23.4s, v6.4s\n"
+      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "smax v22.4s, v22.4s, v5.4s\n"
+      "smax v23.4s, v23.4s, v5.4s\n"
+      "uzp1 v20.8h, v20.8h, v21.8h\n"
+      "uzp1 v16.16b, v16.16b, v17.16b\n"
+      "uzp1 v21.8h, v22.8h, v23.8h\n"
+      "uzp1 v20.16b, v20.16b, v21.16b\n"
+      "bge 61f\n"
+      "tbz x12, #3, 56f\n"
+      "str d16, [x9], #0x8\n"
+      "str d20, [x25], #0x8\n"
+      "tbz x12, #2, 54f\n"
+      "st1 { v16.s }[2], [x9], #0x4\n"
+      "st1 { v20.s }[2], [x25], #0x4\n"
+      "tbz x12, #1, 53f\n"
+      "st1 { v16.h }[6], [x9], #0x2\n"
+      "st1 { v20.h }[6], [x25], #0x2\n"
+      "tbz x12, #0, 60f\n"
+      "st1 { v16.b }[14], [x9]\n"
+      "st1 { v20.b }[14], [x25]\n"
+      "b 60f\n"
+      "53:"  // Height 2: Partial direct writeback: partial_1_12
+      "tbz x12, #0, 60f\n"
+      "st1 { v16.b }[12], [x9]\n"
+      "st1 { v20.b }[12], [x25]\n"
+      "b 60f\n"
+      "54:"  // Height 2: Partial direct writeback: partial_2_8
+      "tbz x12, #1, 55f\n"
+      "st1 { v16.h }[4], [x9], #0x2\n"
+      "st1 { v20.h }[4], [x25], #0x2\n"
+      "tbz x12, #0, 60f\n"
+      "st1 { v16.b }[10], [x9]\n"
+      "st1 { v20.b }[10], [x25]\n"
+      "b 60f\n"
+      "55:"  // Height 2: Partial direct writeback: partial_1_8
+      "tbz x12, #0, 60f\n"
+      "st1 { v16.b }[8], [x9]\n"
+      "st1 { v20.b }[8], [x25]\n"
+      "b 60f\n"
+      "56:"  // Height 2: Partial direct writeback: partial_4_0
+      "tbz x12, #2, 58f\n"
+      "str s16, [x9], #0x4\n"
+      "str s20, [x25], #0x4\n"
+      "tbz x12, #1, 57f\n"
+      "st1 { v16.h }[2], [x9], #0x2\n"
+      "st1 { v20.h }[2], [x25], #0x2\n"
+      "tbz x12, #0, 60f\n"
+      "st1 { v16.b }[6], [x9]\n"
+      "st1 { v20.b }[6], [x25]\n"
+      "b 60f\n"
+      "57:"  // Height 2: Partial direct writeback: partial_1_4
+      "tbz x12, #0, 60f\n"
+      "st1 { v16.b }[4], [x9]\n"
+      "st1 { v20.b }[4], [x25]\n"
+      "b 60f\n"
+      "58:"  // Height 2: Partial direct writeback: partial_2_0
+      "tbz x12, #1, 59f\n"
+      "str h16, [x9], #0x2\n"
+      "str h20, [x25], #0x2\n"
+      "tbz x12, #0, 60f\n"
+      "st1 { v16.b }[2], [x9]\n"
+      "st1 { v20.b }[2], [x25]\n"
+      "b 60f\n"
+      "59:"  // Height 2: Partial direct writeback: partial_1_0
+      "str b16, [x9, #0x0]\n"
+      "str b20, [x25, #0x0]\n"
+      "60:"  // Height 2: Partial direct writeback: Done
+      "b 62f\n"
+      "61:"  // Height 2: Full writeback
+      "str q16, [x9, #0x0]\n"
+      "str q20, [x25, #0x0]\n"
+      "add x9, x9, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "62:"  // Height 2: Writeback done
+      "subs x12, x12, #0x10\n"
+      "bgt 34b\n"
+      "b 126f\n"
+      "63:"  // Height 3
+      "movi v11.4s, #0x0\n"
+      "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x10, %x[col_bias]\n"
+      "movi v12.4s, #0x0\n"
+      "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "bic %x[flags], %x[flags], #0x80000000\n"
+      "movi v13.4s, #0x0\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "movi v14.4s, #0x0\n"
+      "movi v15.16b, #0x1\n"
+      "tbz %x[flags], #2, 64f\n"
+      "ldr x9, [%x[output_ptr], #0x0]\n"
+      "ldr x25, [%x[output_ptr], #0x8]\n"
+      "add x9, x9, x19\n"
+      "ldr x23, [%x[output_ptr], #0x10]\n"
+      "add x25, x25, x19\n"
+      "add x23, x23, x19\n"
+      "b 65f\n"
+      "64:"  // Height 3: setup direct output
+      "mov x9, %x[output_ptr]\n"
+      "add x25, x9, x19\n"
+      "add x23, x25, x19\n"
+      "65:"  // Height 3: Column loop
+      "movi v16.4s, #0x0\n"
+      "movi v17.4s, #0x0\n"
+      "movi v18.4s, #0x0\n"
+      "movi v19.4s, #0x0\n"
+      "movi v20.4s, #0x0\n"
+      "movi v21.4s, #0x0\n"
+      "movi v22.4s, #0x0\n"
+      "movi v23.4s, #0x0\n"
+      "movi v24.4s, #0x0\n"
+      "movi v25.4s, #0x0\n"
+      "movi v26.4s, #0x0\n"
+      "movi v27.4s, #0x0\n"
+      "66:"  // Height 3: setup done
+      "mov x28, #0x0\n"
+      "67:"  // Height 3: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w27, [x20, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 68f\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x24, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
+      "cbnz x28, 69f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "add x22, x22, x19\n"
+      "b 69f\n"
+      "68:"  // Height 3: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "add x24, x26, x19\n"
+      "add x22, x24, x19\n"
+      "69:"  // Height 3: input setup done
+      "cmp x27, #0x10\n"
+      "blt 74f\n"
+      "cmp x27, #0x20\n"
+      "blt 72f\n"
+      "70:"  // Height 3: Multiply loop: Main loop head
+      "ldr q0, [x26, #0x0]\n"
+      "ldr q1, [x24, #0x0]\n"
+      "ldr q2, [x22, #0x0]\n"
+      "ldr q4, [x11, #0x0]\n"
+      ".inst 0x4f80e090  // sdot v16.4s, v4.16b, v0.4b[0]\n"
+      "ldr q5, [x11, #0x10]\n"
+      ".inst 0x4f81e094  // sdot v20.4s, v4.16b, v1.4b[0]\n"
+      "ldr q6, [x11, #0x20]\n"
+      ".inst 0x4f82e098  // sdot v24.4s, v4.16b, v2.4b[0]\n"
+      "ldr q7, [x11, #0x30]\n"
+      "ldr q8, [x11, #0x40]\n"
+      ".inst 0x4f80e0b1  // sdot v17.4s, v5.16b, v0.4b[0]\n"
+      "ldr q9, [x11, #0x50]\n"
+      ".inst 0x4f81e0b5  // sdot v21.4s, v5.16b, v1.4b[0]\n"
+      "ldr q10, [x11, #0x60]\n"
+      ".inst 0x4f82e0b9  // sdot v25.4s, v5.16b, v2.4b[0]\n"
+      "ldr q4, [x11, #0x70]\n"
+      ".inst 0x4f80e0d2  // sdot v18.4s, v6.16b, v0.4b[0]\n"
+      "ldr q5, [x11, #0x80]\n"
+      ".inst 0x4f81e0d6  // sdot v22.4s, v6.16b, v1.4b[0]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x4f82e0da  // sdot v26.4s, v6.16b, v2.4b[0]\n"
+      "ldr q6, [x11, #0x90]\n"
+      ".inst 0x4f80e0f3  // sdot v19.4s, v7.16b, v0.4b[0]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x4f81e0f7  // sdot v23.4s, v7.16b, v1.4b[0]\n"
+      "add x22, x22, #0x10\n"
+      ".inst 0x4f82e0fb  // sdot v27.4s, v7.16b, v2.4b[0]\n"
+      "ldr q7, [x11, #0xa0]\n"
+      ".inst 0x4fa0e110  // sdot v16.4s, v8.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e114  // sdot v20.4s, v8.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e118  // sdot v24.4s, v8.16b, v2.4b[1]\n"
+      "ldr q8, [x11, #0xb0]\n"
+      ".inst 0x4fa0e131  // sdot v17.4s, v9.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e135  // sdot v21.4s, v9.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e139  // sdot v25.4s, v9.16b, v2.4b[1]\n"
+      "ldr q9, [x11, #0xc0]\n"
+      ".inst 0x4fa0e152  // sdot v18.4s, v10.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e156  // sdot v22.4s, v10.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e15a  // sdot v26.4s, v10.16b, v2.4b[1]\n"
+      "ldr q10, [x11, #0xd0]\n"
+      ".inst 0x4fa0e093  // sdot v19.4s, v4.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e097  // sdot v23.4s, v4.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e09b  // sdot v27.4s, v4.16b, v2.4b[1]\n"
+      "ldr q4, [x11, #0xe0]\n"
+      ".inst 0x4f80e8b0  // sdot v16.4s, v5.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8b4  // sdot v20.4s, v5.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8b8  // sdot v24.4s, v5.16b, v2.4b[2]\n"
+      "ldr q5, [x11, #0xf0]\n"
+      ".inst 0x4f80e8d1  // sdot v17.4s, v6.16b, v0.4b[2]\n"
+      "add x11, x11, #0x100\n"
+      ".inst 0x4f81e8d5  // sdot v21.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d9  // sdot v25.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x4f80e8f2  // sdot v18.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8f6  // sdot v22.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8fa  // sdot v26.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f80e913  // sdot v19.4s, v8.16b, v0.4b[2]\n"
+      ".inst 0x4f81e917  // sdot v23.4s, v8.16b, v1.4b[2]\n"
+      ".inst 0x4f82e91b  // sdot v27.4s, v8.16b, v2.4b[2]\n"
+      ".inst 0x4fa0e930  // sdot v16.4s, v9.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e934  // sdot v20.4s, v9.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e938  // sdot v24.4s, v9.16b, v2.4b[3]\n"
+      ".inst 0x4fa0e951  // sdot v17.4s, v10.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e955  // sdot v21.4s, v10.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e959  // sdot v25.4s, v10.16b, v2.4b[3]\n"
+      ".inst 0x4fa0e892  // sdot v18.4s, v4.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e896  // sdot v22.4s, v4.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e89a  // sdot v26.4s, v4.16b, v2.4b[3]\n"
+      ".inst 0x4fa0e8b3  // sdot v19.4s, v5.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8b7  // sdot v23.4s, v5.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8bb  // sdot v27.4s, v5.16b, v2.4b[3]\n"
+      "tbnz %x[flags], #31, 71f\n"
+      ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
+      ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
+      ".inst 0x4e8f944d  // sdot v13.4s, v2.16b, v15.16b\n"
+      "71:"  // Height 3: Multiply loop: unique 9: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "sub x27, x27, #0x10\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "cmp x27, #0x20\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "bge 70b\n"
+      "72:"  // Height 3: Multiply loop: Single iteration only
+      "sub x27, x27, #0x10\n"
+      "ldr q0, [x26, #0x0]\n"
+      "ldr q1, [x24, #0x0]\n"
+      "ldr q2, [x22, #0x0]\n"
+      "ldr q6, [x11, #0x0]\n"
+      ".inst 0x4f80e0d0  // sdot v16.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x11, #0x10]\n"
+      ".inst 0x4f81e0d4  // sdot v20.4s, v6.16b, v1.4b[0]\n"
+      "ldr q8, [x11, #0x20]\n"
+      ".inst 0x4f82e0d8  // sdot v24.4s, v6.16b, v2.4b[0]\n"
+      "ldr q9, [x11, #0x30]\n"
+      "ldr q10, [x11, #0x40]\n"
+      ".inst 0x4f80e0f1  // sdot v17.4s, v7.16b, v0.4b[0]\n"
+      "ldr q4, [x11, #0x50]\n"
+      ".inst 0x4f81e0f5  // sdot v21.4s, v7.16b, v1.4b[0]\n"
+      "ldr q5, [x11, #0x60]\n"
+      ".inst 0x4f82e0f9  // sdot v25.4s, v7.16b, v2.4b[0]\n"
+      "ldr q6, [x11, #0x70]\n"
+      ".inst 0x4f80e112  // sdot v18.4s, v8.16b, v0.4b[0]\n"
+      "ldr q7, [x11, #0x80]\n"
+      ".inst 0x4f81e116  // sdot v22.4s, v8.16b, v1.4b[0]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x4f82e11a  // sdot v26.4s, v8.16b, v2.4b[0]\n"
+      "ldr q8, [x11, #0x90]\n"
+      ".inst 0x4f80e133  // sdot v19.4s, v9.16b, v0.4b[0]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x4f81e137  // sdot v23.4s, v9.16b, v1.4b[0]\n"
+      "add x22, x22, #0x10\n"
+      ".inst 0x4f82e13b  // sdot v27.4s, v9.16b, v2.4b[0]\n"
+      "ldr q9, [x11, #0xa0]\n"
+      ".inst 0x4fa0e150  // sdot v16.4s, v10.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e154  // sdot v20.4s, v10.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e158  // sdot v24.4s, v10.16b, v2.4b[1]\n"
+      "ldr q10, [x11, #0xb0]\n"
+      ".inst 0x4fa0e091  // sdot v17.4s, v4.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e095  // sdot v21.4s, v4.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e099  // sdot v25.4s, v4.16b, v2.4b[1]\n"
+      "ldr q4, [x11, #0xc0]\n"
+      ".inst 0x4fa0e0b2  // sdot v18.4s, v5.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0b6  // sdot v22.4s, v5.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0ba  // sdot v26.4s, v5.16b, v2.4b[1]\n"
+      "ldr q5, [x11, #0xd0]\n"
+      ".inst 0x4fa0e0d3  // sdot v19.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0d7  // sdot v23.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0db  // sdot v27.4s, v6.16b, v2.4b[1]\n"
+      "ldr q6, [x11, #0xe0]\n"
+      ".inst 0x4f80e8f0  // sdot v16.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8f4  // sdot v20.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f8  // sdot v24.4s, v7.16b, v2.4b[2]\n"
+      "ldr q7, [x11, #0xf0]\n"
+      ".inst 0x4f80e911  // sdot v17.4s, v8.16b, v0.4b[2]\n"
+      "add x11, x11, #0x100\n"
+      ".inst 0x4f81e915  // sdot v21.4s, v8.16b, v1.4b[2]\n"
+      ".inst 0x4f82e919  // sdot v25.4s, v8.16b, v2.4b[2]\n"
+      ".inst 0x4f80e932  // sdot v18.4s, v9.16b, v0.4b[2]\n"
+      ".inst 0x4f81e936  // sdot v22.4s, v9.16b, v1.4b[2]\n"
+      ".inst 0x4f82e93a  // sdot v26.4s, v9.16b, v2.4b[2]\n"
+      ".inst 0x4f80e953  // sdot v19.4s, v10.16b, v0.4b[2]\n"
+      ".inst 0x4f81e957  // sdot v23.4s, v10.16b, v1.4b[2]\n"
+      ".inst 0x4f82e95b  // sdot v27.4s, v10.16b, v2.4b[2]\n"
+      ".inst 0x4fa0e890  // sdot v16.4s, v4.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e894  // sdot v20.4s, v4.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e898  // sdot v24.4s, v4.16b, v2.4b[3]\n"
+      ".inst 0x4fa0e8b1  // sdot v17.4s, v5.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8b5  // sdot v21.4s, v5.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8b9  // sdot v25.4s, v5.16b, v2.4b[3]\n"
+      ".inst 0x4fa0e8d2  // sdot v18.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8d6  // sdot v22.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8da  // sdot v26.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa0e8f3  // sdot v19.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8f7  // sdot v23.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8fb  // sdot v27.4s, v7.16b, v2.4b[3]\n"
+      "tbnz %x[flags], #31, 73f\n"
+      ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
+      ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
+      ".inst 0x4e8f944d  // sdot v13.4s, v2.16b, v15.16b\n"
+      "73:"  // Height 3: Multiply loop: unique 10: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "74:"  // Height 3: Multiply loop: Main loop skip
+      "cbz x27, 81f\n"
+      "cmp x27, #0x4\n"
+      "blt 77f\n"
+      "75:"  // Height 3: Multiply loop: Odd block loop
+      "ldr s0, [x26], #0x4\n"
+      "ldr s1, [x24], #0x4\n"
+      "ldr s2, [x22], #0x4\n"
+      "tbnz %x[flags], #31, 76f\n"
+      ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
+      ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
+      ".inst 0x4e8f944d  // sdot v13.4s, v2.16b, v15.16b\n"
+      "76:"  // Height 3: Multiply loop: unique 11: skip row sum
+      "ldr q8, [x11, #0x0]\n"
+      ".inst 0x4f80e110  // sdot v16.4s, v8.16b, v0.4b[0]\n"
+      "ldr q9, [x11, #0x10]\n"
+      ".inst 0x4f81e114  // sdot v20.4s, v8.16b, v1.4b[0]\n"
+      "ldr q10, [x11, #0x20]\n"
+      ".inst 0x4f82e118  // sdot v24.4s, v8.16b, v2.4b[0]\n"
+      "ldr q4, [x11, #0x30]\n"
+      "sub x27, x27, #0x4\n"
+      ".inst 0x4f80e131  // sdot v17.4s, v9.16b, v0.4b[0]\n"
+      "cmp x27, #0x4\n"
+      ".inst 0x4f81e135  // sdot v21.4s, v9.16b, v1.4b[0]\n"
+      "add x11, x11, #0x40\n"
+      ".inst 0x4f82e139  // sdot v25.4s, v9.16b, v2.4b[0]\n"
+      ".inst 0x4f80e152  // sdot v18.4s, v10.16b, v0.4b[0]\n"
+      ".inst 0x4f81e156  // sdot v22.4s, v10.16b, v1.4b[0]\n"
+      ".inst 0x4f82e15a  // sdot v26.4s, v10.16b, v2.4b[0]\n"
+      ".inst 0x4f80e093  // sdot v19.4s, v4.16b, v0.4b[0]\n"
+      ".inst 0x4f81e097  // sdot v23.4s, v4.16b, v1.4b[0]\n"
+      ".inst 0x4f82e09b  // sdot v27.4s, v4.16b, v2.4b[0]\n"
+      "bge 75b\n"
+      "cbz x27, 81f\n"
+      "77:"  // Height 3: Multiply loop: Skip odd blocks
+      "tbz x27, #1, 78f\n"
+      "ldr h0, [x26], #0x2\n"
+      "ldr h1, [x24], #0x2\n"
+      "ldr h2, [x22], #0x2\n"
+      "tbz x27, #0, 79f\n"
+      "ld1 { v0.b }[2], [x26]\n"
+      "ld1 { v1.b }[2], [x24]\n"
+      "ld1 { v2.b }[2], [x22]\n"
+      "b 79f\n"
+      "78:"  // Height 3: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b0, [x26, #0x0]\n"
+      "ldr b1, [x24, #0x0]\n"
+      "ldr b2, [x22, #0x0]\n"
+      "79:"  // Height 3: Multiply loop: Ragged operand read: Done
+      "tbnz %x[flags], #31, 80f\n"
+      ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
+      ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
+      ".inst 0x4e8f944d  // sdot v13.4s, v2.16b, v15.16b\n"
+      "80:"  // Height 3: Multiply loop: unique 12: skip row sum
+      "ldr q5, [x11, #0x0]\n"
+      ".inst 0x4f80e0b0  // sdot v16.4s, v5.16b, v0.4b[0]\n"
+      "ldr q6, [x11, #0x10]\n"
+      ".inst 0x4f81e0b4  // sdot v20.4s, v5.16b, v1.4b[0]\n"
+      "ldr q7, [x11, #0x20]\n"
+      ".inst 0x4f82e0b8  // sdot v24.4s, v5.16b, v2.4b[0]\n"
+      "ldr q8, [x11, #0x30]\n"
+      "add x11, x11, #0x40\n"
+      ".inst 0x4f80e0d1  // sdot v17.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0d5  // sdot v21.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d9  // sdot v25.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f80e0f2  // sdot v18.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0f6  // sdot v22.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0fa  // sdot v26.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f80e113  // sdot v19.4s, v8.16b, v0.4b[0]\n"
+      ".inst 0x4f81e117  // sdot v23.4s, v8.16b, v1.4b[0]\n"
+      ".inst 0x4f82e11b  // sdot v27.4s, v8.16b, v2.4b[0]\n"
+      "81:"  // Height 3: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x28, x28, #0x1\n"
+      "cmp x28, x19\n"
+      "bne 67b\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "tbnz %x[flags], #31, 82f\n"
+      "addp v11.4s, v11.4s, v11.4s\n"
+      "add x19, %x[qp], %[b_offset]\n"
+      "ld1r { v3.4s }, [x19]\n"
+      "addp v12.4s, v12.4s, v12.4s\n"
+      "addp v13.4s, v13.4s, v13.4s\n"
+      "addp v11.4s, v11.4s, v11.4s\n"
+      "addp v12.4s, v12.4s, v12.4s\n"
+      "addp v13.4s, v13.4s, v13.4s\n"
+      "neg v3.4s, v3.4s\n"
+      "mul v11.4s, v11.4s, v3.4s\n"
+      "mul v12.4s, v12.4s, v3.4s\n"
+      "mul v13.4s, v13.4s, v3.4s\n"
+      "82:"  // Height 3: skip row sum fixup
+      "add v16.4s, v16.4s, v11.4s\n"
+      "orr %x[flags], %x[flags], #0x80000000\n"
+      "add v17.4s, v17.4s, v11.4s\n"
+      "ldr q0, [x10, #0x0]\n"
+      "add v18.4s, v18.4s, v11.4s\n"
+      "ldr q1, [x10, #0x10]\n"
+      "add v19.4s, v19.4s, v11.4s\n"
+      "ldr q2, [x10, #0x20]\n"
+      "add v20.4s, v20.4s, v12.4s\n"
+      "ldr q3, [x10, #0x30]\n"
+      "add v21.4s, v21.4s, v12.4s\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "add v22.4s, v22.4s, v12.4s\n"
+      "add x19, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x19]\n"
+      "add v23.4s, v23.4s, v12.4s\n"
+      "add x10, x10, #0x40\n"
+      "add v24.4s, v24.4s, v13.4s\n"
+      "add v25.4s, v25.4s, v13.4s\n"
+      "add v26.4s, v26.4s, v13.4s\n"
+      "add v27.4s, v27.4s, v13.4s\n"
+      "add v16.4s, v16.4s, v0.4s\n"
+      "add v17.4s, v17.4s, v1.4s\n"
+      "add v18.4s, v18.4s, v2.4s\n"
+      "add v19.4s, v19.4s, v3.4s\n"
+      "add v20.4s, v20.4s, v0.4s\n"
+      "add v21.4s, v21.4s, v1.4s\n"
+      "add v22.4s, v22.4s, v2.4s\n"
+      "add v23.4s, v23.4s, v3.4s\n"
+      "add v24.4s, v24.4s, v0.4s\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add v25.4s, v25.4s, v1.4s\n"
+      "add v26.4s, v26.4s, v2.4s\n"
+      "add v27.4s, v27.4s, v3.4s\n"
+      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+      "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+      "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+      "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+      "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+      "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+      "sqrdmulh v25.4s, v25.4s, v4.4s\n"
+      "sqrdmulh v26.4s, v26.4s, v4.4s\n"
+      "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+      "tbz %x[flags], #5, 83f\n"
+      "and v4.16b, v16.16b, v0.16b\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "and v5.16b, v17.16b, v0.16b\n"
+      "and v6.16b, v18.16b, v0.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "and v7.16b, v19.16b, v0.16b\n"
+      "and v8.16b, v20.16b, v0.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "and v9.16b, v21.16b, v0.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v4.4s\n"
+      "and v10.16b, v22.16b, v0.16b\n"
+      "sshr v8.4s, v8.4s, #0x1f\n"
+      "and v4.16b, v23.16b, v0.16b\n"
+      "sshr v9.4s, v9.4s, #0x1f\n"
+      "sqadd v17.4s, v17.4s, v5.4s\n"
+      "sshr v10.4s, v10.4s, #0x1f\n"
+      "sqadd v18.4s, v18.4s, v6.4s\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "and v5.16b, v24.16b, v0.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sqadd v19.4s, v19.4s, v7.4s\n"
+      "sqadd v20.4s, v20.4s, v8.4s\n"
+      "sqadd v21.4s, v21.4s, v9.4s\n"
+      "sqadd v22.4s, v22.4s, v10.4s\n"
+      "sqadd v23.4s, v23.4s, v4.4s\n"
+      "and v6.16b, v25.16b, v0.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v24.4s, v24.4s, v5.4s\n"
+      "and v7.16b, v26.16b, v0.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "and v8.16b, v27.16b, v0.16b\n"
+      "sqadd v25.4s, v25.4s, v6.4s\n"
+      "sshr v8.4s, v8.4s, #0x1f\n"
+      "sqadd v26.4s, v26.4s, v7.4s\n"
+      "sqadd v27.4s, v27.4s, v8.4s\n"
+      "83:"  // Height 3: no shift correction
+      "srshl v16.4s, v16.4s, v0.4s\n"
+      "add x19, %x[qp], %[c_offset]\n"
+      "ld1r { v4.4s }, [x19]\n"
+      "srshl v17.4s, v17.4s, v0.4s\n"
+      "add x19, %x[qp], %[minval]\n"
+      "srshl v18.4s, v18.4s, v0.4s\n"
+      "ld1r { v5.4s }, [x19]\n"
+      "add x19, %x[qp], %[maxval]\n"
+      "srshl v19.4s, v19.4s, v0.4s\n"
+      "ld1r { v6.4s }, [x19]\n"
+      "cmp x12, #0x10\n"
+      "srshl v20.4s, v20.4s, v0.4s\n"
+      "srshl v21.4s, v21.4s, v0.4s\n"
+      "srshl v22.4s, v22.4s, v0.4s\n"
+      "srshl v23.4s, v23.4s, v0.4s\n"
+      "add v16.4s, v16.4s, v4.4s\n"
+      "add v17.4s, v17.4s, v4.4s\n"
+      "add v18.4s, v18.4s, v4.4s\n"
+      "smin v16.4s, v16.4s, v6.4s\n"
+      "smin v17.4s, v17.4s, v6.4s\n"
+      "smin v18.4s, v18.4s, v6.4s\n"
+      "smax v16.4s, v16.4s, v5.4s\n"
+      "smax v17.4s, v17.4s, v5.4s\n"
+      "smax v18.4s, v18.4s, v5.4s\n"
+      "add v19.4s, v19.4s, v4.4s\n"
+      "add v20.4s, v20.4s, v4.4s\n"
+      "add v21.4s, v21.4s, v4.4s\n"
+      "smin v19.4s, v19.4s, v6.4s\n"
+      "smin v20.4s, v20.4s, v6.4s\n"
+      "smin v21.4s, v21.4s, v6.4s\n"
+      "smax v19.4s, v19.4s, v5.4s\n"
+      "smax v20.4s, v20.4s, v5.4s\n"
+      "smax v21.4s, v21.4s, v5.4s\n"
+      "add v22.4s, v22.4s, v4.4s\n"
+      "add v23.4s, v23.4s, v4.4s\n"
+      "srshl v24.4s, v24.4s, v0.4s\n"
+      "smin v22.4s, v22.4s, v6.4s\n"
+      "smin v23.4s, v23.4s, v6.4s\n"
+      "srshl v25.4s, v25.4s, v0.4s\n"
+      "smax v22.4s, v22.4s, v5.4s\n"
+      "smax v23.4s, v23.4s, v5.4s\n"
+      "add v24.4s, v24.4s, v4.4s\n"
+      "add v25.4s, v25.4s, v4.4s\n"
+      "srshl v26.4s, v26.4s, v0.4s\n"
+      "smin v24.4s, v24.4s, v6.4s\n"
+      "smin v25.4s, v25.4s, v6.4s\n"
+      "srshl v27.4s, v27.4s, v0.4s\n"
+      "smax v24.4s, v24.4s, v5.4s\n"
+      "smax v25.4s, v25.4s, v5.4s\n"
+      "add v26.4s, v26.4s, v4.4s\n"
+      "add v27.4s, v27.4s, v4.4s\n"
+      "uzp1 v16.8h, v16.8h, v17.8h\n"
+      "smin v26.4s, v26.4s, v6.4s\n"
+      "smin v27.4s, v27.4s, v6.4s\n"
+      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "smax v26.4s, v26.4s, v5.4s\n"
+      "smax v27.4s, v27.4s, v5.4s\n"
+      "uzp1 v20.8h, v20.8h, v21.8h\n"
+      "uzp1 v21.8h, v22.8h, v23.8h\n"
+      "uzp1 v24.8h, v24.8h, v25.8h\n"
+      "uzp1 v25.8h, v26.8h, v27.8h\n"
+      "uzp1 v16.16b, v16.16b, v17.16b\n"
+      "uzp1 v20.16b, v20.16b, v21.16b\n"
+      "uzp1 v24.16b, v24.16b, v25.16b\n"
+      "bge 92f\n"  // uses the flags of the earlier "cmp x12, #0x10" (only vector ops in between): >= 16 columns left -> full store
+      "tbz x12, #3, 87f\n"  // partial store: x12 < 16 here, so its bits 3..0 give the byte count; bit 3 clear -> fewer than 8
+      "str d16, [x9], #0x8\n"  // bytes 0..7 of each row; v16/v20/v24 go to row pointers x9/x25/x23
+      "str d20, [x25], #0x8\n"
+      "str d24, [x23], #0x8\n"
+      "tbz x12, #2, 85f\n"  // bit 2 clear -> fewer than 4 bytes remain after the first 8
+      "st1 { v16.s }[2], [x9], #0x4\n"  // bytes 8..11
+      "st1 { v20.s }[2], [x25], #0x4\n"
+      "st1 { v24.s }[2], [x23], #0x4\n"
+      "tbz x12, #1, 84f\n"
+      "st1 { v16.h }[6], [x9], #0x2\n"  // bytes 12..13
+      "st1 { v20.h }[6], [x25], #0x2\n"
+      "st1 { v24.h }[6], [x23], #0x2\n"
+      "tbz x12, #0, 91f\n"
+      "st1 { v16.b }[14], [x9]\n"  // byte 14 (15 columns in total)
+      "st1 { v20.b }[14], [x25]\n"
+      "st1 { v24.b }[14], [x23]\n"
+      "b 91f\n"
+      "84:"  // Height 3: Partial direct writeback: partial_1_12 (12 bytes already stored, optional 13th)
+      "tbz x12, #0, 91f\n"
+      "st1 { v16.b }[12], [x9]\n"
+      "st1 { v20.b }[12], [x25]\n"
+      "st1 { v24.b }[12], [x23]\n"
+      "b 91f\n"
+      "85:"  // Height 3: Partial direct writeback: partial_2_8 (8 bytes already stored, up to 3 more)
+      "tbz x12, #1, 86f\n"
+      "st1 { v16.h }[4], [x9], #0x2\n"  // bytes 8..9
+      "st1 { v20.h }[4], [x25], #0x2\n"
+      "st1 { v24.h }[4], [x23], #0x2\n"
+      "tbz x12, #0, 91f\n"
+      "st1 { v16.b }[10], [x9]\n"  // byte 10
+      "st1 { v20.b }[10], [x25]\n"
+      "st1 { v24.b }[10], [x23]\n"
+      "b 91f\n"
+      "86:"  // Height 3: Partial direct writeback: partial_1_8 (8 bytes already stored, optional 9th)
+      "tbz x12, #0, 91f\n"
+      "st1 { v16.b }[8], [x9]\n"
+      "st1 { v20.b }[8], [x25]\n"
+      "st1 { v24.b }[8], [x23]\n"
+      "b 91f\n"
+      "87:"  // Height 3: Partial direct writeback: partial_4_0 (fewer than 8 bytes in total)
+      "tbz x12, #2, 89f\n"  // bit 2 clear -> fewer than 4 bytes in total
+      "str s16, [x9], #0x4\n"  // bytes 0..3
+      "str s20, [x25], #0x4\n"
+      "str s24, [x23], #0x4\n"
+      "tbz x12, #1, 88f\n"
+      "st1 { v16.h }[2], [x9], #0x2\n"  // bytes 4..5
+      "st1 { v20.h }[2], [x25], #0x2\n"
+      "st1 { v24.h }[2], [x23], #0x2\n"
+      "tbz x12, #0, 91f\n"
+      "st1 { v16.b }[6], [x9]\n"  // byte 6
+      "st1 { v20.b }[6], [x25]\n"
+      "st1 { v24.b }[6], [x23]\n"
+      "b 91f\n"
+      "88:"  // Height 3: Partial direct writeback: partial_1_4 (4 bytes already stored, optional 5th)
+      "tbz x12, #0, 91f\n"
+      "st1 { v16.b }[4], [x9]\n"
+      "st1 { v20.b }[4], [x25]\n"
+      "st1 { v24.b }[4], [x23]\n"
+      "b 91f\n"
+      "89:"  // Height 3: Partial direct writeback: partial_2_0 (fewer than 4 bytes in total)
+      "tbz x12, #1, 90f\n"
+      "str h16, [x9], #0x2\n"  // bytes 0..1
+      "str h20, [x25], #0x2\n"
+      "str h24, [x23], #0x2\n"
+      "tbz x12, #0, 91f\n"
+      "st1 { v16.b }[2], [x9]\n"  // byte 2
+      "st1 { v20.b }[2], [x25]\n"
+      "st1 { v24.b }[2], [x23]\n"
+      "b 91f\n"
+      "90:"  // Height 3: Partial direct writeback: partial_1_0 (bits 3..1 clear; bit 0 is not tested, one byte is stored)
+      "str b16, [x9, #0x0]\n"
+      "str b20, [x25, #0x0]\n"
+      "str b24, [x23, #0x0]\n"
+      "91:"  // Height 3: Partial direct writeback: Done
+      "b 93f\n"
+      "92:"  // Height 3: Full writeback (16 bytes per row, then advance each row pointer by 16)
+      "str q16, [x9, #0x0]\n"
+      "str q20, [x25, #0x0]\n"
+      "str q24, [x23, #0x0]\n"
+      "add x9, x9, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "add x23, x23, #0x10\n"
+      "93:"  // Height 3: Writeback done
+      "subs x12, x12, #0x10\n"  // 16 columns consumed by this pass
+      "bgt 65b\n"  // columns remain (x12 > 0): branch back to label 65
+      "b 126f\n"
+      "94:"  // Height 4: four-row variant; v11-v14 hold the per-row sums, v16-v31 the 4x4 vector accumulators
+      "movi v11.4s, #0x0\n"  // clear row-sum accumulator for row 0 (v12/v13/v14 below: rows 1/2/3)
+      "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"  // x12 = N field of the args struct
+      "mov x10, %x[col_bias]\n"  // x10 = col_bias pointer; read in 0x40-byte steps after the string loop
+      "movi v12.4s, #0x0\n"
+      "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"  // x11 = B_ptr field; advanced as the multiply loops consume it
+      "bic %x[flags], %x[flags], #0x80000000\n"  // clear bit 31 so the "skip row sum" branches (tbnz #31) are not taken yet
+      "movi v13.4s, #0x0\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"  // x19 = output_offset field, applied to the row pointers below
+      "movi v14.4s, #0x0\n"
+      "movi v15.16b, #0x1\n"  // every byte = 1: sdot of an input vector against v15 adds up its bytes (row sum)
+      "tbz %x[flags], #2, 95f\n"  // bit 2 clear -> direct output (label 95); set -> output_ptr is read as a table of row pointers
+      "ldr x9, [%x[output_ptr], #0x0]\n"  // row 0 pointer from the table
+      "ldr x25, [%x[output_ptr], #0x8]\n"  // row 1 pointer
+      "add x9, x9, x19\n"  // each table pointer is offset by x19
+      "ldr x23, [%x[output_ptr], #0x10]\n"  // row 2 pointer
+      "ldr x21, [%x[output_ptr], #0x18]\n"  // row 3 pointer
+      "add x25, x25, x19\n"
+      "add %x[output_ptr], %x[output_ptr], #0x20\n"  // step past the four 8-byte entries just read
+      "add x23, x23, x19\n"
+      "add x21, x21, x19\n"
+      "b 96f\n"
+      "95:"  // Height 4: setup direct output
+      "mov x9, %x[output_ptr]\n"  // row 0 starts at output_ptr
+      "add x25, x9, x19\n"  // rows 1..3 each start x19 bytes after the previous row
+      "add x23, x25, x19\n"
+      "add x21, x23, x19\n"
+      "add %x[output_ptr], x21, x19\n"  // output_ptr now points one row past row 3
+      "96:"  // Height 4: Column loop
+      "movi v16.4s, #0x0\n"  // v16-v19: row 0 accumulators
+      "movi v17.4s, #0x0\n"
+      "movi v18.4s, #0x0\n"
+      "movi v19.4s, #0x0\n"
+      "movi v20.4s, #0x0\n"  // v20-v23: row 1 accumulators
+      "movi v21.4s, #0x0\n"
+      "movi v22.4s, #0x0\n"
+      "movi v23.4s, #0x0\n"
+      "movi v24.4s, #0x0\n"  // v24-v27: row 2 accumulators
+      "movi v25.4s, #0x0\n"
+      "movi v26.4s, #0x0\n"
+      "movi v27.4s, #0x0\n"
+      "movi v28.4s, #0x0\n"  // v28-v31: row 3 accumulators
+      "movi v29.4s, #0x0\n"
+      "movi v30.4s, #0x0\n"
+      "movi v31.4s, #0x0\n"
+      "97:"  // Height 4: setup done
+      "mov x28, #0x0\n"  // x28 = string index, starts at 0
+      "98:"  // Height 4: String loop (one pass per input string, indexed by x28)
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"  // x20 = string_lengths field (base of an array)
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"  // x19 = input_offset field
+      "ldr w27, [x20, x28, LSL #0x2]\n"  // x27 = 32-bit length entry for string x28; counted down by the multiply loops
+      "tbz %x[flags], #3, 99f\n"  // bit 3 clear -> direct input (label 99); set -> input_ptr is a table of pointer tables
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"  // x20 = 8-byte entry x28 of input_ptr
+      "add x20, x20, x19, LSL #3\n"  // skip x19 8-byte entries within that table
+      "ldr x26, [x20, #0x0]\n"  // row 0 input pointer
+      "ldr x24, [x20, #0x8]\n"  // row 1 input pointer
+      "ldr x22, [x20, #0x10]\n"  // row 2 input pointer
+      "ldr x20, [x20, #0x18]\n"  // row 3 input pointer (x20 is reused for it)
+      "cbnz x28, 100f\n"  // the initial-column offset below is applied for string 0 only
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "add x22, x22, x19\n"
+      "add x20, x20, x19\n"
+      "b 100f\n"
+      "99:"  // Height 4: setup direct input
+      "mov x26, %x[input_ptr]\n"  // row 0 starts at input_ptr
+      "add x24, x26, x19\n"  // rows 1..3 each start x19 bytes after the previous row
+      "add x22, x24, x19\n"
+      "add x20, x22, x19\n"
+      "100:"  // Height 4: input setup done
+      "cmp x27, #0x10\n"
+      "blt 105f\n"  // fewer than 16 bytes: bypass the 16-byte iterations entirely
+      "cmp x27, #0x20\n"
+      "blt 103f\n"  // fewer than 32 bytes: go directly to the single final 16-byte iteration
+      "101:"  // Height 4: Multiply loop: Main loop head
+      "ldr q0, [x26, #0x0]\n"
+      "ldr q1, [x24, #0x0]\n"
+      "ldr q2, [x22, #0x0]\n"
+      "ldr q3, [x20, #0x0]\n"
+      "ldr q4, [x11, #0x0]\n"
+      ".inst 0x4f80e090  // sdot v16.4s, v4.16b, v0.4b[0]\n"
+      "ldr q5, [x11, #0x10]\n"
+      ".inst 0x4f81e094  // sdot v20.4s, v4.16b, v1.4b[0]\n"
+      "ldr q6, [x11, #0x20]\n"
+      ".inst 0x4f82e098  // sdot v24.4s, v4.16b, v2.4b[0]\n"
+      "ldr q7, [x11, #0x30]\n"
+      ".inst 0x4f83e09c  // sdot v28.4s, v4.16b, v3.4b[0]\n"
+      "ldr q8, [x11, #0x40]\n"
+      "ldr q9, [x11, #0x50]\n"
+      ".inst 0x4f80e0b1  // sdot v17.4s, v5.16b, v0.4b[0]\n"
+      "ldr q10, [x11, #0x60]\n"
+      ".inst 0x4f81e0b5  // sdot v21.4s, v5.16b, v1.4b[0]\n"
+      "ldr q4, [x11, #0x70]\n"
+      ".inst 0x4f82e0b9  // sdot v25.4s, v5.16b, v2.4b[0]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x4f83e0bd  // sdot v29.4s, v5.16b, v3.4b[0]\n"
+      "ldr q5, [x11, #0x80]\n"
+      ".inst 0x4f80e0d2  // sdot v18.4s, v6.16b, v0.4b[0]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x4f81e0d6  // sdot v22.4s, v6.16b, v1.4b[0]\n"
+      "add x22, x22, #0x10\n"
+      ".inst 0x4f82e0da  // sdot v26.4s, v6.16b, v2.4b[0]\n"
+      "add x20, x20, #0x10\n"
+      ".inst 0x4f83e0de  // sdot v30.4s, v6.16b, v3.4b[0]\n"
+      "ldr q6, [x11, #0x90]\n"
+      ".inst 0x4f80e0f3  // sdot v19.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0f7  // sdot v23.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0fb  // sdot v27.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0ff  // sdot v31.4s, v7.16b, v3.4b[0]\n"
+      "ldr q7, [x11, #0xa0]\n"
+      ".inst 0x4fa0e110  // sdot v16.4s, v8.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e114  // sdot v20.4s, v8.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e118  // sdot v24.4s, v8.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e11c  // sdot v28.4s, v8.16b, v3.4b[1]\n"
+      "ldr q8, [x11, #0xb0]\n"
+      ".inst 0x4fa0e131  // sdot v17.4s, v9.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e135  // sdot v21.4s, v9.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e139  // sdot v25.4s, v9.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e13d  // sdot v29.4s, v9.16b, v3.4b[1]\n"
+      "ldr q9, [x11, #0xc0]\n"
+      ".inst 0x4fa0e152  // sdot v18.4s, v10.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e156  // sdot v22.4s, v10.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e15a  // sdot v26.4s, v10.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e15e  // sdot v30.4s, v10.16b, v3.4b[1]\n"
+      "ldr q10, [x11, #0xd0]\n"
+      ".inst 0x4fa0e093  // sdot v19.4s, v4.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e097  // sdot v23.4s, v4.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e09b  // sdot v27.4s, v4.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e09f  // sdot v31.4s, v4.16b, v3.4b[1]\n"
+      "ldr q4, [x11, #0xe0]\n"
+      ".inst 0x4f80e8b0  // sdot v16.4s, v5.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8b4  // sdot v20.4s, v5.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8b8  // sdot v24.4s, v5.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8bc  // sdot v28.4s, v5.16b, v3.4b[2]\n"
+      "ldr q5, [x11, #0xf0]\n"
+      ".inst 0x4f80e8d1  // sdot v17.4s, v6.16b, v0.4b[2]\n"
+      "add x11, x11, #0x100\n"
+      ".inst 0x4f81e8d5  // sdot v21.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d9  // sdot v25.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8dd  // sdot v29.4s, v6.16b, v3.4b[2]\n"
+      ".inst 0x4f80e8f2  // sdot v18.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8f6  // sdot v22.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8fa  // sdot v26.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8fe  // sdot v30.4s, v7.16b, v3.4b[2]\n"
+      ".inst 0x4f80e913  // sdot v19.4s, v8.16b, v0.4b[2]\n"
+      ".inst 0x4f81e917  // sdot v23.4s, v8.16b, v1.4b[2]\n"
+      ".inst 0x4f82e91b  // sdot v27.4s, v8.16b, v2.4b[2]\n"
+      ".inst 0x4f83e91f  // sdot v31.4s, v8.16b, v3.4b[2]\n"
+      ".inst 0x4fa0e930  // sdot v16.4s, v9.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e934  // sdot v20.4s, v9.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e938  // sdot v24.4s, v9.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e93c  // sdot v28.4s, v9.16b, v3.4b[3]\n"
+      ".inst 0x4fa0e951  // sdot v17.4s, v10.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e955  // sdot v21.4s, v10.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e959  // sdot v25.4s, v10.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e95d  // sdot v29.4s, v10.16b, v3.4b[3]\n"
+      ".inst 0x4fa0e892  // sdot v18.4s, v4.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e896  // sdot v22.4s, v4.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e89a  // sdot v26.4s, v4.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e89e  // sdot v30.4s, v4.16b, v3.4b[3]\n"
+      ".inst 0x4fa0e8b3  // sdot v19.4s, v5.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8b7  // sdot v23.4s, v5.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8bb  // sdot v27.4s, v5.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8bf  // sdot v31.4s, v5.16b, v3.4b[3]\n"
+      "tbnz %x[flags], #31, 102f\n"
+      ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
+      ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
+      ".inst 0x4e8f944d  // sdot v13.4s, v2.16b, v15.16b\n"
+      ".inst 0x4e8f946e  // sdot v14.4s, v3.16b, v15.16b\n"
+      "102:"  // Height 4: Multiply loop: unique 13: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "sub x27, x27, #0x10\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "cmp x27, #0x20\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "bge 101b\n"
+      "103:"  // Height 4: Multiply loop: Single iteration only
+      "sub x27, x27, #0x10\n"
+      "ldr q0, [x26, #0x0]\n"
+      "ldr q1, [x24, #0x0]\n"
+      "ldr q2, [x22, #0x0]\n"
+      "ldr q3, [x20, #0x0]\n"
+      "ldr q6, [x11, #0x0]\n"
+      ".inst 0x4f80e0d0  // sdot v16.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x11, #0x10]\n"
+      ".inst 0x4f81e0d4  // sdot v20.4s, v6.16b, v1.4b[0]\n"
+      "ldr q8, [x11, #0x20]\n"
+      ".inst 0x4f82e0d8  // sdot v24.4s, v6.16b, v2.4b[0]\n"
+      "ldr q9, [x11, #0x30]\n"
+      ".inst 0x4f83e0dc  // sdot v28.4s, v6.16b, v3.4b[0]\n"
+      "ldr q10, [x11, #0x40]\n"
+      "ldr q4, [x11, #0x50]\n"
+      ".inst 0x4f80e0f1  // sdot v17.4s, v7.16b, v0.4b[0]\n"
+      "ldr q5, [x11, #0x60]\n"
+      ".inst 0x4f81e0f5  // sdot v21.4s, v7.16b, v1.4b[0]\n"
+      "ldr q6, [x11, #0x70]\n"
+      ".inst 0x4f82e0f9  // sdot v25.4s, v7.16b, v2.4b[0]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x4f83e0fd  // sdot v29.4s, v7.16b, v3.4b[0]\n"
+      "ldr q7, [x11, #0x80]\n"
+      ".inst 0x4f80e112  // sdot v18.4s, v8.16b, v0.4b[0]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x4f81e116  // sdot v22.4s, v8.16b, v1.4b[0]\n"
+      "add x22, x22, #0x10\n"
+      ".inst 0x4f82e11a  // sdot v26.4s, v8.16b, v2.4b[0]\n"
+      "add x20, x20, #0x10\n"
+      ".inst 0x4f83e11e  // sdot v30.4s, v8.16b, v3.4b[0]\n"
+      "ldr q8, [x11, #0x90]\n"
+      ".inst 0x4f80e133  // sdot v19.4s, v9.16b, v0.4b[0]\n"
+      ".inst 0x4f81e137  // sdot v23.4s, v9.16b, v1.4b[0]\n"
+      ".inst 0x4f82e13b  // sdot v27.4s, v9.16b, v2.4b[0]\n"
+      ".inst 0x4f83e13f  // sdot v31.4s, v9.16b, v3.4b[0]\n"
+      "ldr q9, [x11, #0xa0]\n"
+      ".inst 0x4fa0e150  // sdot v16.4s, v10.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e154  // sdot v20.4s, v10.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e158  // sdot v24.4s, v10.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e15c  // sdot v28.4s, v10.16b, v3.4b[1]\n"
+      "ldr q10, [x11, #0xb0]\n"
+      ".inst 0x4fa0e091  // sdot v17.4s, v4.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e095  // sdot v21.4s, v4.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e099  // sdot v25.4s, v4.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e09d  // sdot v29.4s, v4.16b, v3.4b[1]\n"
+      "ldr q4, [x11, #0xc0]\n"
+      ".inst 0x4fa0e0b2  // sdot v18.4s, v5.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0b6  // sdot v22.4s, v5.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0ba  // sdot v26.4s, v5.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0be  // sdot v30.4s, v5.16b, v3.4b[1]\n"
+      "ldr q5, [x11, #0xd0]\n"
+      ".inst 0x4fa0e0d3  // sdot v19.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0d7  // sdot v23.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0db  // sdot v27.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0df  // sdot v31.4s, v6.16b, v3.4b[1]\n"
+      "ldr q6, [x11, #0xe0]\n"
+      ".inst 0x4f80e8f0  // sdot v16.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8f4  // sdot v20.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f8  // sdot v24.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8fc  // sdot v28.4s, v7.16b, v3.4b[2]\n"
+      "ldr q7, [x11, #0xf0]\n"
+      ".inst 0x4f80e911  // sdot v17.4s, v8.16b, v0.4b[2]\n"
+      "add x11, x11, #0x100\n"
+      ".inst 0x4f81e915  // sdot v21.4s, v8.16b, v1.4b[2]\n"
+      ".inst 0x4f82e919  // sdot v25.4s, v8.16b, v2.4b[2]\n"
+      ".inst 0x4f83e91d  // sdot v29.4s, v8.16b, v3.4b[2]\n"
+      ".inst 0x4f80e932  // sdot v18.4s, v9.16b, v0.4b[2]\n"
+      ".inst 0x4f81e936  // sdot v22.4s, v9.16b, v1.4b[2]\n"
+      ".inst 0x4f82e93a  // sdot v26.4s, v9.16b, v2.4b[2]\n"
+      ".inst 0x4f83e93e  // sdot v30.4s, v9.16b, v3.4b[2]\n"
+      ".inst 0x4f80e953  // sdot v19.4s, v10.16b, v0.4b[2]\n"
+      ".inst 0x4f81e957  // sdot v23.4s, v10.16b, v1.4b[2]\n"
+      ".inst 0x4f82e95b  // sdot v27.4s, v10.16b, v2.4b[2]\n"
+      ".inst 0x4f83e95f  // sdot v31.4s, v10.16b, v3.4b[2]\n"
+      ".inst 0x4fa0e890  // sdot v16.4s, v4.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e894  // sdot v20.4s, v4.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e898  // sdot v24.4s, v4.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e89c  // sdot v28.4s, v4.16b, v3.4b[3]\n"
+      ".inst 0x4fa0e8b1  // sdot v17.4s, v5.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8b5  // sdot v21.4s, v5.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8b9  // sdot v25.4s, v5.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8bd  // sdot v29.4s, v5.16b, v3.4b[3]\n"
+      ".inst 0x4fa0e8d2  // sdot v18.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8d6  // sdot v22.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8da  // sdot v26.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8de  // sdot v30.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x4fa0e8f3  // sdot v19.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8f7  // sdot v23.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8fb  // sdot v27.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8ff  // sdot v31.4s, v7.16b, v3.4b[3]\n"
+      "tbnz %x[flags], #31, 104f\n"
+      ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
+      ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
+      ".inst 0x4e8f944d  // sdot v13.4s, v2.16b, v15.16b\n"
+      ".inst 0x4e8f946e  // sdot v14.4s, v3.16b, v15.16b\n"
+      "104:"  // Height 4: Multiply loop: unique 14: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "105:"  // Height 4: Multiply loop: Main loop skip
+      "cbz x27, 112f\n"
+      "cmp x27, #0x4\n"
+      "blt 108f\n"
+      "106:"  // Height 4: Multiply loop: Odd block loop
+      "ldr s0, [x26], #0x4\n"
+      "ldr s1, [x24], #0x4\n"
+      "ldr s2, [x22], #0x4\n"
+      "ldr s3, [x20], #0x4\n"
+      "tbnz %x[flags], #31, 107f\n"
+      ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
+      ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
+      ".inst 0x4e8f944d  // sdot v13.4s, v2.16b, v15.16b\n"
+      ".inst 0x4e8f946e  // sdot v14.4s, v3.16b, v15.16b\n"
+      "107:"  // Height 4: Multiply loop: unique 15: skip row sum
+      "ldr q8, [x11, #0x0]\n"
+      ".inst 0x4f80e110  // sdot v16.4s, v8.16b, v0.4b[0]\n"
+      "ldr q9, [x11, #0x10]\n"
+      ".inst 0x4f81e114  // sdot v20.4s, v8.16b, v1.4b[0]\n"
+      "ldr q10, [x11, #0x20]\n"
+      ".inst 0x4f82e118  // sdot v24.4s, v8.16b, v2.4b[0]\n"
+      "ldr q4, [x11, #0x30]\n"
+      ".inst 0x4f83e11c  // sdot v28.4s, v8.16b, v3.4b[0]\n"
+      "sub x27, x27, #0x4\n"
+      "add x11, x11, #0x40\n"
+      ".inst 0x4f80e131  // sdot v17.4s, v9.16b, v0.4b[0]\n"
+      "cmp x27, #0x4\n"
+      ".inst 0x4f81e135  // sdot v21.4s, v9.16b, v1.4b[0]\n"
+      ".inst 0x4f82e139  // sdot v25.4s, v9.16b, v2.4b[0]\n"
+      ".inst 0x4f83e13d  // sdot v29.4s, v9.16b, v3.4b[0]\n"
+      ".inst 0x4f80e152  // sdot v18.4s, v10.16b, v0.4b[0]\n"
+      ".inst 0x4f81e156  // sdot v22.4s, v10.16b, v1.4b[0]\n"
+      ".inst 0x4f82e15a  // sdot v26.4s, v10.16b, v2.4b[0]\n"
+      ".inst 0x4f83e15e  // sdot v30.4s, v10.16b, v3.4b[0]\n"
+      ".inst 0x4f80e093  // sdot v19.4s, v4.16b, v0.4b[0]\n"
+      ".inst 0x4f81e097  // sdot v23.4s, v4.16b, v1.4b[0]\n"
+      ".inst 0x4f82e09b  // sdot v27.4s, v4.16b, v2.4b[0]\n"
+      ".inst 0x4f83e09f  // sdot v31.4s, v4.16b, v3.4b[0]\n"
+      "bge 106b\n"
+      "cbz x27, 112f\n"
+      "108:"  // Height 4: Multiply loop: Skip odd blocks (x27 is 1..3 here: below 4 and non-zero)
+      "tbz x27, #1, 109f\n"  // bit 1 clear -> exactly one byte left per row
+      "ldr h0, [x26], #0x2\n"  // read two bytes from each of the four rows, post-incrementing the row pointers
+      "ldr h1, [x24], #0x2\n"
+      "ldr h2, [x22], #0x2\n"
+      "ldr h3, [x20], #0x2\n"
+      "tbz x27, #0, 110f\n"  // bit 0 clear -> exactly two bytes, nothing more to read
+      "ld1 { v0.b }[2], [x26]\n"  // third byte goes into byte lane 2
+      "ld1 { v1.b }[2], [x24]\n"
+      "ld1 { v2.b }[2], [x22]\n"
+      "ld1 { v3.b }[2], [x20]\n"
+      "b 110f\n"
+      "109:"  // Height 4: Multiply loop: Ragged operand read: partial_1_0 (single byte per row)
+      "ldr b0, [x26, #0x0]\n"
+      "ldr b1, [x24, #0x0]\n"
+      "ldr b2, [x22, #0x0]\n"
+      "ldr b3, [x20, #0x0]\n"
+      "110:"  // Height 4: Multiply loop: Ragged operand read: Done
+      "tbnz %x[flags], #31, 111f\n"
+      ".inst 0x4e8f940b  // sdot v11.4s, v0.16b, v15.16b\n"
+      ".inst 0x4e8f942c  // sdot v12.4s, v1.16b, v15.16b\n"
+      ".inst 0x4e8f944d  // sdot v13.4s, v2.16b, v15.16b\n"
+      ".inst 0x4e8f946e  // sdot v14.4s, v3.16b, v15.16b\n"
+      "111:"  // Height 4: Multiply loop: unique 16: skip row sum
+      "ldr q5, [x11, #0x0]\n"
+      ".inst 0x4f80e0b0  // sdot v16.4s, v5.16b, v0.4b[0]\n"
+      "ldr q6, [x11, #0x10]\n"
+      ".inst 0x4f81e0b4  // sdot v20.4s, v5.16b, v1.4b[0]\n"
+      "ldr q7, [x11, #0x20]\n"
+      ".inst 0x4f82e0b8  // sdot v24.4s, v5.16b, v2.4b[0]\n"
+      "ldr q8, [x11, #0x30]\n"
+      ".inst 0x4f83e0bc  // sdot v28.4s, v5.16b, v3.4b[0]\n"
+      "add x11, x11, #0x40\n"
+      ".inst 0x4f80e0d1  // sdot v17.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0d5  // sdot v21.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d9  // sdot v25.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0dd  // sdot v29.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f80e0f2  // sdot v18.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0f6  // sdot v22.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0fa  // sdot v26.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0fe  // sdot v30.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f80e113  // sdot v19.4s, v8.16b, v0.4b[0]\n"
+      ".inst 0x4f81e117  // sdot v23.4s, v8.16b, v1.4b[0]\n"
+      ".inst 0x4f82e11b  // sdot v27.4s, v8.16b, v2.4b[0]\n"
+      ".inst 0x4f83e11f  // sdot v31.4s, v8.16b, v3.4b[0]\n"
+      "112:"  // Height 4: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"  // w19 = num_strings field
+      "add x28, x28, #0x1\n"  // next string index
+      "cmp x28, x19\n"
+      "bne 98b\n"  // repeat the string loop until x28 equals num_strings
+      "prfm pstl1keep, [x9, #0x0]\n"  // prefetch-for-store hints on the four output row pointers
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "prfm pstl1keep, [x21, #0x0]\n"
+      "tbnz %x[flags], #31, 113f\n"  // bit 31 set -> skip the fixup and use v11-v14 as they are
+      "addp v11.4s, v11.4s, v11.4s\n"  // first of two pairwise adds per row-sum register
+      "add x19, %x[qp], %[b_offset]\n"
+      "ld1r { v4.4s }, [x19]\n"  // replicate the 32-bit word at qp + b_offset into all four lanes of v4
+      "addp v12.4s, v12.4s, v12.4s\n"
+      "addp v13.4s, v13.4s, v13.4s\n"
+      "addp v14.4s, v14.4s, v14.4s\n"
+      "addp v11.4s, v11.4s, v11.4s\n"  // second pairwise add: every lane now holds the sum of the original four lanes
+      "addp v12.4s, v12.4s, v12.4s\n"
+      "addp v13.4s, v13.4s, v13.4s\n"
+      "addp v14.4s, v14.4s, v14.4s\n"
+      "neg v4.4s, v4.4s\n"  // v4 = negated b_offset word
+      "mul v11.4s, v11.4s, v4.4s\n"  // v11..v14 = row sum * (negated b_offset word), identical in every lane
+      "mul v12.4s, v12.4s, v4.4s\n"
+      "mul v13.4s, v13.4s, v4.4s\n"
+      "mul v14.4s, v14.4s, v4.4s\n"
+      "113:"  // Height 4: skip row sum fixup
+      "add v16.4s, v16.4s, v11.4s\n"
+      "orr %x[flags], %x[flags], #0x80000000\n"
+      "add v17.4s, v17.4s, v11.4s\n"
+      "ldr q0, [x10, #0x0]\n"
+      "add v18.4s, v18.4s, v11.4s\n"
+      "ldr q1, [x10, #0x10]\n"
+      "add v19.4s, v19.4s, v11.4s\n"
+      "ldr q2, [x10, #0x20]\n"
+      "add v20.4s, v20.4s, v12.4s\n"
+      "ldr q3, [x10, #0x30]\n"
+      "add v21.4s, v21.4s, v12.4s\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "add v22.4s, v22.4s, v12.4s\n"
+      "add x19, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x19]\n"
+      "add v23.4s, v23.4s, v12.4s\n"
+      "add x10, x10, #0x40\n"
+      "add v24.4s, v24.4s, v13.4s\n"
+      "add v25.4s, v25.4s, v13.4s\n"
+      "add v26.4s, v26.4s, v13.4s\n"
+      "add v27.4s, v27.4s, v13.4s\n"
+      "add v28.4s, v28.4s, v14.4s\n"
+      "add v29.4s, v29.4s, v14.4s\n"
+      "add v30.4s, v30.4s, v14.4s\n"
+      "add v31.4s, v31.4s, v14.4s\n"
+      "add v16.4s, v16.4s, v0.4s\n"
+      "add v17.4s, v17.4s, v1.4s\n"
+      "add v18.4s, v18.4s, v2.4s\n"
+      "add v19.4s, v19.4s, v3.4s\n"
+      "add v20.4s, v20.4s, v0.4s\n"
+      "add v21.4s, v21.4s, v1.4s\n"
+      "add v22.4s, v22.4s, v2.4s\n"
+      "add v23.4s, v23.4s, v3.4s\n"
+      "add v24.4s, v24.4s, v0.4s\n"
+      "add v25.4s, v25.4s, v1.4s\n"
+      "add v26.4s, v26.4s, v2.4s\n"
+      "add v27.4s, v27.4s, v3.4s\n"
+      "add v28.4s, v28.4s, v0.4s\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add v29.4s, v29.4s, v1.4s\n"
+      "add v30.4s, v30.4s, v2.4s\n"
+      "add v31.4s, v31.4s, v3.4s\n"
+      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+      "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+      "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+      "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+      "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+      "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+      "sqrdmulh v25.4s, v25.4s, v4.4s\n"
+      "sqrdmulh v26.4s, v26.4s, v4.4s\n"
+      "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+      "sqrdmulh v28.4s, v28.4s, v4.4s\n"
+      "sqrdmulh v29.4s, v29.4s, v4.4s\n"
+      "sqrdmulh v30.4s, v30.4s, v4.4s\n"
+      "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+      "tbz %x[flags], #5, 114f\n"
+      "and v4.16b, v16.16b, v0.16b\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "and v5.16b, v17.16b, v0.16b\n"
+      "and v6.16b, v18.16b, v0.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "and v7.16b, v19.16b, v0.16b\n"
+      "and v8.16b, v20.16b, v0.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "and v9.16b, v21.16b, v0.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v4.4s\n"
+      "and v10.16b, v22.16b, v0.16b\n"
+      "sshr v8.4s, v8.4s, #0x1f\n"
+      "and v4.16b, v23.16b, v0.16b\n"
+      "sshr v9.4s, v9.4s, #0x1f\n"
+      "sqadd v17.4s, v17.4s, v5.4s\n"
+      "sshr v10.4s, v10.4s, #0x1f\n"
+      "sqadd v18.4s, v18.4s, v6.4s\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "and v5.16b, v24.16b, v0.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sqadd v19.4s, v19.4s, v7.4s\n"
+      "sqadd v20.4s, v20.4s, v8.4s\n"
+      "sqadd v21.4s, v21.4s, v9.4s\n"
+      "sqadd v22.4s, v22.4s, v10.4s\n"
+      "sqadd v23.4s, v23.4s, v4.4s\n"
+      "and v6.16b, v25.16b, v0.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v24.4s, v24.4s, v5.4s\n"
+      "and v7.16b, v26.16b, v0.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "and v8.16b, v27.16b, v0.16b\n"
+      "and v9.16b, v28.16b, v0.16b\n"
+      "sshr v8.4s, v8.4s, #0x1f\n"
+      "sqadd v25.4s, v25.4s, v6.4s\n"
+      "and v10.16b, v29.16b, v0.16b\n"
+      "sshr v9.4s, v9.4s, #0x1f\n"
+      "and v4.16b, v30.16b, v0.16b\n"
+      "sshr v10.4s, v10.4s, #0x1f\n"
+      "sqadd v26.4s, v26.4s, v7.4s\n"
+      "and v5.16b, v31.16b, v0.16b\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sqadd v27.4s, v27.4s, v8.4s\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sqadd v28.4s, v28.4s, v9.4s\n"
+      "sqadd v29.4s, v29.4s, v10.4s\n"
+      "sqadd v30.4s, v30.4s, v4.4s\n"
+      "sqadd v31.4s, v31.4s, v5.4s\n"
+      "114:"  // Height 4: no shift correction
+      "srshl v16.4s, v16.4s, v0.4s\n"
+      "add x19, %x[qp], %[c_offset]\n"
+      "ld1r { v4.4s }, [x19]\n"
+      "srshl v17.4s, v17.4s, v0.4s\n"
+      "add x19, %x[qp], %[minval]\n"
+      "srshl v18.4s, v18.4s, v0.4s\n"
+      "ld1r { v5.4s }, [x19]\n"
+      "add x19, %x[qp], %[maxval]\n"
+      "srshl v19.4s, v19.4s, v0.4s\n"
+      "ld1r { v6.4s }, [x19]\n"
+      "cmp x12, #0x10\n"
+      "srshl v20.4s, v20.4s, v0.4s\n"
+      "srshl v21.4s, v21.4s, v0.4s\n"
+      "srshl v22.4s, v22.4s, v0.4s\n"
+      "srshl v23.4s, v23.4s, v0.4s\n"
+      "add v16.4s, v16.4s, v4.4s\n"
+      "add v17.4s, v17.4s, v4.4s\n"
+      "add v18.4s, v18.4s, v4.4s\n"
+      "smin v16.4s, v16.4s, v6.4s\n"
+      "smin v17.4s, v17.4s, v6.4s\n"
+      "smin v18.4s, v18.4s, v6.4s\n"
+      "smax v16.4s, v16.4s, v5.4s\n"
+      "smax v17.4s, v17.4s, v5.4s\n"
+      "smax v18.4s, v18.4s, v5.4s\n"
+      "add v19.4s, v19.4s, v4.4s\n"
+      "add v20.4s, v20.4s, v4.4s\n"
+      "add v21.4s, v21.4s, v4.4s\n"
+      "smin v19.4s, v19.4s, v6.4s\n"
+      "smin v20.4s, v20.4s, v6.4s\n"
+      "smin v21.4s, v21.4s, v6.4s\n"
+      "smax v19.4s, v19.4s, v5.4s\n"
+      "smax v20.4s, v20.4s, v5.4s\n"
+      "smax v21.4s, v21.4s, v5.4s\n"
+      "add v22.4s, v22.4s, v4.4s\n"
+      "add v23.4s, v23.4s, v4.4s\n"
+      "srshl v24.4s, v24.4s, v0.4s\n"
+      "smin v22.4s, v22.4s, v6.4s\n"
+      "smin v23.4s, v23.4s, v6.4s\n"
+      "srshl v25.4s, v25.4s, v0.4s\n"
+      "smax v22.4s, v22.4s, v5.4s\n"
+      "smax v23.4s, v23.4s, v5.4s\n"
+      "add v24.4s, v24.4s, v4.4s\n"
+      "add v25.4s, v25.4s, v4.4s\n"
+      "srshl v26.4s, v26.4s, v0.4s\n"
+      "smin v24.4s, v24.4s, v6.4s\n"
+      "smin v25.4s, v25.4s, v6.4s\n"
+      "srshl v27.4s, v27.4s, v0.4s\n"
+      "smax v24.4s, v24.4s, v5.4s\n"
+      "smax v25.4s, v25.4s, v5.4s\n"
+      "add v26.4s, v26.4s, v4.4s\n"
+      "add v27.4s, v27.4s, v4.4s\n"
+      "srshl v28.4s, v28.4s, v0.4s\n"
+      "smin v26.4s, v26.4s, v6.4s\n"
+      "smin v27.4s, v27.4s, v6.4s\n"
+      "srshl v29.4s, v29.4s, v0.4s\n"
+      "smax v26.4s, v26.4s, v5.4s\n"
+      "smax v27.4s, v27.4s, v5.4s\n"
+      "add v28.4s, v28.4s, v4.4s\n"
+      "add v29.4s, v29.4s, v4.4s\n"
+      "srshl v30.4s, v30.4s, v0.4s\n"
+      "smin v28.4s, v28.4s, v6.4s\n"
+      "smin v29.4s, v29.4s, v6.4s\n"
+      "srshl v31.4s, v31.4s, v0.4s\n"
+      "smax v28.4s, v28.4s, v5.4s\n"
+      "smax v29.4s, v29.4s, v5.4s\n"
+      "add v30.4s, v30.4s, v4.4s\n"
+      "add v31.4s, v31.4s, v4.4s\n"
+      "uzp1 v16.8h, v16.8h, v17.8h\n"
+      "smin v30.4s, v30.4s, v6.4s\n"
+      "smin v31.4s, v31.4s, v6.4s\n"
+      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "smax v30.4s, v30.4s, v5.4s\n"
+      "smax v31.4s, v31.4s, v5.4s\n"
+      "uzp1 v20.8h, v20.8h, v21.8h\n"
+      "uzp1 v21.8h, v22.8h, v23.8h\n"
+      "uzp1 v24.8h, v24.8h, v25.8h\n"
+      "uzp1 v25.8h, v26.8h, v27.8h\n"
+      "uzp1 v28.8h, v28.8h, v29.8h\n"
+      "uzp1 v29.8h, v30.8h, v31.8h\n"
+      "uzp1 v16.16b, v16.16b, v17.16b\n"
+      "uzp1 v20.16b, v20.16b, v21.16b\n"
+      "uzp1 v24.16b, v24.16b, v25.16b\n"
+      "uzp1 v28.16b, v28.16b, v29.16b\n"
+      "bge 123f\n"
+      "tbz x12, #3, 118f\n"
+      "str d16, [x9], #0x8\n"
+      "str d20, [x25], #0x8\n"
+      "str d24, [x23], #0x8\n"
+      "str d28, [x21], #0x8\n"
+      "tbz x12, #2, 116f\n"
+      "st1 { v16.s }[2], [x9], #0x4\n"
+      "st1 { v20.s }[2], [x25], #0x4\n"
+      "st1 { v24.s }[2], [x23], #0x4\n"
+      "st1 { v28.s }[2], [x21], #0x4\n"
+      "tbz x12, #1, 115f\n"
+      "st1 { v16.h }[6], [x9], #0x2\n"
+      "st1 { v20.h }[6], [x25], #0x2\n"
+      "st1 { v24.h }[6], [x23], #0x2\n"
+      "st1 { v28.h }[6], [x21], #0x2\n"
+      "tbz x12, #0, 122f\n"
+      "st1 { v16.b }[14], [x9]\n"
+      "st1 { v20.b }[14], [x25]\n"
+      "st1 { v24.b }[14], [x23]\n"
+      "st1 { v28.b }[14], [x21]\n"
+      "b 122f\n"
+      "115:"  // Height 4: Partial direct writeback: partial_1_12
+      "tbz x12, #0, 122f\n"
+      "st1 { v16.b }[12], [x9]\n"
+      "st1 { v20.b }[12], [x25]\n"
+      "st1 { v24.b }[12], [x23]\n"
+      "st1 { v28.b }[12], [x21]\n"
+      "b 122f\n"
+      "116:"  // Height 4: Partial direct writeback: partial_2_8
+      "tbz x12, #1, 117f\n"
+      "st1 { v16.h }[4], [x9], #0x2\n"
+      "st1 { v20.h }[4], [x25], #0x2\n"
+      "st1 { v24.h }[4], [x23], #0x2\n"
+      "st1 { v28.h }[4], [x21], #0x2\n"
+      "tbz x12, #0, 122f\n"
+      "st1 { v16.b }[10], [x9]\n"
+      "st1 { v20.b }[10], [x25]\n"
+      "st1 { v24.b }[10], [x23]\n"
+      "st1 { v28.b }[10], [x21]\n"
+      "b 122f\n"
+      "117:"  // Height 4: Partial direct writeback: partial_1_8
+      "tbz x12, #0, 122f\n"
+      "st1 { v16.b }[8], [x9]\n"
+      "st1 { v20.b }[8], [x25]\n"
+      "st1 { v24.b }[8], [x23]\n"
+      "st1 { v28.b }[8], [x21]\n"
+      "b 122f\n"
+      "118:"  // Height 4: Partial direct writeback: partial_4_0
+      "tbz x12, #2, 120f\n"
+      "str s16, [x9], #0x4\n"
+      "str s20, [x25], #0x4\n"
+      "str s24, [x23], #0x4\n"
+      "str s28, [x21], #0x4\n"
+      "tbz x12, #1, 119f\n"
+      "st1 { v16.h }[2], [x9], #0x2\n"
+      "st1 { v20.h }[2], [x25], #0x2\n"
+      "st1 { v24.h }[2], [x23], #0x2\n"
+      "st1 { v28.h }[2], [x21], #0x2\n"
+      "tbz x12, #0, 122f\n"
+      "st1 { v16.b }[6], [x9]\n"
+      "st1 { v20.b }[6], [x25]\n"
+      "st1 { v24.b }[6], [x23]\n"
+      "st1 { v28.b }[6], [x21]\n"
+      "b 122f\n"
+      "119:"  // Height 4: Partial direct writeback: partial_1_4
+      "tbz x12, #0, 122f\n"
+      "st1 { v16.b }[4], [x9]\n"
+      "st1 { v20.b }[4], [x25]\n"
+      "st1 { v24.b }[4], [x23]\n"
+      "st1 { v28.b }[4], [x21]\n"
+      "b 122f\n"
+      "120:"  // Height 4: Partial direct writeback: partial_2_0
+      "tbz x12, #1, 121f\n"
+      "str h16, [x9], #0x2\n"
+      "str h20, [x25], #0x2\n"
+      "str h24, [x23], #0x2\n"
+      "str h28, [x21], #0x2\n"
+      "tbz x12, #0, 122f\n"
+      "st1 { v16.b }[2], [x9]\n"
+      "st1 { v20.b }[2], [x25]\n"
+      "st1 { v24.b }[2], [x23]\n"
+      "st1 { v28.b }[2], [x21]\n"
+      "b 122f\n"
+      "121:"  // Height 4: Partial direct writeback: partial_1_0
+      "str b16, [x9, #0x0]\n"
+      "str b20, [x25, #0x0]\n"
+      "str b24, [x23, #0x0]\n"
+      "str b28, [x21, #0x0]\n"
+      "122:"  // Height 4: Partial direct writeback: Done
+      "b 124f\n"
+      "123:"  // Height 4: Full writeback
+      "str q16, [x9, #0x0]\n"
+      "str q20, [x25, #0x0]\n"
+      "str q24, [x23, #0x0]\n"
+      "str q28, [x21, #0x0]\n"
+      "add x9, x9, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "add x23, x23, #0x10\n"
+      "add x21, x21, #0x10\n"
+      "124:"  // Height 4: Writeback done
+      "subs x12, x12, #0x10\n"
+      "bgt 96b\n"
+      "subs %x[M], %x[M], #0x4\n"
+      "beq 126f\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "tbz %x[flags], #3, 125f\n"
+      "add x20, x20, #0x4\n"
+      "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "b 1b\n"
+      "125:"  // Update direct input
+      "mov x19, #0x4\n"
+      "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+      "b 1b\n"
+      "126:"  // Exit
+
+      : [M] "+r" (M), [flags] "+r" (flags), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+      : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+    );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp
new file mode 100644
index 0000000000..6d4f3b2efe
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+
+#define ARGLIST  \
+   unsigned int, const unsigned int *, \
+   IndirectInputArg<int8_t>, \
+   size_t, size_t, \
+   const int8_t *, \
+   IndirectOutputArg<int8_t>, \
+   const Requantize32 *, const int32_t *, unsigned int
+
+namespace arm_gemm
+{
+
+// Kernel entry point; its definition is in a64_hybrid_s8qs_dot_6x16/generic.cpp
+void a64_hybrid_s8qs_dot_6x16( ARGLIST );
+
+class cls_a64_hybrid_s8qs_dot_6x16 // Describes the kernel: element types, block shape, entry point
+{
+public:
+    typedef int8_t operand_type; // element type of the input operands
+    typedef int8_t result_type;  // element type of the written results
+
+    typedef void (*kern_type)( ARGLIST ); // function-pointer type with the kernel's signature
+
+    /* Kernel blocking parameters (all compile-time constants) */
+    static constexpr unsigned int out_height() // height of the output block
+    {
+        return 6;
+    }
+
+    static constexpr unsigned int out_width() // width of the output block; constexpr like its siblings
+    {
+        return 16;
+    }
+
+    static constexpr unsigned int k_unroll() // unroll factor along K
+    {
+        return 4;
+    }
+
+    static constexpr bool supports_accumulate() // this kernel reports no accumulate support
+    {
+        return false;
+    }
+
+    StdTransformsFixed<operand_type, result_type, 6, 16, 4> transforms = {}; // 6, 16, 4 repeat the values above
+
+    // Default to the generic kernel
+    kern_type kernel=a64_hybrid_s8qs_dot_6x16;
+
+    cls_a64_hybrid_s8qs_dot_6x16(const CPUInfo *) // the CPUInfo argument is unnamed and ignored
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp
new file mode 100644
index 0000000000..0e98ab8347
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp
@@ -0,0 +1,3613 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_hybrid_s8qs_dot_6x16 (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+    size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg,
+    const Requantize32 *qp, const int32_t *col_bias, unsigned int col_base
+)
+{
+    struct KernelArgs {
+        const int32_t *multiplier_ptr = {};
+        const int32_t *shift_ptr = {};
+        unsigned int num_strings = {};
+        const unsigned int *string_lengths = {};
+        size_t N = {};
+        const int8_t *B_ptr = {};
+        size_t output_offset = {};
+        size_t input_initial_col = {};
+        size_t input_offset = {};
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    if (qp->per_channel_requant) {
+        flags |= 0x10;
+        ka.multiplier_ptr=qp->per_channel_muls + col_base;
+        ka.shift_ptr=qp->per_channel_right_shifts + col_base;
+    }
+    if (qp->c_offset > qp->minval) {
+        flags |= 0x20;
+    }
+    __asm__ __volatile__(
+
+      "1:"  // Row loop
+      "cmp %x[M], #0x6\n"
+      "bge 141f\n"
+      "cmp %x[M], #0x4\n"
+      "bgt 113f\n"
+      "beq 85f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 57f\n"
+      "beq 29f\n"
+      "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+      "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+      "mov x16, %x[col_bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 2f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19\n"
+      "b 3f\n"
+      "2:"  // Height 1: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "3:"  // Height 1: Column loop
+      "movi v8.4s, #0x0\n"
+      "movi v9.4s, #0x0\n"
+      "movi v10.4s, #0x0\n"
+      "movi v11.4s, #0x0\n"
+      "4:"  // Height 1: setup done
+      "mov x12, #0x0\n"
+      "5:"  // Height 1: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 6f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "cbnz x12, 7f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "b 7f\n"
+      "6:"  // Height 1: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "7:"  // Height 1: input setup done
+      "cmp x11, #0x10\n"
+      "blt 10f\n"
+      "cmp x11, #0x20\n"
+      "blt 9f\n"
+      "8:"  // Height 1: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      "sub x11, x11, #0x10\n"
+      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      "cmp x11, #0x20\n"
+      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      "bge 8b\n"
+      "9:"  // Height 1: Multiply loop: Single iteration only
+      "sub x11, x11, #0x10\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      "10:"  // Height 1: Multiply loop: Main loop skip
+      "cbz x11, 15f\n"
+      "cmp x11, #0x4\n"
+      "blt 12f\n"
+      "11:"  // Height 1: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "sub x11, x11, #0x4\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      "cmp x11, #0x4\n"
+      "bge 11b\n"
+      "cbz x11, 15f\n"
+      "12:"  // Height 1: Multiply loop: Skip odd blocks
+      "tbz x11, #1, 13f\n"
+      "ldr h0, [x10], #0x2\n"
+      "tbz x11, #0, 14f\n"
+      "ld1 { v0.b }[2], [x10]\n"
+      "b 14f\n"
+      "13:"  // Height 1: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b0, [x10, #0x0]\n"
+      "14:"  // Height 1: Multiply loop: Ragged operand read: Done
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      "15:"  // Height 1: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 5b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "ldr q0, [x16, #0x0]\n"
+      "add v8.4s, v8.4s, v0.4s\n"
+      "ldr q1, [x16, #0x10]\n"
+      "ldr q2, [x16, #0x20]\n"
+      "add v9.4s, v9.4s, v1.4s\n"
+      "ldr q3, [x16, #0x30]\n"
+      "add v10.4s, v10.4s, v2.4s\n"
+      "add x16, x16, #0x40\n"
+      "add v11.4s, v11.4s, v3.4s\n"
+      "tbz %x[flags], #4, 16f\n"
+      "ldr q0, [x17, #0x0]\n"
+      "ldr q4, [x8, #0x0]\n"
+      "ldr q1, [x17, #0x10]\n"
+      "ldr q5, [x8, #0x10]\n"
+      "ldr q2, [x17, #0x20]\n"
+      "ldr q6, [x8, #0x20]\n"
+      "ldr q3, [x17, #0x30]\n"
+      "ldr q7, [x8, #0x30]\n"
+      "add x17, x17, #0x40\n"
+      "add x8, x8, #0x40\n"
+      "b 17f\n"
+      "16:"  // Height 1: per layer parameters
+      "add x19, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x19]\n"
+      "mov v1.16b, v0.16b\n"
+      "add x19, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x19]\n"
+      "mov v2.16b, v0.16b\n"
+      "mov v3.16b, v0.16b\n"
+      "mov v5.16b, v4.16b\n"
+      "mov v6.16b, v4.16b\n"
+      "mov v7.16b, v4.16b\n"
+      "17:"  // Height 1: parameters loaded
+      "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+      "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+      "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+      "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+      "tbz %x[flags], #5, 18f\n"
+      "and v4.16b, v8.16b, v0.16b\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "and v5.16b, v9.16b, v1.16b\n"
+      "and v6.16b, v10.16b, v2.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "and v7.16b, v11.16b, v3.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v8.4s, v8.4s, v4.4s\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sqadd v9.4s, v9.4s, v5.4s\n"
+      "sqadd v10.4s, v10.4s, v6.4s\n"
+      "sqadd v11.4s, v11.4s, v7.4s\n"
+      "18:"  // Height 1: no shift correction
+      "srshl v8.4s, v8.4s, v0.4s\n"
+      "add x19, %x[qp], %[c_offset]\n"
+      "ld1r { v4.4s }, [x19]\n"
+      "srshl v9.4s, v9.4s, v1.4s\n"
+      "add x19, %x[qp], %[minval]\n"
+      "srshl v10.4s, v10.4s, v2.4s\n"
+      "ld1r { v5.4s }, [x19]\n"
+      "add x19, %x[qp], %[maxval]\n"
+      "srshl v11.4s, v11.4s, v3.4s\n"
+      "ld1r { v6.4s }, [x19]\n"
+      "cmp x15, #0x10\n"
+      "add v8.4s, v8.4s, v4.4s\n"
+      "add v9.4s, v9.4s, v4.4s\n"
+      "add v10.4s, v10.4s, v4.4s\n"
+      "add v11.4s, v11.4s, v4.4s\n"
+      "smin v8.4s, v8.4s, v6.4s\n"
+      "smin v9.4s, v9.4s, v6.4s\n"
+      "smin v10.4s, v10.4s, v6.4s\n"
+      "smax v8.4s, v8.4s, v5.4s\n"
+      "smax v9.4s, v9.4s, v5.4s\n"
+      "smax v10.4s, v10.4s, v5.4s\n"
+      "smin v11.4s, v11.4s, v6.4s\n"
+      "uzp1 v8.8h, v8.8h, v9.8h\n"
+      "smax v11.4s, v11.4s, v5.4s\n"
+      "uzp1 v9.8h, v10.8h, v11.8h\n"
+      "uzp1 v8.16b, v8.16b, v9.16b\n"
+      "bge 27f\n"
+      "tbz x15, #3, 22f\n"
+      "str d8, [x13], #0x8\n"
+      "tbz x15, #2, 20f\n"
+      "st1 { v8.s }[2], [x13], #0x4\n"
+      "tbz x15, #1, 19f\n"
+      "st1 { v8.h }[6], [x13], #0x2\n"
+      "tbz x15, #0, 26f\n"
+      "st1 { v8.b }[14], [x13]\n"
+      "b 26f\n"
+      "19:"  // Height 1: Partial direct writeback: partial_1_12
+      "tbz x15, #0, 26f\n"
+      "st1 { v8.b }[12], [x13]\n"
+      "b 26f\n"
+      "20:"  // Height 1: Partial direct writeback: partial_2_8
+      "tbz x15, #1, 21f\n"
+      "st1 { v8.h }[4], [x13], #0x2\n"
+      "tbz x15, #0, 26f\n"
+      "st1 { v8.b }[10], [x13]\n"
+      "b 26f\n"
+      "21:"  // Height 1: Partial direct writeback: partial_1_8
+      "tbz x15, #0, 26f\n"
+      "st1 { v8.b }[8], [x13]\n"
+      "b 26f\n"
+      "22:"  // Height 1: Partial direct writeback: partial_4_0
+      "tbz x15, #2, 24f\n"
+      "str s8, [x13], #0x4\n"
+      "tbz x15, #1, 23f\n"
+      "st1 { v8.h }[2], [x13], #0x2\n"
+      "tbz x15, #0, 26f\n"
+      "st1 { v8.b }[6], [x13]\n"
+      "b 26f\n"
+      "23:"  // Height 1: Partial direct writeback: partial_1_4
+      "tbz x15, #0, 26f\n"
+      "st1 { v8.b }[4], [x13]\n"
+      "b 26f\n"
+      "24:"  // Height 1: Partial direct writeback: partial_2_0
+      "tbz x15, #1, 25f\n"
+      "str h8, [x13], #0x2\n"
+      "tbz x15, #0, 26f\n"
+      "st1 { v8.b }[2], [x13]\n"
+      "b 26f\n"
+      "25:"  // Height 1: Partial direct writeback: partial_1_0
+      "str b8, [x13, #0x0]\n"
+      "26:"  // Height 1: Partial direct writeback: Done
+      "b 28f\n"
+      "27:"  // Height 1: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "add x13, x13, #0x10\n"
+      "28:"  // Height 1: Writeback done
+      "subs x15, x15, #0x10\n"
+      "bgt 3b\n"
+      "b 170f\n"
+      "29:"  // Height 2
+      "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+      "mov x16, %x[col_bias]\n"
+      "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 30f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "add x9, x9, x19\n"
+      "b 31f\n"
+      "30:"  // Height 2: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19\n"
+      "31:"  // Height 2: Column loop
+      "movi v8.4s, #0x0\n"
+      "movi v9.4s, #0x0\n"
+      "movi v10.4s, #0x0\n"
+      "movi v11.4s, #0x0\n"
+      "movi v12.4s, #0x0\n"
+      "movi v13.4s, #0x0\n"
+      "movi v14.4s, #0x0\n"
+      "movi v15.4s, #0x0\n"
+      "32:"  // Height 2: setup done
+      "mov x12, #0x0\n"
+      "33:"  // Height 2: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 34f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "cbnz x12, 35f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "b 35f\n"
+      "34:"  // Height 2: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "35:"  // Height 2: input setup done
+      "cmp x11, #0x10\n"
+      "blt 38f\n"
+      "cmp x11, #0x20\n"
+      "blt 37f\n"
+      "36:"  // Height 2: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "sub x11, x11, #0x10\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      "cmp x11, #0x20\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      "bge 36b\n"
+      "37:"  // Height 2: Multiply loop: Single iteration only
+      "sub x11, x11, #0x10\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      "38:"  // Height 2: Multiply loop: Main loop skip
+      "cbz x11, 43f\n"
+      "cmp x11, #0x4\n"
+      "blt 40f\n"
+      "39:"  // Height 2: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      "sub x11, x11, #0x4\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "cmp x11, #0x4\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      "bge 39b\n"
+      "cbz x11, 43f\n"
+      "40:"  // Height 2: Multiply loop: Skip odd blocks
+      "tbz x11, #1, 41f\n"
+      "ldr h0, [x10], #0x2\n"
+      "ldr h1, [x28], #0x2\n"
+      "tbz x11, #0, 42f\n"
+      "ld1 { v0.b }[2], [x10]\n"
+      "ld1 { v1.b }[2], [x28]\n"
+      "b 42f\n"
+      "41:"  // Height 2: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b0, [x10, #0x0]\n"
+      "ldr b1, [x28, #0x0]\n"
+      "42:"  // Height 2: Multiply loop: Ragged operand read: Done
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      "43:"  // Height 2: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 33b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "ldr q0, [x16, #0x0]\n"
+      "add v8.4s, v8.4s, v0.4s\n"
+      "ldr q1, [x16, #0x10]\n"
+      "add v12.4s, v12.4s, v0.4s\n"
+      "ldr q2, [x16, #0x20]\n"
+      "ldr q3, [x16, #0x30]\n"
+      "add v9.4s, v9.4s, v1.4s\n"
+      "add x16, x16, #0x40\n"
+      "add v13.4s, v13.4s, v1.4s\n"
+      "add v10.4s, v10.4s, v2.4s\n"
+      "add v11.4s, v11.4s, v3.4s\n"
+      "add v14.4s, v14.4s, v2.4s\n"
+      "add v15.4s, v15.4s, v3.4s\n"
+      "tbz %x[flags], #4, 44f\n"
+      "ldr q0, [x17, #0x0]\n"
+      "ldr q4, [x8, #0x0]\n"
+      "ldr q1, [x17, #0x10]\n"
+      "ldr q5, [x8, #0x10]\n"
+      "ldr q2, [x17, #0x20]\n"
+      "ldr q6, [x8, #0x20]\n"
+      "ldr q3, [x17, #0x30]\n"
+      "ldr q7, [x8, #0x30]\n"
+      "add x17, x17, #0x40\n"
+      "add x8, x8, #0x40\n"
+      "b 45f\n"
+      "44:"  // Height 2: per layer parameters
+      "add x19, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x19]\n"
+      "mov v1.16b, v0.16b\n"
+      "add x19, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x19]\n"
+      "mov v2.16b, v0.16b\n"
+      "mov v3.16b, v0.16b\n"
+      "mov v5.16b, v4.16b\n"
+      "mov v6.16b, v4.16b\n"
+      "mov v7.16b, v4.16b\n"
+      "45:"  // Height 2: parameters loaded
+      "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+      "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+      "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+      "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+      "sqrdmulh v12.4s, v12.4s, v4.4s\n"
+      "sqrdmulh v13.4s, v13.4s, v5.4s\n"
+      "sqrdmulh v14.4s, v14.4s, v6.4s\n"
+      "sqrdmulh v15.4s, v15.4s, v7.4s\n"
+      "tbz %x[flags], #5, 46f\n"
+      "and v4.16b, v8.16b, v0.16b\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "and v5.16b, v9.16b, v1.16b\n"
+      "and v6.16b, v10.16b, v2.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "and v7.16b, v11.16b, v3.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v8.4s, v8.4s, v4.4s\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "and v4.16b, v12.16b, v0.16b\n"
+      "sqadd v9.4s, v9.4s, v5.4s\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sqadd v10.4s, v10.4s, v6.4s\n"
+      "and v5.16b, v13.16b, v1.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sqadd v11.4s, v11.4s, v7.4s\n"
+      "and v6.16b, v14.16b, v2.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v12.4s, v12.4s, v4.4s\n"
+      "and v7.16b, v15.16b, v3.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sqadd v13.4s, v13.4s, v5.4s\n"
+      "sqadd v14.4s, v14.4s, v6.4s\n"
+      "sqadd v15.4s, v15.4s, v7.4s\n"
+      "46:"  // Height 2: no shift correction
+      "srshl v8.4s, v8.4s, v0.4s\n"
+      "add x19, %x[qp], %[c_offset]\n"
+      "ld1r { v4.4s }, [x19]\n"
+      "srshl v9.4s, v9.4s, v1.4s\n"
+      "add x19, %x[qp], %[minval]\n"
+      "srshl v10.4s, v10.4s, v2.4s\n"
+      "ld1r { v5.4s }, [x19]\n"
+      "add x19, %x[qp], %[maxval]\n"
+      "srshl v11.4s, v11.4s, v3.4s\n"
+      "ld1r { v6.4s }, [x19]\n"
+      "cmp x15, #0x10\n"
+      "srshl v12.4s, v12.4s, v0.4s\n"
+      "srshl v13.4s, v13.4s, v1.4s\n"
+      "srshl v14.4s, v14.4s, v2.4s\n"
+      "srshl v15.4s, v15.4s, v3.4s\n"
+      "add v8.4s, v8.4s, v4.4s\n"
+      "add v9.4s, v9.4s, v4.4s\n"
+      "add v10.4s, v10.4s, v4.4s\n"
+      "smin v8.4s, v8.4s, v6.4s\n"
+      "smin v9.4s, v9.4s, v6.4s\n"
+      "smin v10.4s, v10.4s, v6.4s\n"
+      "smax v8.4s, v8.4s, v5.4s\n"
+      "smax v9.4s, v9.4s, v5.4s\n"
+      "smax v10.4s, v10.4s, v5.4s\n"
+      "add v11.4s, v11.4s, v4.4s\n"
+      "add v12.4s, v12.4s, v4.4s\n"
+      "add v13.4s, v13.4s, v4.4s\n"
+      "smin v11.4s, v11.4s, v6.4s\n"
+      "smin v12.4s, v12.4s, v6.4s\n"
+      "smin v13.4s, v13.4s, v6.4s\n"
+      "smax v11.4s, v11.4s, v5.4s\n"
+      "smax v12.4s, v12.4s, v5.4s\n"
+      "smax v13.4s, v13.4s, v5.4s\n"
+      "add v14.4s, v14.4s, v4.4s\n"
+      "add v15.4s, v15.4s, v4.4s\n"
+      "uzp1 v8.8h, v8.8h, v9.8h\n"
+      "smin v14.4s, v14.4s, v6.4s\n"
+      "smin v15.4s, v15.4s, v6.4s\n"
+      "uzp1 v9.8h, v10.8h, v11.8h\n"
+      "smax v14.4s, v14.4s, v5.4s\n"
+      "smax v15.4s, v15.4s, v5.4s\n"
+      "uzp1 v12.8h, v12.8h, v13.8h\n"
+      "uzp1 v8.16b, v8.16b, v9.16b\n"
+      "uzp1 v13.8h, v14.8h, v15.8h\n"
+      "uzp1 v12.16b, v12.16b, v13.16b\n"
+      "bge 55f\n"
+      "tbz x15, #3, 50f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "tbz x15, #2, 48f\n"
+      "st1 { v8.s }[2], [x13], #0x4\n"
+      "st1 { v12.s }[2], [x9], #0x4\n"
+      "tbz x15, #1, 47f\n"
+      "st1 { v8.h }[6], [x13], #0x2\n"
+      "st1 { v12.h }[6], [x9], #0x2\n"
+      "tbz x15, #0, 54f\n"
+      "st1 { v8.b }[14], [x13]\n"
+      "st1 { v12.b }[14], [x9]\n"
+      "b 54f\n"
+      "47:"  // Height 2: Partial direct writeback: partial_1_12
+      "tbz x15, #0, 54f\n"
+      "st1 { v8.b }[12], [x13]\n"
+      "st1 { v12.b }[12], [x9]\n"
+      "b 54f\n"
+      "48:"  // Height 2: Partial direct writeback: partial_2_8
+      "tbz x15, #1, 49f\n"
+      "st1 { v8.h }[4], [x13], #0x2\n"
+      "st1 { v12.h }[4], [x9], #0x2\n"
+      "tbz x15, #0, 54f\n"
+      "st1 { v8.b }[10], [x13]\n"
+      "st1 { v12.b }[10], [x9]\n"
+      "b 54f\n"
+      "49:"  // Height 2: Partial direct writeback: partial_1_8
+      "tbz x15, #0, 54f\n"
+      "st1 { v8.b }[8], [x13]\n"
+      "st1 { v12.b }[8], [x9]\n"
+      "b 54f\n"
+      "50:"  // Height 2: Partial direct writeback: partial_4_0
+      "tbz x15, #2, 52f\n"
+      "str s8, [x13], #0x4\n"
+      "str s12, [x9], #0x4\n"
+      "tbz x15, #1, 51f\n"
+      "st1 { v8.h }[2], [x13], #0x2\n"
+      "st1 { v12.h }[2], [x9], #0x2\n"
+      "tbz x15, #0, 54f\n"
+      "st1 { v8.b }[6], [x13]\n"
+      "st1 { v12.b }[6], [x9]\n"
+      "b 54f\n"
+      "51:"  // Height 2: Partial direct writeback: partial_1_4
+      "tbz x15, #0, 54f\n"
+      "st1 { v8.b }[4], [x13]\n"
+      "st1 { v12.b }[4], [x9]\n"
+      "b 54f\n"
+      "52:"  // Height 2: Partial direct writeback: partial_2_0
+      "tbz x15, #1, 53f\n"
+      "str h8, [x13], #0x2\n"
+      "str h12, [x9], #0x2\n"
+      "tbz x15, #0, 54f\n"
+      "st1 { v8.b }[2], [x13]\n"
+      "st1 { v12.b }[2], [x9]\n"
+      "b 54f\n"
+      "53:"  // Height 2: Partial direct writeback: partial_1_0
+      "str b8, [x13, #0x0]\n"
+      "str b12, [x9, #0x0]\n"
+      "54:"  // Height 2: Partial direct writeback: Done
+      "b 56f\n"
+      "55:"  // Height 2: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q12, [x9, #0x0]\n"
+      "add x13, x13, #0x10\n"
+      "add x9, x9, #0x10\n"
+      "56:"  // Height 2: Writeback done
+      "subs x15, x15, #0x10\n"
+      "bgt 31b\n"
+      "b 170f\n"
+      "57:"  // Height 3: entry; three rows are processed together (outputs via x13, x9, x27)
+      "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"  // x8 = multiplier_ptr field of the args struct
+      "mov x16, %x[col_bias]\n"  // x16 = read cursor into col_bias
+      "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"  // x17 = shift_ptr field
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"  // x15 = N; reduced by 16 after every column-loop pass
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"  // x14 = B_ptr; advanced as B data is consumed
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"  // x19 = output_offset
+      "tbz %x[flags], #2, 58f\n"  // flags bit 2 clear -> direct output setup
+      "ldr x13, [%x[output_ptr], #0x0]\n"  // bit 2 set: output_ptr is read as an array of per-row pointers
+      "add x13, x13, x19\n"  // each row pointer is then offset by output_offset
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19\n"
+      "add x27, x27, x19\n"
+      "b 59f\n"
+      "58:"  // Height 3: setup direct output
+      "mov x13, %x[output_ptr]\n"  // row 0 starts at output_ptr itself
+      "add x9, x13, x19\n"  // here output_offset is used as the byte distance between consecutive rows
+      "add x27, x9, x19\n"
+      "59:"  // Height 3: Column loop: each pass produces up to 16 output columns for all three rows
+      "movi v8.4s, #0x0\n"  // v8-v11: accumulators for row 0 (4 registers x 4 32-bit lanes = 16 columns)
+      "movi v9.4s, #0x0\n"
+      "movi v10.4s, #0x0\n"
+      "movi v11.4s, #0x0\n"
+      "movi v12.4s, #0x0\n"  // v12-v15: accumulators for row 1
+      "movi v13.4s, #0x0\n"
+      "movi v14.4s, #0x0\n"
+      "movi v15.4s, #0x0\n"
+      "movi v16.4s, #0x0\n"  // v16-v19: accumulators for row 2
+      "movi v17.4s, #0x0\n"
+      "movi v18.4s, #0x0\n"
+      "movi v19.4s, #0x0\n"
+      "60:"  // Height 3: setup done
+      "mov x12, #0x0\n"  // x12 = index of the current input string, starting at 0
+      "61:"  // Height 3: String loop: one pass per input string, indexed by x12
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"  // x11 = string_lengths[x12] (32-bit entries); counts down as input bytes are consumed
+      "tbz %x[flags], #3, 62f\n"  // flags bit 3 clear -> direct input setup
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"  // bit 3 set: x20 = input_ptr[x12], the base of a table of row pointers
+      "add x20, x20, x19, LSL #3\n"  // step over input_offset 8-byte entries of that table
+      "ldr x10, [x20, #0x0]\n"  // x10, x28, x26 = input pointers for rows 0, 1, 2
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "cbnz x12, 63f\n"  // only the first string (x12 == 0) receives the initial column offset
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "add x26, x26, x19\n"
+      "b 63f\n"
+      "62:"  // Height 3: setup direct input
+      "mov x10, %x[input_ptr]\n"  // row 0 starts at input_ptr itself
+      "add x28, x10, x19\n"  // here input_offset is used as the byte distance between consecutive rows
+      "add x26, x28, x19\n"
+      "63:"  // Height 3: input setup done
+      "cmp x11, #0x10\n"
+      "blt 66f\n"  // fewer than 16 bytes left: skip the 16-byte code paths
+      "cmp x11, #0x20\n"
+      "blt 65f\n"  // 16..31 bytes left: run only the single-iteration copy
+      "64:"  // Height 3: Multiply loop: Main loop head (entered only with x11 >= 32)
+      "ldr q0, [x10, #0x0]\n"  // v0, v1, v2 = 16 input bytes from rows 0, 1, 2
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"  // B data is streamed through v6 and v7, 16 bytes per load
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"  // prefetch 128 bytes beyond the advanced row pointer
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "add x28, x28, #0x10\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "sub x11, x11, #0x10\n"  // 16 input bytes per row are consumed per pass
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      "cmp x11, #0x20\n"  // sets the flags tested by the bge at the bottom; nothing in between changes them
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"  // B offsets 0x40-0x70 are paired with input element [1]
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"  // B offsets 0x80-0xb0 are paired with input element [2]
+      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"  // B offsets 0xc0-0xf0 are paired with input element [3]
+      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      "add x14, x14, #0x100\n"  // 256 bytes of B are consumed per pass
+      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
+      "bge 64b\n"  // repeat while x11 >= 32; on exit 16..31 remain and execution falls into the copy at label 65
+      "65:"  // Height 3: Multiply loop: Single iteration only (same multiply body as the main loop, without the back-branch)
+      "sub x11, x11, #0x10\n"  // this copy consumes 16 input bytes per row; x11 keeps the 0..15 leftover
+      "ldr q0, [x10, #0x0]\n"  // v0, v1, v2 = 16 input bytes from rows 0, 1, 2
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "add x28, x28, #0x10\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      "add x14, x14, #0x100\n"  // 256 bytes of B consumed
+      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
+      "66:"  // Height 3: Multiply loop: Main loop skip (x11 is now below 16)
+      "cbz x11, 71f\n"  // nothing left in this string
+      "cmp x11, #0x4\n"
+      "blt 68f\n"  // 1..3 bytes left: go straight to the ragged read
+      "67:"  // Height 3: Multiply loop: Odd block loop (4 input bytes per row per pass)
+      "ldr s0, [x10], #0x4\n"  // post-indexed loads advance the row pointers by 4
+      "ldr s1, [x28], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "sub x11, x11, #0x4\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      "cmp x11, #0x4\n"  // sets the flags tested by the bge at the bottom of this loop
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"  // 64 bytes of B are consumed per 4-byte input block
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      "bge 67b\n"  // repeat while at least 4 bytes remain
+      "cbz x11, 71f\n"  // length was a multiple of 4: no ragged tail
+      "68:"  // Height 3: Multiply loop: Skip odd blocks (x11 is 1, 2 or 3 here)
+      "tbz x11, #1, 69f\n"  // bit 1 clear -> exactly one byte left
+      "ldr h0, [x10], #0x2\n"  // two bytes per row; a scalar load zeroes the rest of the vector register
+      "ldr h1, [x28], #0x2\n"
+      "ldr h2, [x26], #0x2\n"
+      "tbz x11, #0, 70f\n"  // bit 0 clear -> exactly two bytes left
+      "ld1 { v0.b }[2], [x10]\n"  // third byte is inserted into byte lane 2
+      "ld1 { v1.b }[2], [x28]\n"
+      "ld1 { v2.b }[2], [x26]\n"
+      "b 70f\n"
+      "69:"  // Height 3: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b0, [x10, #0x0]\n"  // single byte per row; the remaining bytes of the register are zero
+      "ldr b1, [x28, #0x0]\n"
+      "ldr b2, [x26, #0x0]\n"
+      "70:"  // Height 3: Multiply loop: Ragged operand read: Done
+      "ldr q6, [x14, #0x0]\n"  // a full 64-byte block of B is still read for the partial input group
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      "71:"  // Height 3: Multiply loop: No odd multiplies (end of one input string)
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 61b\n"  // repeat the string loop until x12 reaches num_strings
+      "prfm pstl1keep, [x13, #0x0]\n"  // prefetch-for-store on the three output rows
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "ldr q0, [x16, #0x0]\n"  // v0-v3 = 16 col_bias values; the same values are added to every row
+      "add v8.4s, v8.4s, v0.4s\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "ldr q1, [x16, #0x10]\n"
+      "add v12.4s, v12.4s, v0.4s\n"
+      "ldr q2, [x16, #0x20]\n"
+      "add v16.4s, v16.4s, v0.4s\n"
+      "ldr q3, [x16, #0x30]\n"
+      "add x16, x16, #0x40\n"  // advance the bias cursor by 16 x 4 bytes for the next column block
+      "add v9.4s, v9.4s, v1.4s\n"
+      "add v13.4s, v13.4s, v1.4s\n"
+      "add v10.4s, v10.4s, v2.4s\n"
+      "add v11.4s, v11.4s, v3.4s\n"
+      "add v14.4s, v14.4s, v2.4s\n"
+      "add v15.4s, v15.4s, v3.4s\n"
+      "add v17.4s, v17.4s, v1.4s\n"
+      "add v18.4s, v18.4s, v2.4s\n"
+      "add v19.4s, v19.4s, v3.4s\n"
+      "tbz %x[flags], #4, 72f\n"  // flags bit 4 clear -> per layer parameters
+      "ldr q0, [x17, #0x0]\n"  // bit 4 set: v0-v3 = 16 values read via shift_ptr, v4-v7 = 16 values read via multiplier_ptr
+      "ldr q4, [x8, #0x0]\n"
+      "ldr q1, [x17, #0x10]\n"
+      "ldr q5, [x8, #0x10]\n"
+      "ldr q2, [x17, #0x20]\n"
+      "ldr q6, [x8, #0x20]\n"
+      "ldr q3, [x17, #0x30]\n"
+      "ldr q7, [x8, #0x30]\n"
+      "add x17, x17, #0x40\n"  // both cursors move on by 16 x 4 bytes
+      "add x8, x8, #0x40\n"
+      "b 73f\n"
+      "72:"  // Height 3: per layer parameters
+      "add x19, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x19]\n"  // one shift value from qp, replicated into v0-v3
+      "mov v1.16b, v0.16b\n"
+      "add x19, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x19]\n"  // one multiplier from qp, replicated into v4-v7
+      "mov v2.16b, v0.16b\n"
+      "mov v3.16b, v0.16b\n"
+      "mov v5.16b, v4.16b\n"
+      "mov v6.16b, v4.16b\n"
+      "mov v7.16b, v4.16b\n"
+      "73:"  // Height 3: parameters loaded (v0-v3 = shifts, v4-v7 = multipliers)
+      "sqrdmulh v8.4s, v8.4s, v4.4s\n"  // scale every accumulator by the multiplier for its column group
+      "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+      "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+      "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+      "sqrdmulh v12.4s, v12.4s, v4.4s\n"
+      "sqrdmulh v13.4s, v13.4s, v5.4s\n"
+      "sqrdmulh v14.4s, v14.4s, v6.4s\n"
+      "sqrdmulh v15.4s, v15.4s, v7.4s\n"
+      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v5.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v7.4s\n"
+      "tbz %x[flags], #5, 74f\n"  // flags bit 5 clear -> no shift correction
+      "and v4.16b, v8.16b, v0.16b\n"  // the sign bit survives only where accumulator and shift are both negative
+      "sshr v4.4s, v4.4s, #0x1f\n"  // -> -1 in those lanes, 0 elsewhere (v4-v7 are reused as scratch from here on)
+      "and v5.16b, v9.16b, v1.16b\n"
+      "and v6.16b, v10.16b, v2.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "and v7.16b, v11.16b, v3.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v8.4s, v8.4s, v4.4s\n"  // saturating add of the -1/0 correction ahead of the rounding shift
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "and v4.16b, v12.16b, v0.16b\n"
+      "sqadd v9.4s, v9.4s, v5.4s\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sqadd v10.4s, v10.4s, v6.4s\n"
+      "and v5.16b, v13.16b, v1.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sqadd v11.4s, v11.4s, v7.4s\n"
+      "and v6.16b, v14.16b, v2.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v12.4s, v12.4s, v4.4s\n"
+      "and v7.16b, v15.16b, v3.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sqadd v13.4s, v13.4s, v5.4s\n"
+      "and v4.16b, v16.16b, v0.16b\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sqadd v14.4s, v14.4s, v6.4s\n"
+      "and v5.16b, v17.16b, v1.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sqadd v15.4s, v15.4s, v7.4s\n"
+      "and v6.16b, v18.16b, v2.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v4.4s\n"
+      "and v7.16b, v19.16b, v3.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sqadd v17.4s, v17.4s, v5.4s\n"
+      "sqadd v18.4s, v18.4s, v6.4s\n"
+      "sqadd v19.4s, v19.4s, v7.4s\n"
+      "74:"  // Height 3: no shift correction
+      "srshl v8.4s, v8.4s, v0.4s\n"  // rounding shift by the signed per-lane amounts held in v0-v3
+      "add x19, %x[qp], %[c_offset]\n"
+      "ld1r { v4.4s }, [x19]\n"  // v4 = c_offset replicated to all lanes
+      "srshl v9.4s, v9.4s, v1.4s\n"
+      "add x19, %x[qp], %[minval]\n"
+      "srshl v10.4s, v10.4s, v2.4s\n"
+      "ld1r { v5.4s }, [x19]\n"  // v5 = minval replicated to all lanes
+      "add x19, %x[qp], %[maxval]\n"
+      "srshl v11.4s, v11.4s, v3.4s\n"
+      "ld1r { v6.4s }, [x19]\n"  // v6 = maxval replicated to all lanes
+      "cmp x15, #0x10\n"  // flags are consumed by the bge that follows the narrowing; no instruction in between alters them
+      "srshl v12.4s, v12.4s, v0.4s\n"
+      "srshl v13.4s, v13.4s, v1.4s\n"
+      "srshl v14.4s, v14.4s, v2.4s\n"
+      "srshl v15.4s, v15.4s, v3.4s\n"
+      "add v8.4s, v8.4s, v4.4s\n"  // add c_offset, then clamp: smin against maxval, smax against minval
+      "add v9.4s, v9.4s, v4.4s\n"
+      "add v10.4s, v10.4s, v4.4s\n"
+      "smin v8.4s, v8.4s, v6.4s\n"
+      "smin v9.4s, v9.4s, v6.4s\n"
+      "smin v10.4s, v10.4s, v6.4s\n"
+      "smax v8.4s, v8.4s, v5.4s\n"
+      "smax v9.4s, v9.4s, v5.4s\n"
+      "smax v10.4s, v10.4s, v5.4s\n"
+      "add v11.4s, v11.4s, v4.4s\n"
+      "add v12.4s, v12.4s, v4.4s\n"
+      "add v13.4s, v13.4s, v4.4s\n"
+      "smin v11.4s, v11.4s, v6.4s\n"
+      "smin v12.4s, v12.4s, v6.4s\n"
+      "smin v13.4s, v13.4s, v6.4s\n"
+      "smax v11.4s, v11.4s, v5.4s\n"
+      "smax v12.4s, v12.4s, v5.4s\n"
+      "smax v13.4s, v13.4s, v5.4s\n"
+      "add v14.4s, v14.4s, v4.4s\n"
+      "add v15.4s, v15.4s, v4.4s\n"
+      "srshl v16.4s, v16.4s, v0.4s\n"
+      "smin v14.4s, v14.4s, v6.4s\n"
+      "smin v15.4s, v15.4s, v6.4s\n"
+      "srshl v17.4s, v17.4s, v1.4s\n"
+      "smax v14.4s, v14.4s, v5.4s\n"
+      "smax v15.4s, v15.4s, v5.4s\n"
+      "add v16.4s, v16.4s, v4.4s\n"
+      "add v17.4s, v17.4s, v4.4s\n"
+      "srshl v18.4s, v18.4s, v2.4s\n"
+      "smin v16.4s, v16.4s, v6.4s\n"
+      "smin v17.4s, v17.4s, v6.4s\n"
+      "srshl v19.4s, v19.4s, v3.4s\n"
+      "smax v16.4s, v16.4s, v5.4s\n"
+      "smax v17.4s, v17.4s, v5.4s\n"
+      "add v18.4s, v18.4s, v4.4s\n"
+      "add v19.4s, v19.4s, v4.4s\n"
+      "uzp1 v8.8h, v8.8h, v9.8h\n"  // narrow 32 -> 16 bits by keeping the even-numbered halfwords of each pair of registers
+      "smin v18.4s, v18.4s, v6.4s\n"
+      "smin v19.4s, v19.4s, v6.4s\n"
+      "uzp1 v9.8h, v10.8h, v11.8h\n"
+      "smax v18.4s, v18.4s, v5.4s\n"
+      "smax v19.4s, v19.4s, v5.4s\n"
+      "uzp1 v12.8h, v12.8h, v13.8h\n"
+      "uzp1 v13.8h, v14.8h, v15.8h\n"
+      "uzp1 v16.8h, v16.8h, v17.8h\n"
+      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v8.16b, v8.16b, v9.16b\n"  // narrow 16 -> 8 bits: v8, v12, v16 each end up holding 16 output bytes
+      "uzp1 v12.16b, v12.16b, v13.16b\n"
+      "uzp1 v16.16b, v16.16b, v17.16b\n"
+      "bge 83f\n"  // x15 >= 16 (from the earlier cmp x15, #0x10): store full 16-byte rows
+      "tbz x15, #3, 78f\n"  // partial store: walk the bits of x15 from 8 down to 1, storing matching-size pieces
+      "str d8, [x13], #0x8\n"  // bit 3 set: first 8 bytes of each row
+      "str d12, [x9], #0x8\n"
+      "str d16, [x27], #0x8\n"
+      "tbz x15, #2, 76f\n"
+      "st1 { v8.s }[2], [x13], #0x4\n"  // bit 2 set: bytes 8..11
+      "st1 { v12.s }[2], [x9], #0x4\n"
+      "st1 { v16.s }[2], [x27], #0x4\n"
+      "tbz x15, #1, 75f\n"
+      "st1 { v8.h }[6], [x13], #0x2\n"  // bit 1 set: bytes 12..13
+      "st1 { v12.h }[6], [x9], #0x2\n"
+      "st1 { v16.h }[6], [x27], #0x2\n"
+      "tbz x15, #0, 82f\n"
+      "st1 { v8.b }[14], [x13]\n"  // bit 0 set: byte 14
+      "st1 { v12.b }[14], [x9]\n"
+      "st1 { v16.b }[14], [x27]\n"
+      "b 82f\n"
+      "75:"  // Height 3: Partial direct writeback: partial_1_12
+      "tbz x15, #0, 82f\n"
+      "st1 { v8.b }[12], [x13]\n"
+      "st1 { v12.b }[12], [x9]\n"
+      "st1 { v16.b }[12], [x27]\n"
+      "b 82f\n"
+      "76:"  // Height 3: Partial direct writeback: partial_2_8
+      "tbz x15, #1, 77f\n"
+      "st1 { v8.h }[4], [x13], #0x2\n"
+      "st1 { v12.h }[4], [x9], #0x2\n"
+      "st1 { v16.h }[4], [x27], #0x2\n"
+      "tbz x15, #0, 82f\n"
+      "st1 { v8.b }[10], [x13]\n"
+      "st1 { v12.b }[10], [x9]\n"
+      "st1 { v16.b }[10], [x27]\n"
+      "b 82f\n"
+      "77:"  // Height 3: Partial direct writeback: partial_1_8
+      "tbz x15, #0, 82f\n"
+      "st1 { v8.b }[8], [x13]\n"
+      "st1 { v12.b }[8], [x9]\n"
+      "st1 { v16.b }[8], [x27]\n"
+      "b 82f\n"
+      "78:"  // Height 3: Partial direct writeback: partial_4_0
+      "tbz x15, #2, 80f\n"
+      "str s8, [x13], #0x4\n"
+      "str s12, [x9], #0x4\n"
+      "str s16, [x27], #0x4\n"
+      "tbz x15, #1, 79f\n"
+      "st1 { v8.h }[2], [x13], #0x2\n"
+      "st1 { v12.h }[2], [x9], #0x2\n"
+      "st1 { v16.h }[2], [x27], #0x2\n"
+      "tbz x15, #0, 82f\n"
+      "st1 { v8.b }[6], [x13]\n"
+      "st1 { v12.b }[6], [x9]\n"
+      "st1 { v16.b }[6], [x27]\n"
+      "b 82f\n"
+      "79:"  // Height 3: Partial direct writeback: partial_1_4
+      "tbz x15, #0, 82f\n"
+      "st1 { v8.b }[4], [x13]\n"
+      "st1 { v12.b }[4], [x9]\n"
+      "st1 { v16.b }[4], [x27]\n"
+      "b 82f\n"
+      "80:"  // Height 3: Partial direct writeback: partial_2_0
+      "tbz x15, #1, 81f\n"
+      "str h8, [x13], #0x2\n"
+      "str h12, [x9], #0x2\n"
+      "str h16, [x27], #0x2\n"
+      "tbz x15, #0, 82f\n"
+      "st1 { v8.b }[2], [x13]\n"
+      "st1 { v12.b }[2], [x9]\n"
+      "st1 { v16.b }[2], [x27]\n"
+      "b 82f\n"
+      "81:"  // Height 3: Partial direct writeback: partial_1_0
+      "str b8, [x13, #0x0]\n"  // reached with bits 3, 2 and 1 of x15 all clear; bit 0 is not re-tested on this path
+      "str b12, [x9, #0x0]\n"
+      "str b16, [x27, #0x0]\n"
+      "82:"  // Height 3: Partial direct writeback: Done
+      "b 84f\n"
+      "83:"  // Height 3: Full writeback
+      "str q8, [x13, #0x0]\n"  // 16 output bytes per row
+      "str q12, [x9, #0x0]\n"
+      "str q16, [x27, #0x0]\n"
+      "add x13, x13, #0x10\n"  // advance the output row pointers to the next column block
+      "add x9, x9, #0x10\n"
+      "add x27, x27, #0x10\n"
+      "84:"  // Height 3: Writeback done
+      "subs x15, x15, #0x10\n"  // 16 columns accounted for
+      "bgt 59b\n"  // columns remain: back to the column loop head
+      "b 170f\n"  // label 170 is outside this excerpt; presumably the common exit (confirm)
+      "85:"  // Height 4
+      "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+      "mov x16, %x[col_bias]\n"
+      "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 86f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "add x27, x27, x19\n"
+      "add x25, x25, x19\n"
+      "b 87f\n"
+      "86:"  // Height 4: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19\n"
+      "add x27, x9, x19\n"
+      "add x25, x27, x19\n"
+      "87:"  // Height 4: Column loop
+      "movi v8.4s, #0x0\n"
+      "movi v9.4s, #0x0\n"
+      "movi v10.4s, #0x0\n"
+      "movi v11.4s, #0x0\n"
+      "movi v12.4s, #0x0\n"
+      "movi v13.4s, #0x0\n"
+      "movi v14.4s, #0x0\n"
+      "movi v15.4s, #0x0\n"
+      "movi v16.4s, #0x0\n"
+      "movi v17.4s, #0x0\n"
+      "movi v18.4s, #0x0\n"
+      "movi v19.4s, #0x0\n"
+      "movi v20.4s, #0x0\n"
+      "movi v21.4s, #0x0\n"
+      "movi v22.4s, #0x0\n"
+      "movi v23.4s, #0x0\n"
+      "88:"  // Height 4: setup done
+      "mov x12, #0x0\n"
+      "89:"  // Height 4: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 90f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "cbnz x12, 91f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "b 91f\n"
+      "90:"  // Height 4: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "add x26, x28, x19\n"
+      "add x24, x26, x19\n"
+      "91:"  // Height 4: input setup done
+      "cmp x11, #0x10\n"
+      "blt 94f\n"
+      "cmp x11, #0x20\n"
+      "blt 93f\n"
+      "92:"  // Height 4: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "add x26, x26, #0x10\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "sub x11, x11, #0x10\n"
+      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "cmp x11, #0x20\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
+      "bge 92b\n"
+      "93:"  // Height 4: Multiply loop: Single iteration only
+      "sub x11, x11, #0x10\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "add x26, x26, #0x10\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
+      "94:"  // Height 4: Multiply loop: Main loop skip
+      "cbz x11, 99f\n"
+      "cmp x11, #0x4\n"
+      "blt 96f\n"
+      "95:"  // Height 4: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
+      "ldr s3, [x24], #0x4\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "sub x11, x11, #0x4\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      "cmp x11, #0x4\n"
+      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      "bge 95b\n"
+      "cbz x11, 99f\n"
+      "96:"  // Height 4: Multiply loop: Skip odd blocks
+      "tbz x11, #1, 97f\n"
+      "ldr h0, [x10], #0x2\n"
+      "ldr h1, [x28], #0x2\n"
+      "ldr h2, [x26], #0x2\n"
+      "ldr h3, [x24], #0x2\n"
+      "tbz x11, #0, 98f\n"
+      "ld1 { v0.b }[2], [x10]\n"
+      "ld1 { v1.b }[2], [x28]\n"
+      "ld1 { v2.b }[2], [x26]\n"
+      "ld1 { v3.b }[2], [x24]\n"
+      "b 98f\n"
+      "97:"  // Height 4: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b0, [x10, #0x0]\n"
+      "ldr b1, [x28, #0x0]\n"
+      "ldr b2, [x26, #0x0]\n"
+      "ldr b3, [x24, #0x0]\n"
+      "98:"  // Height 4: Multiply loop: Ragged operand read: Done
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      "99:"  // Height 4: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 89b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "ldr q0, [x16, #0x0]\n"
+      "add v8.4s, v8.4s, v0.4s\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "ldr q1, [x16, #0x10]\n"
+      "add v12.4s, v12.4s, v0.4s\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "ldr q2, [x16, #0x20]\n"
+      "add v16.4s, v16.4s, v0.4s\n"
+      "ldr q3, [x16, #0x30]\n"
+      "add v20.4s, v20.4s, v0.4s\n"
+      "add x16, x16, #0x40\n"
+      "add v9.4s, v9.4s, v1.4s\n"
+      "add v13.4s, v13.4s, v1.4s\n"
+      "add v10.4s, v10.4s, v2.4s\n"
+      "add v11.4s, v11.4s, v3.4s\n"
+      "add v14.4s, v14.4s, v2.4s\n"
+      "add v15.4s, v15.4s, v3.4s\n"
+      "add v17.4s, v17.4s, v1.4s\n"
+      "add v18.4s, v18.4s, v2.4s\n"
+      "add v19.4s, v19.4s, v3.4s\n"
+      "add v21.4s, v21.4s, v1.4s\n"
+      "add v22.4s, v22.4s, v2.4s\n"
+      "add v23.4s, v23.4s, v3.4s\n"
+      "tbz %x[flags], #4, 100f\n"
+      "ldr q0, [x17, #0x0]\n"
+      "ldr q4, [x8, #0x0]\n"
+      "ldr q1, [x17, #0x10]\n"
+      "ldr q5, [x8, #0x10]\n"
+      "ldr q2, [x17, #0x20]\n"
+      "ldr q6, [x8, #0x20]\n"
+      "ldr q3, [x17, #0x30]\n"
+      "ldr q7, [x8, #0x30]\n"
+      "add x17, x17, #0x40\n"
+      "add x8, x8, #0x40\n"
+      "b 101f\n"
+      "100:"  // Height 4: per layer parameters
+      "add x19, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x19]\n"
+      "mov v1.16b, v0.16b\n"
+      "add x19, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x19]\n"
+      "mov v2.16b, v0.16b\n"
+      "mov v3.16b, v0.16b\n"
+      "mov v5.16b, v4.16b\n"
+      "mov v6.16b, v4.16b\n"
+      "mov v7.16b, v4.16b\n"
+      "101:"  // Height 4: parameters loaded
+      "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+      "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+      "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+      "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+      "sqrdmulh v12.4s, v12.4s, v4.4s\n"
+      "sqrdmulh v13.4s, v13.4s, v5.4s\n"
+      "sqrdmulh v14.4s, v14.4s, v6.4s\n"
+      "sqrdmulh v15.4s, v15.4s, v7.4s\n"
+      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v5.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v7.4s\n"
+      "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+      "sqrdmulh v21.4s, v21.4s, v5.4s\n"
+      "sqrdmulh v22.4s, v22.4s, v6.4s\n"
+      "sqrdmulh v23.4s, v23.4s, v7.4s\n"
+      "tbz %x[flags], #5, 102f\n"
+      "and v4.16b, v8.16b, v0.16b\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "and v5.16b, v9.16b, v1.16b\n"
+      "and v6.16b, v10.16b, v2.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "and v7.16b, v11.16b, v3.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v8.4s, v8.4s, v4.4s\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "and v4.16b, v12.16b, v0.16b\n"
+      "sqadd v9.4s, v9.4s, v5.4s\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sqadd v10.4s, v10.4s, v6.4s\n"
+      "and v5.16b, v13.16b, v1.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sqadd v11.4s, v11.4s, v7.4s\n"
+      "and v6.16b, v14.16b, v2.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v12.4s, v12.4s, v4.4s\n"
+      "and v7.16b, v15.16b, v3.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sqadd v13.4s, v13.4s, v5.4s\n"
+      "and v4.16b, v16.16b, v0.16b\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sqadd v14.4s, v14.4s, v6.4s\n"
+      "and v5.16b, v17.16b, v1.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sqadd v15.4s, v15.4s, v7.4s\n"
+      "and v6.16b, v18.16b, v2.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v4.4s\n"
+      "and v7.16b, v19.16b, v3.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sqadd v17.4s, v17.4s, v5.4s\n"
+      "and v4.16b, v20.16b, v0.16b\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sqadd v18.4s, v18.4s, v6.4s\n"
+      "and v5.16b, v21.16b, v1.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sqadd v19.4s, v19.4s, v7.4s\n"
+      "and v6.16b, v22.16b, v2.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v20.4s, v20.4s, v4.4s\n"
+      "and v7.16b, v23.16b, v3.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sqadd v21.4s, v21.4s, v5.4s\n"
+      "sqadd v22.4s, v22.4s, v6.4s\n"
+      "sqadd v23.4s, v23.4s, v7.4s\n"
+      "102:"  // Height 4: no shift correction
+      "srshl v8.4s, v8.4s, v0.4s\n"
+      "add x19, %x[qp], %[c_offset]\n"
+      "ld1r { v4.4s }, [x19]\n"
+      "srshl v9.4s, v9.4s, v1.4s\n"
+      "add x19, %x[qp], %[minval]\n"
+      "srshl v10.4s, v10.4s, v2.4s\n"
+      "ld1r { v5.4s }, [x19]\n"
+      "add x19, %x[qp], %[maxval]\n"
+      "srshl v11.4s, v11.4s, v3.4s\n"
+      "ld1r { v6.4s }, [x19]\n"
+      "cmp x15, #0x10\n"
+      "srshl v12.4s, v12.4s, v0.4s\n"
+      "srshl v13.4s, v13.4s, v1.4s\n"
+      "srshl v14.4s, v14.4s, v2.4s\n"
+      "srshl v15.4s, v15.4s, v3.4s\n"
+      "add v8.4s, v8.4s, v4.4s\n"
+      "add v9.4s, v9.4s, v4.4s\n"
+      "add v10.4s, v10.4s, v4.4s\n"
+      "smin v8.4s, v8.4s, v6.4s\n"
+      "smin v9.4s, v9.4s, v6.4s\n"
+      "smin v10.4s, v10.4s, v6.4s\n"
+      "smax v8.4s, v8.4s, v5.4s\n"
+      "smax v9.4s, v9.4s, v5.4s\n"
+      "smax v10.4s, v10.4s, v5.4s\n"
+      "add v11.4s, v11.4s, v4.4s\n"
+      "add v12.4s, v12.4s, v4.4s\n"
+      "add v13.4s, v13.4s, v4.4s\n"
+      "smin v11.4s, v11.4s, v6.4s\n"
+      "smin v12.4s, v12.4s, v6.4s\n"
+      "smin v13.4s, v13.4s, v6.4s\n"
+      "smax v11.4s, v11.4s, v5.4s\n"
+      "smax v12.4s, v12.4s, v5.4s\n"
+      "smax v13.4s, v13.4s, v5.4s\n"
+      "add v14.4s, v14.4s, v4.4s\n"
+      "add v15.4s, v15.4s, v4.4s\n"
+      "srshl v16.4s, v16.4s, v0.4s\n"
+      "smin v14.4s, v14.4s, v6.4s\n"
+      "smin v15.4s, v15.4s, v6.4s\n"
+      "srshl v17.4s, v17.4s, v1.4s\n"
+      "smax v14.4s, v14.4s, v5.4s\n"
+      "smax v15.4s, v15.4s, v5.4s\n"
+      "add v16.4s, v16.4s, v4.4s\n"
+      "add v17.4s, v17.4s, v4.4s\n"
+      "srshl v18.4s, v18.4s, v2.4s\n"
+      "smin v16.4s, v16.4s, v6.4s\n"
+      "smin v17.4s, v17.4s, v6.4s\n"
+      "srshl v19.4s, v19.4s, v3.4s\n"
+      "smax v16.4s, v16.4s, v5.4s\n"
+      "smax v17.4s, v17.4s, v5.4s\n"
+      "add v18.4s, v18.4s, v4.4s\n"
+      "add v19.4s, v19.4s, v4.4s\n"
+      "srshl v20.4s, v20.4s, v0.4s\n"
+      "smin v18.4s, v18.4s, v6.4s\n"
+      "smin v19.4s, v19.4s, v6.4s\n"
+      "srshl v21.4s, v21.4s, v1.4s\n"
+      "smax v18.4s, v18.4s, v5.4s\n"
+      "smax v19.4s, v19.4s, v5.4s\n"
+      "add v20.4s, v20.4s, v4.4s\n"
+      "add v21.4s, v21.4s, v4.4s\n"
+      "srshl v22.4s, v22.4s, v2.4s\n"
+      "smin v20.4s, v20.4s, v6.4s\n"
+      "smin v21.4s, v21.4s, v6.4s\n"
+      "srshl v23.4s, v23.4s, v3.4s\n"
+      "smax v20.4s, v20.4s, v5.4s\n"
+      "smax v21.4s, v21.4s, v5.4s\n"
+      "add v22.4s, v22.4s, v4.4s\n"
+      "add v23.4s, v23.4s, v4.4s\n"
+      "uzp1 v8.8h, v8.8h, v9.8h\n"
+      "smin v22.4s, v22.4s, v6.4s\n"
+      "smin v23.4s, v23.4s, v6.4s\n"
+      "uzp1 v9.8h, v10.8h, v11.8h\n"
+      "smax v22.4s, v22.4s, v5.4s\n"
+      "smax v23.4s, v23.4s, v5.4s\n"
+      "uzp1 v12.8h, v12.8h, v13.8h\n"
+      "uzp1 v13.8h, v14.8h, v15.8h\n"
+      "uzp1 v16.8h, v16.8h, v17.8h\n"
+      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v20.8h, v20.8h, v21.8h\n"
+      "uzp1 v21.8h, v22.8h, v23.8h\n"
+      "uzp1 v8.16b, v8.16b, v9.16b\n"
+      "uzp1 v12.16b, v12.16b, v13.16b\n"
+      "uzp1 v16.16b, v16.16b, v17.16b\n"
+      "uzp1 v20.16b, v20.16b, v21.16b\n"
+      "bge 111f\n"
+      "tbz x15, #3, 106f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "str d16, [x27], #0x8\n"
+      "str d20, [x25], #0x8\n"
+      "tbz x15, #2, 104f\n"
+      "st1 { v8.s }[2], [x13], #0x4\n"
+      "st1 { v12.s }[2], [x9], #0x4\n"
+      "st1 { v16.s }[2], [x27], #0x4\n"
+      "st1 { v20.s }[2], [x25], #0x4\n"
+      "tbz x15, #1, 103f\n"
+      "st1 { v8.h }[6], [x13], #0x2\n"
+      "st1 { v12.h }[6], [x9], #0x2\n"
+      "st1 { v16.h }[6], [x27], #0x2\n"
+      "st1 { v20.h }[6], [x25], #0x2\n"
+      "tbz x15, #0, 110f\n"
+      "st1 { v8.b }[14], [x13]\n"
+      "st1 { v12.b }[14], [x9]\n"
+      "st1 { v16.b }[14], [x27]\n"
+      "st1 { v20.b }[14], [x25]\n"
+      "b 110f\n"
+      "103:"  // Height 4: Partial direct writeback: partial_1_12
+      "tbz x15, #0, 110f\n"
+      "st1 { v8.b }[12], [x13]\n"
+      "st1 { v12.b }[12], [x9]\n"
+      "st1 { v16.b }[12], [x27]\n"
+      "st1 { v20.b }[12], [x25]\n"
+      "b 110f\n"
+      "104:"  // Height 4: Partial direct writeback: partial_2_8
+      "tbz x15, #1, 105f\n"
+      "st1 { v8.h }[4], [x13], #0x2\n"
+      "st1 { v12.h }[4], [x9], #0x2\n"
+      "st1 { v16.h }[4], [x27], #0x2\n"
+      "st1 { v20.h }[4], [x25], #0x2\n"
+      "tbz x15, #0, 110f\n"
+      "st1 { v8.b }[10], [x13]\n"
+      "st1 { v12.b }[10], [x9]\n"
+      "st1 { v16.b }[10], [x27]\n"
+      "st1 { v20.b }[10], [x25]\n"
+      "b 110f\n"
+      "105:"  // Height 4: Partial direct writeback: partial_1_8
+      "tbz x15, #0, 110f\n"
+      "st1 { v8.b }[8], [x13]\n"
+      "st1 { v12.b }[8], [x9]\n"
+      "st1 { v16.b }[8], [x27]\n"
+      "st1 { v20.b }[8], [x25]\n"
+      "b 110f\n"
+      "106:"  // Height 4: Partial direct writeback: partial_4_0
+      "tbz x15, #2, 108f\n"
+      "str s8, [x13], #0x4\n"
+      "str s12, [x9], #0x4\n"
+      "str s16, [x27], #0x4\n"
+      "str s20, [x25], #0x4\n"
+      "tbz x15, #1, 107f\n"
+      "st1 { v8.h }[2], [x13], #0x2\n"
+      "st1 { v12.h }[2], [x9], #0x2\n"
+      "st1 { v16.h }[2], [x27], #0x2\n"
+      "st1 { v20.h }[2], [x25], #0x2\n"
+      "tbz x15, #0, 110f\n"
+      "st1 { v8.b }[6], [x13]\n"
+      "st1 { v12.b }[6], [x9]\n"
+      "st1 { v16.b }[6], [x27]\n"
+      "st1 { v20.b }[6], [x25]\n"
+      "b 110f\n"
+      "107:"  // Height 4: Partial direct writeback: partial_1_4
+      "tbz x15, #0, 110f\n"
+      "st1 { v8.b }[4], [x13]\n"
+      "st1 { v12.b }[4], [x9]\n"
+      "st1 { v16.b }[4], [x27]\n"
+      "st1 { v20.b }[4], [x25]\n"
+      "b 110f\n"
+      "108:"  // Height 4: Partial direct writeback: partial_2_0
+      "tbz x15, #1, 109f\n"
+      "str h8, [x13], #0x2\n"
+      "str h12, [x9], #0x2\n"
+      "str h16, [x27], #0x2\n"
+      "str h20, [x25], #0x2\n"
+      "tbz x15, #0, 110f\n"
+      "st1 { v8.b }[2], [x13]\n"
+      "st1 { v12.b }[2], [x9]\n"
+      "st1 { v16.b }[2], [x27]\n"
+      "st1 { v20.b }[2], [x25]\n"
+      "b 110f\n"
+      "109:"  // Height 4: Partial direct writeback: partial_1_0
+      "str b8, [x13, #0x0]\n"
+      "str b12, [x9, #0x0]\n"
+      "str b16, [x27, #0x0]\n"
+      "str b20, [x25, #0x0]\n"
+      "110:"  // Height 4: Partial direct writeback: Done
+      "b 112f\n"
+      "111:"  // Height 4: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q12, [x9, #0x0]\n"
+      "str q16, [x27, #0x0]\n"
+      "str q20, [x25, #0x0]\n"
+      "add x13, x13, #0x10\n"
+      "add x9, x9, #0x10\n"
+      "add x27, x27, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "112:"  // Height 4: Writeback done
+      "subs x15, x15, #0x10\n"
+      "bgt 87b\n"
+      "b 170f\n"
+      "113:"  // Height 5
+      "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+      "mov x16, %x[col_bias]\n"
+      "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 114f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "ldr x23, [%x[output_ptr], #0x20]\n"
+      "add x27, x27, x19\n"
+      "add x25, x25, x19\n"
+      "add x23, x23, x19\n"
+      "b 115f\n"
+      "114:"  // Height 5: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19\n"
+      "add x27, x9, x19\n"
+      "add x25, x27, x19\n"
+      "add x23, x25, x19\n"
+      "115:"  // Height 5: Column loop
+      "movi v8.4s, #0x0\n"
+      "movi v9.4s, #0x0\n"
+      "movi v10.4s, #0x0\n"
+      "movi v11.4s, #0x0\n"
+      "movi v12.4s, #0x0\n"
+      "movi v13.4s, #0x0\n"
+      "movi v14.4s, #0x0\n"
+      "movi v15.4s, #0x0\n"
+      "movi v16.4s, #0x0\n"
+      "movi v17.4s, #0x0\n"
+      "movi v18.4s, #0x0\n"
+      "movi v19.4s, #0x0\n"
+      "movi v20.4s, #0x0\n"
+      "movi v21.4s, #0x0\n"
+      "movi v22.4s, #0x0\n"
+      "movi v23.4s, #0x0\n"
+      "movi v24.4s, #0x0\n"
+      "movi v25.4s, #0x0\n"
+      "movi v26.4s, #0x0\n"
+      "movi v27.4s, #0x0\n"
+      "116:"  // Height 5: setup done
+      "mov x12, #0x0\n"
+      "117:"  // Height 5: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 118f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "cbnz x12, 119f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "add x22, x22, x19\n"
+      "b 119f\n"
+      "118:"  // Height 5: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "add x26, x28, x19\n"
+      "add x24, x26, x19\n"
+      "add x22, x24, x19\n"
+      "119:"  // Height 5: input setup done
+      "cmp x11, #0x10\n"
+      "blt 122f\n"
+      "cmp x11, #0x20\n"
+      "blt 121f\n"
+      "120:"  // Height 5: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q4, [x22, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "add x24, x24, #0x10\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      "add x22, x22, #0x10\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "sub x11, x11, #0x10\n"
+      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
+      "cmp x11, #0x20\n"
+      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0d8  // sdot v24.4s, v6.16b, v4.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0f9  // sdot v25.4s, v7.16b, v4.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0da  // sdot v26.4s, v6.16b, v4.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0fb  // sdot v27.4s, v7.16b, v4.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8d8  // sdot v24.4s, v6.16b, v4.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8f9  // sdot v25.4s, v7.16b, v4.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8da  // sdot v26.4s, v6.16b, v4.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8fb  // sdot v27.4s, v7.16b, v4.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8d8  // sdot v24.4s, v6.16b, v4.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8f9  // sdot v25.4s, v7.16b, v4.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8da  // sdot v26.4s, v6.16b, v4.4b[3]\n"
+      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8fb  // sdot v27.4s, v7.16b, v4.4b[3]\n"
+      "bge 120b\n"
+      "121:"  // Height 5: Multiply loop: Single iteration only
+      "sub x11, x11, #0x10\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q4, [x22, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "add x24, x24, #0x10\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      "add x22, x22, #0x10\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0d8  // sdot v24.4s, v6.16b, v4.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0f9  // sdot v25.4s, v7.16b, v4.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0da  // sdot v26.4s, v6.16b, v4.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0fb  // sdot v27.4s, v7.16b, v4.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8d8  // sdot v24.4s, v6.16b, v4.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8f9  // sdot v25.4s, v7.16b, v4.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8da  // sdot v26.4s, v6.16b, v4.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8fb  // sdot v27.4s, v7.16b, v4.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8d8  // sdot v24.4s, v6.16b, v4.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8f9  // sdot v25.4s, v7.16b, v4.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8da  // sdot v26.4s, v6.16b, v4.4b[3]\n"
+      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8fb  // sdot v27.4s, v7.16b, v4.4b[3]\n"
+      "122:"  // Height 5: Multiply loop: Main loop skip
+      "cbz x11, 127f\n"
+      "cmp x11, #0x4\n"
+      "blt 124f\n"
+      "123:"  // Height 5: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
+      "ldr s3, [x24], #0x4\n"
+      "ldr s4, [x22], #0x4\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "sub x11, x11, #0x4\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      "cmp x11, #0x4\n"
+      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
+      "bge 123b\n"
+      "cbz x11, 127f\n"
+      "124:"  // Height 5: Multiply loop: Skip odd blocks
+      "tbz x11, #1, 125f\n"
+      "ldr h0, [x10], #0x2\n"
+      "ldr h1, [x28], #0x2\n"
+      "ldr h2, [x26], #0x2\n"
+      "ldr h3, [x24], #0x2\n"
+      "ldr h4, [x22], #0x2\n"
+      "tbz x11, #0, 126f\n"
+      "ld1 { v0.b }[2], [x10]\n"
+      "ld1 { v1.b }[2], [x28]\n"
+      "ld1 { v2.b }[2], [x26]\n"
+      "ld1 { v3.b }[2], [x24]\n"
+      "ld1 { v4.b }[2], [x22]\n"
+      "b 126f\n"
+      "125:"  // Height 5: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b0, [x10, #0x0]\n"
+      "ldr b1, [x28, #0x0]\n"
+      "ldr b2, [x26, #0x0]\n"
+      "ldr b3, [x24, #0x0]\n"
+      "ldr b4, [x22, #0x0]\n"
+      "126:"  // Height 5: Multiply loop: Ragged operand read: Done
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
+      "127:"  // Height 5: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 117b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "ldr q0, [x16, #0x0]\n"
+      "add v8.4s, v8.4s, v0.4s\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "ldr q1, [x16, #0x10]\n"
+      "add v12.4s, v12.4s, v0.4s\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "ldr q2, [x16, #0x20]\n"
+      "add v16.4s, v16.4s, v0.4s\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "ldr q3, [x16, #0x30]\n"
+      "add v20.4s, v20.4s, v0.4s\n"
+      "add x16, x16, #0x40\n"
+      "add v24.4s, v24.4s, v0.4s\n"
+      "add v9.4s, v9.4s, v1.4s\n"
+      "add v13.4s, v13.4s, v1.4s\n"
+      "add v10.4s, v10.4s, v2.4s\n"
+      "add v11.4s, v11.4s, v3.4s\n"
+      "add v14.4s, v14.4s, v2.4s\n"
+      "add v15.4s, v15.4s, v3.4s\n"
+      "add v17.4s, v17.4s, v1.4s\n"
+      "add v18.4s, v18.4s, v2.4s\n"
+      "add v19.4s, v19.4s, v3.4s\n"
+      "add v21.4s, v21.4s, v1.4s\n"
+      "add v22.4s, v22.4s, v2.4s\n"
+      "add v23.4s, v23.4s, v3.4s\n"
+      "add v25.4s, v25.4s, v1.4s\n"
+      "add v26.4s, v26.4s, v2.4s\n"
+      "add v27.4s, v27.4s, v3.4s\n"
+      "tbz %x[flags], #4, 128f\n"
+      "ldr q0, [x17, #0x0]\n"
+      "ldr q4, [x8, #0x0]\n"
+      "ldr q1, [x17, #0x10]\n"
+      "ldr q5, [x8, #0x10]\n"
+      "ldr q2, [x17, #0x20]\n"
+      "ldr q6, [x8, #0x20]\n"
+      "ldr q3, [x17, #0x30]\n"
+      "ldr q7, [x8, #0x30]\n"
+      "add x17, x17, #0x40\n"
+      "add x8, x8, #0x40\n"
+      "b 129f\n"
+      "128:"  // Height 5: per layer parameters
+      "add x19, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x19]\n"
+      "mov v1.16b, v0.16b\n"
+      "add x19, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x19]\n"
+      "mov v2.16b, v0.16b\n"
+      "mov v3.16b, v0.16b\n"
+      "mov v5.16b, v4.16b\n"
+      "mov v6.16b, v4.16b\n"
+      "mov v7.16b, v4.16b\n"
+      "129:"  // Height 5: parameters loaded
+      "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+      "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+      "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+      "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+      "sqrdmulh v12.4s, v12.4s, v4.4s\n"
+      "sqrdmulh v13.4s, v13.4s, v5.4s\n"
+      "sqrdmulh v14.4s, v14.4s, v6.4s\n"
+      "sqrdmulh v15.4s, v15.4s, v7.4s\n"
+      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v5.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v7.4s\n"
+      "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+      "sqrdmulh v21.4s, v21.4s, v5.4s\n"
+      "sqrdmulh v22.4s, v22.4s, v6.4s\n"
+      "sqrdmulh v23.4s, v23.4s, v7.4s\n"
+      "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+      "sqrdmulh v25.4s, v25.4s, v5.4s\n"
+      "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+      "sqrdmulh v27.4s, v27.4s, v7.4s\n"
+      "tbz %x[flags], #5, 130f\n"
+      "and v4.16b, v8.16b, v0.16b\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "and v5.16b, v9.16b, v1.16b\n"
+      "and v6.16b, v10.16b, v2.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "and v7.16b, v11.16b, v3.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v8.4s, v8.4s, v4.4s\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "and v4.16b, v12.16b, v0.16b\n"
+      "sqadd v9.4s, v9.4s, v5.4s\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sqadd v10.4s, v10.4s, v6.4s\n"
+      "and v5.16b, v13.16b, v1.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sqadd v11.4s, v11.4s, v7.4s\n"
+      "and v6.16b, v14.16b, v2.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v12.4s, v12.4s, v4.4s\n"
+      "and v7.16b, v15.16b, v3.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sqadd v13.4s, v13.4s, v5.4s\n"
+      "and v4.16b, v16.16b, v0.16b\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sqadd v14.4s, v14.4s, v6.4s\n"
+      "and v5.16b, v17.16b, v1.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sqadd v15.4s, v15.4s, v7.4s\n"
+      "and v6.16b, v18.16b, v2.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v4.4s\n"
+      "and v7.16b, v19.16b, v3.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sqadd v17.4s, v17.4s, v5.4s\n"
+      "and v4.16b, v20.16b, v0.16b\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sqadd v18.4s, v18.4s, v6.4s\n"
+      "and v5.16b, v21.16b, v1.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sqadd v19.4s, v19.4s, v7.4s\n"
+      "and v6.16b, v22.16b, v2.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v20.4s, v20.4s, v4.4s\n"
+      "and v7.16b, v23.16b, v3.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sqadd v21.4s, v21.4s, v5.4s\n"
+      "and v4.16b, v24.16b, v0.16b\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sqadd v22.4s, v22.4s, v6.4s\n"
+      "and v5.16b, v25.16b, v1.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sqadd v23.4s, v23.4s, v7.4s\n"
+      "and v6.16b, v26.16b, v2.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v24.4s, v24.4s, v4.4s\n"
+      "and v7.16b, v27.16b, v3.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sqadd v25.4s, v25.4s, v5.4s\n"
+      "sqadd v26.4s, v26.4s, v6.4s\n"
+      "sqadd v27.4s, v27.4s, v7.4s\n"
+      "130:"  // Height 5: no shift correction
+      "srshl v8.4s, v8.4s, v0.4s\n"
+      "add x19, %x[qp], %[c_offset]\n"
+      "ld1r { v4.4s }, [x19]\n"
+      "srshl v9.4s, v9.4s, v1.4s\n"
+      "add x19, %x[qp], %[minval]\n"
+      "srshl v10.4s, v10.4s, v2.4s\n"
+      "ld1r { v5.4s }, [x19]\n"
+      "add x19, %x[qp], %[maxval]\n"
+      "srshl v11.4s, v11.4s, v3.4s\n"
+      "ld1r { v6.4s }, [x19]\n"
+      "cmp x15, #0x10\n"
+      "srshl v12.4s, v12.4s, v0.4s\n"
+      "srshl v13.4s, v13.4s, v1.4s\n"
+      "srshl v14.4s, v14.4s, v2.4s\n"
+      "srshl v15.4s, v15.4s, v3.4s\n"
+      "add v8.4s, v8.4s, v4.4s\n"
+      "add v9.4s, v9.4s, v4.4s\n"
+      "add v10.4s, v10.4s, v4.4s\n"
+      "smin v8.4s, v8.4s, v6.4s\n"
+      "smin v9.4s, v9.4s, v6.4s\n"
+      "smin v10.4s, v10.4s, v6.4s\n"
+      "smax v8.4s, v8.4s, v5.4s\n"
+      "smax v9.4s, v9.4s, v5.4s\n"
+      "smax v10.4s, v10.4s, v5.4s\n"
+      "add v11.4s, v11.4s, v4.4s\n"
+      "add v12.4s, v12.4s, v4.4s\n"
+      "add v13.4s, v13.4s, v4.4s\n"
+      "smin v11.4s, v11.4s, v6.4s\n"
+      "smin v12.4s, v12.4s, v6.4s\n"
+      "smin v13.4s, v13.4s, v6.4s\n"
+      "smax v11.4s, v11.4s, v5.4s\n"
+      "smax v12.4s, v12.4s, v5.4s\n"
+      "smax v13.4s, v13.4s, v5.4s\n"
+      "add v14.4s, v14.4s, v4.4s\n"
+      "add v15.4s, v15.4s, v4.4s\n"
+      "srshl v16.4s, v16.4s, v0.4s\n"
+      "smin v14.4s, v14.4s, v6.4s\n"
+      "smin v15.4s, v15.4s, v6.4s\n"
+      "srshl v17.4s, v17.4s, v1.4s\n"
+      "smax v14.4s, v14.4s, v5.4s\n"
+      "smax v15.4s, v15.4s, v5.4s\n"
+      "add v16.4s, v16.4s, v4.4s\n"
+      "add v17.4s, v17.4s, v4.4s\n"
+      "srshl v18.4s, v18.4s, v2.4s\n"
+      "smin v16.4s, v16.4s, v6.4s\n"
+      "smin v17.4s, v17.4s, v6.4s\n"
+      "srshl v19.4s, v19.4s, v3.4s\n"
+      "smax v16.4s, v16.4s, v5.4s\n"
+      "smax v17.4s, v17.4s, v5.4s\n"
+      "add v18.4s, v18.4s, v4.4s\n"
+      "add v19.4s, v19.4s, v4.4s\n"
+      "srshl v20.4s, v20.4s, v0.4s\n"
+      "smin v18.4s, v18.4s, v6.4s\n"
+      "smin v19.4s, v19.4s, v6.4s\n"
+      "srshl v21.4s, v21.4s, v1.4s\n"
+      "smax v18.4s, v18.4s, v5.4s\n"
+      "smax v19.4s, v19.4s, v5.4s\n"
+      "add v20.4s, v20.4s, v4.4s\n"
+      "add v21.4s, v21.4s, v4.4s\n"
+      "srshl v22.4s, v22.4s, v2.4s\n"
+      "smin v20.4s, v20.4s, v6.4s\n"
+      "smin v21.4s, v21.4s, v6.4s\n"
+      "srshl v23.4s, v23.4s, v3.4s\n"
+      "smax v20.4s, v20.4s, v5.4s\n"
+      "smax v21.4s, v21.4s, v5.4s\n"
+      "add v22.4s, v22.4s, v4.4s\n"
+      "add v23.4s, v23.4s, v4.4s\n"
+      "srshl v24.4s, v24.4s, v0.4s\n"
+      "smin v22.4s, v22.4s, v6.4s\n"
+      "smin v23.4s, v23.4s, v6.4s\n"
+      "srshl v25.4s, v25.4s, v1.4s\n"
+      "smax v22.4s, v22.4s, v5.4s\n"
+      "smax v23.4s, v23.4s, v5.4s\n"
+      "add v24.4s, v24.4s, v4.4s\n"
+      "add v25.4s, v25.4s, v4.4s\n"
+      "srshl v26.4s, v26.4s, v2.4s\n"
+      "smin v24.4s, v24.4s, v6.4s\n"
+      "smin v25.4s, v25.4s, v6.4s\n"
+      "srshl v27.4s, v27.4s, v3.4s\n"
+      "smax v24.4s, v24.4s, v5.4s\n"
+      "smax v25.4s, v25.4s, v5.4s\n"
+      "add v26.4s, v26.4s, v4.4s\n"
+      "add v27.4s, v27.4s, v4.4s\n"
+      "uzp1 v8.8h, v8.8h, v9.8h\n"
+      "smin v26.4s, v26.4s, v6.4s\n"
+      "smin v27.4s, v27.4s, v6.4s\n"
+      "uzp1 v9.8h, v10.8h, v11.8h\n"
+      "smax v26.4s, v26.4s, v5.4s\n"
+      "smax v27.4s, v27.4s, v5.4s\n"
+      "uzp1 v12.8h, v12.8h, v13.8h\n"
+      "uzp1 v13.8h, v14.8h, v15.8h\n"
+      "uzp1 v16.8h, v16.8h, v17.8h\n"
+      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v20.8h, v20.8h, v21.8h\n"
+      "uzp1 v21.8h, v22.8h, v23.8h\n"
+      "uzp1 v24.8h, v24.8h, v25.8h\n"
+      "uzp1 v25.8h, v26.8h, v27.8h\n"
+      "uzp1 v8.16b, v8.16b, v9.16b\n"
+      "uzp1 v12.16b, v12.16b, v13.16b\n"
+      "uzp1 v16.16b, v16.16b, v17.16b\n"
+      "uzp1 v20.16b, v20.16b, v21.16b\n"
+      "uzp1 v24.16b, v24.16b, v25.16b\n"
+      "bge 139f\n"
+      "tbz x15, #3, 134f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "str d16, [x27], #0x8\n"
+      "str d20, [x25], #0x8\n"
+      "str d24, [x23], #0x8\n"
+      "tbz x15, #2, 132f\n"
+      "st1 { v8.s }[2], [x13], #0x4\n"
+      "st1 { v12.s }[2], [x9], #0x4\n"
+      "st1 { v16.s }[2], [x27], #0x4\n"
+      "st1 { v20.s }[2], [x25], #0x4\n"
+      "st1 { v24.s }[2], [x23], #0x4\n"
+      "tbz x15, #1, 131f\n"
+      "st1 { v8.h }[6], [x13], #0x2\n"
+      "st1 { v12.h }[6], [x9], #0x2\n"
+      "st1 { v16.h }[6], [x27], #0x2\n"
+      "st1 { v20.h }[6], [x25], #0x2\n"
+      "st1 { v24.h }[6], [x23], #0x2\n"
+      "tbz x15, #0, 138f\n"
+      "st1 { v8.b }[14], [x13]\n"
+      "st1 { v12.b }[14], [x9]\n"
+      "st1 { v16.b }[14], [x27]\n"
+      "st1 { v20.b }[14], [x25]\n"
+      "st1 { v24.b }[14], [x23]\n"
+      "b 138f\n"
+      "131:"  // Height 5: Partial direct writeback: partial_1_12
+      "tbz x15, #0, 138f\n"
+      "st1 { v8.b }[12], [x13]\n"
+      "st1 { v12.b }[12], [x9]\n"
+      "st1 { v16.b }[12], [x27]\n"
+      "st1 { v20.b }[12], [x25]\n"
+      "st1 { v24.b }[12], [x23]\n"
+      "b 138f\n"
+      "132:"  // Height 5: Partial direct writeback: partial_2_8
+      "tbz x15, #1, 133f\n"
+      "st1 { v8.h }[4], [x13], #0x2\n"
+      "st1 { v12.h }[4], [x9], #0x2\n"
+      "st1 { v16.h }[4], [x27], #0x2\n"
+      "st1 { v20.h }[4], [x25], #0x2\n"
+      "st1 { v24.h }[4], [x23], #0x2\n"
+      "tbz x15, #0, 138f\n"
+      "st1 { v8.b }[10], [x13]\n"
+      "st1 { v12.b }[10], [x9]\n"
+      "st1 { v16.b }[10], [x27]\n"
+      "st1 { v20.b }[10], [x25]\n"
+      "st1 { v24.b }[10], [x23]\n"
+      "b 138f\n"
+      "133:"  // Height 5: Partial direct writeback: partial_1_8
+      "tbz x15, #0, 138f\n"
+      "st1 { v8.b }[8], [x13]\n"
+      "st1 { v12.b }[8], [x9]\n"
+      "st1 { v16.b }[8], [x27]\n"
+      "st1 { v20.b }[8], [x25]\n"
+      "st1 { v24.b }[8], [x23]\n"
+      "b 138f\n"
+      "134:"  // Height 5: Partial direct writeback: partial_4_0
+      "tbz x15, #2, 136f\n"
+      "str s8, [x13], #0x4\n"
+      "str s12, [x9], #0x4\n"
+      "str s16, [x27], #0x4\n"
+      "str s20, [x25], #0x4\n"
+      "str s24, [x23], #0x4\n"
+      "tbz x15, #1, 135f\n"
+      "st1 { v8.h }[2], [x13], #0x2\n"
+      "st1 { v12.h }[2], [x9], #0x2\n"
+      "st1 { v16.h }[2], [x27], #0x2\n"
+      "st1 { v20.h }[2], [x25], #0x2\n"
+      "st1 { v24.h }[2], [x23], #0x2\n"
+      "tbz x15, #0, 138f\n"
+      "st1 { v8.b }[6], [x13]\n"
+      "st1 { v12.b }[6], [x9]\n"
+      "st1 { v16.b }[6], [x27]\n"
+      "st1 { v20.b }[6], [x25]\n"
+      "st1 { v24.b }[6], [x23]\n"
+      "b 138f\n"
+      "135:"  // Height 5: Partial direct writeback: partial_1_4
+      "tbz x15, #0, 138f\n"
+      "st1 { v8.b }[4], [x13]\n"
+      "st1 { v12.b }[4], [x9]\n"
+      "st1 { v16.b }[4], [x27]\n"
+      "st1 { v20.b }[4], [x25]\n"
+      "st1 { v24.b }[4], [x23]\n"
+      "b 138f\n"
+      "136:"  // Height 5: Partial direct writeback: partial_2_0
+      "tbz x15, #1, 137f\n"
+      "str h8, [x13], #0x2\n"
+      "str h12, [x9], #0x2\n"
+      "str h16, [x27], #0x2\n"
+      "str h20, [x25], #0x2\n"
+      "str h24, [x23], #0x2\n"
+      "tbz x15, #0, 138f\n"
+      "st1 { v8.b }[2], [x13]\n"
+      "st1 { v12.b }[2], [x9]\n"
+      "st1 { v16.b }[2], [x27]\n"
+      "st1 { v20.b }[2], [x25]\n"
+      "st1 { v24.b }[2], [x23]\n"
+      "b 138f\n"
+      "137:"  // Height 5: Partial direct writeback: partial_1_0
+      "str b8, [x13, #0x0]\n"
+      "str b12, [x9, #0x0]\n"
+      "str b16, [x27, #0x0]\n"
+      "str b20, [x25, #0x0]\n"
+      "str b24, [x23, #0x0]\n"
+      "138:"  // Height 5: Partial direct writeback: Done
+      "b 140f\n"
+      "139:"  // Height 5: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q12, [x9, #0x0]\n"
+      "str q16, [x27, #0x0]\n"
+      "str q20, [x25, #0x0]\n"
+      "str q24, [x23, #0x0]\n"
+      "add x13, x13, #0x10\n"
+      "add x9, x9, #0x10\n"
+      "add x27, x27, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "add x23, x23, #0x10\n"
+      "140:"  // Height 5: Writeback done
+      "subs x15, x15, #0x10\n"
+      "bgt 115b\n"
+      "b 170f\n"
+      "141:"  // Height 6
+      "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+      "mov x16, %x[col_bias]\n"
+      "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 142f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "ldr x23, [%x[output_ptr], #0x20]\n"
+      "add x27, x27, x19\n"
+      "ldr x21, [%x[output_ptr], #0x28]\n"
+      "add %x[output_ptr], %x[output_ptr], #0x30\n"
+      "add x25, x25, x19\n"
+      "add x23, x23, x19\n"
+      "add x21, x21, x19\n"
+      "b 143f\n"
+      "142:"  // Height 6: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19\n"
+      "add x27, x9, x19\n"
+      "add x25, x27, x19\n"
+      "add x23, x25, x19\n"
+      "add x21, x23, x19\n"
+      "add %x[output_ptr], x21, x19\n"
+      "143:"  // Height 6: Column loop
+      "movi v8.4s, #0x0\n"
+      "movi v9.4s, #0x0\n"
+      "movi v10.4s, #0x0\n"
+      "movi v11.4s, #0x0\n"
+      "movi v12.4s, #0x0\n"
+      "movi v13.4s, #0x0\n"
+      "movi v14.4s, #0x0\n"
+      "movi v15.4s, #0x0\n"
+      "movi v16.4s, #0x0\n"
+      "movi v17.4s, #0x0\n"
+      "movi v18.4s, #0x0\n"
+      "movi v19.4s, #0x0\n"
+      "movi v20.4s, #0x0\n"
+      "movi v21.4s, #0x0\n"
+      "movi v22.4s, #0x0\n"
+      "movi v23.4s, #0x0\n"
+      "movi v24.4s, #0x0\n"
+      "movi v25.4s, #0x0\n"
+      "movi v26.4s, #0x0\n"
+      "movi v27.4s, #0x0\n"
+      "movi v28.4s, #0x0\n"
+      "movi v29.4s, #0x0\n"
+      "movi v30.4s, #0x0\n"
+      "movi v31.4s, #0x0\n"
+      "144:"  // Height 6: setup done
+      "mov x12, #0x0\n"
+      "145:"  // Height 6: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 146f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x20, [x20, #0x28]\n"
+      "cbnz x12, 147f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "add x22, x22, x19\n"
+      "add x20, x20, x19\n"
+      "b 147f\n"
+      "146:"  // Height 6: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "add x26, x28, x19\n"
+      "add x24, x26, x19\n"
+      "add x22, x24, x19\n"
+      "add x20, x22, x19\n"
+      "147:"  // Height 6: input setup done
+      "cmp x11, #0x10\n"
+      "blt 150f\n"
+      "cmp x11, #0x20\n"
+      "blt 149f\n"
+      "148:"  // Height 6: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q4, [x22, #0x0]\n"
+      "ldr q5, [x20, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x4f85e0dc  // sdot v28.4s, v6.16b, v5.4b[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "add x22, x22, #0x10\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      "add x20, x20, #0x10\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "sub x11, x11, #0x10\n"
+      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
+      "cmp x11, #0x20\n"
+      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0fd  // sdot v29.4s, v7.16b, v5.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0de  // sdot v30.4s, v6.16b, v5.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0ff  // sdot v31.4s, v7.16b, v5.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0d8  // sdot v24.4s, v6.16b, v4.4b[1]\n"
+      ".inst 0x4fa5e0dc  // sdot v28.4s, v6.16b, v5.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0f9  // sdot v25.4s, v7.16b, v4.4b[1]\n"
+      ".inst 0x4fa5e0fd  // sdot v29.4s, v7.16b, v5.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0da  // sdot v26.4s, v6.16b, v4.4b[1]\n"
+      ".inst 0x4fa5e0de  // sdot v30.4s, v6.16b, v5.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0fb  // sdot v27.4s, v7.16b, v4.4b[1]\n"
+      ".inst 0x4fa5e0ff  // sdot v31.4s, v7.16b, v5.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8d8  // sdot v24.4s, v6.16b, v4.4b[2]\n"
+      ".inst 0x4f85e8dc  // sdot v28.4s, v6.16b, v5.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8f9  // sdot v25.4s, v7.16b, v4.4b[2]\n"
+      ".inst 0x4f85e8fd  // sdot v29.4s, v7.16b, v5.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8da  // sdot v26.4s, v6.16b, v4.4b[2]\n"
+      ".inst 0x4f85e8de  // sdot v30.4s, v6.16b, v5.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8fb  // sdot v27.4s, v7.16b, v4.4b[2]\n"
+      ".inst 0x4f85e8ff  // sdot v31.4s, v7.16b, v5.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8d8  // sdot v24.4s, v6.16b, v4.4b[3]\n"
+      ".inst 0x4fa5e8dc  // sdot v28.4s, v6.16b, v5.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8f9  // sdot v25.4s, v7.16b, v4.4b[3]\n"
+      ".inst 0x4fa5e8fd  // sdot v29.4s, v7.16b, v5.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8da  // sdot v26.4s, v6.16b, v4.4b[3]\n"
+      ".inst 0x4fa5e8de  // sdot v30.4s, v6.16b, v5.4b[3]\n"
+      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8fb  // sdot v27.4s, v7.16b, v4.4b[3]\n"
+      ".inst 0x4fa5e8ff  // sdot v31.4s, v7.16b, v5.4b[3]\n"
+      "bge 148b\n"
+      "149:"  // Height 6: Multiply loop: Single iteration only
+      "sub x11, x11, #0x10\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q4, [x22, #0x0]\n"
+      "ldr q5, [x20, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x4f85e0dc  // sdot v28.4s, v6.16b, v5.4b[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "add x22, x22, #0x10\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      "add x20, x20, #0x10\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0fd  // sdot v29.4s, v7.16b, v5.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0de  // sdot v30.4s, v6.16b, v5.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0ff  // sdot v31.4s, v7.16b, v5.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0d8  // sdot v24.4s, v6.16b, v4.4b[1]\n"
+      ".inst 0x4fa5e0dc  // sdot v28.4s, v6.16b, v5.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0f9  // sdot v25.4s, v7.16b, v4.4b[1]\n"
+      ".inst 0x4fa5e0fd  // sdot v29.4s, v7.16b, v5.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0da  // sdot v26.4s, v6.16b, v4.4b[1]\n"
+      ".inst 0x4fa5e0de  // sdot v30.4s, v6.16b, v5.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0fb  // sdot v27.4s, v7.16b, v4.4b[1]\n"
+      ".inst 0x4fa5e0ff  // sdot v31.4s, v7.16b, v5.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8d8  // sdot v24.4s, v6.16b, v4.4b[2]\n"
+      ".inst 0x4f85e8dc  // sdot v28.4s, v6.16b, v5.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8f9  // sdot v25.4s, v7.16b, v4.4b[2]\n"
+      ".inst 0x4f85e8fd  // sdot v29.4s, v7.16b, v5.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8da  // sdot v26.4s, v6.16b, v4.4b[2]\n"
+      ".inst 0x4f85e8de  // sdot v30.4s, v6.16b, v5.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8fb  // sdot v27.4s, v7.16b, v4.4b[2]\n"
+      ".inst 0x4f85e8ff  // sdot v31.4s, v7.16b, v5.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8d8  // sdot v24.4s, v6.16b, v4.4b[3]\n"
+      ".inst 0x4fa5e8dc  // sdot v28.4s, v6.16b, v5.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8f9  // sdot v25.4s, v7.16b, v4.4b[3]\n"
+      ".inst 0x4fa5e8fd  // sdot v29.4s, v7.16b, v5.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8da  // sdot v26.4s, v6.16b, v4.4b[3]\n"
+      ".inst 0x4fa5e8de  // sdot v30.4s, v6.16b, v5.4b[3]\n"
+      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8fb  // sdot v27.4s, v7.16b, v4.4b[3]\n"
+      ".inst 0x4fa5e8ff  // sdot v31.4s, v7.16b, v5.4b[3]\n"
+      "150:"  // Height 6: Multiply loop: Main loop skip
+      "cbz x11, 155f\n"
+      "cmp x11, #0x4\n"
+      "blt 152f\n"
+      "151:"  // Height 6: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
+      "ldr s3, [x24], #0x4\n"
+      "ldr s4, [x22], #0x4\n"
+      "ldr s5, [x20], #0x4\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "sub x11, x11, #0x4\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      "cmp x11, #0x4\n"
+      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0dc  // sdot v28.4s, v6.16b, v5.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0fd  // sdot v29.4s, v7.16b, v5.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0de  // sdot v30.4s, v6.16b, v5.4b[0]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0ff  // sdot v31.4s, v7.16b, v5.4b[0]\n"
+      "bge 151b\n"
+      "cbz x11, 155f\n"
+      "152:"  // Height 6: Multiply loop: Skip odd blocks
+      "tbz x11, #1, 153f\n"
+      "ldr h0, [x10], #0x2\n"
+      "ldr h1, [x28], #0x2\n"
+      "ldr h2, [x26], #0x2\n"
+      "ldr h3, [x24], #0x2\n"
+      "ldr h4, [x22], #0x2\n"
+      "ldr h5, [x20], #0x2\n"
+      "tbz x11, #0, 154f\n"
+      "ld1 { v0.b }[2], [x10]\n"
+      "ld1 { v1.b }[2], [x28]\n"
+      "ld1 { v2.b }[2], [x26]\n"
+      "ld1 { v3.b }[2], [x24]\n"
+      "ld1 { v4.b }[2], [x22]\n"
+      "ld1 { v5.b }[2], [x20]\n"
+      "b 154f\n"
+      "153:"  // Height 6: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b0, [x10, #0x0]\n"
+      "ldr b1, [x28, #0x0]\n"
+      "ldr b2, [x26, #0x0]\n"
+      "ldr b3, [x24, #0x0]\n"
+      "ldr b4, [x22, #0x0]\n"
+      "ldr b5, [x20, #0x0]\n"
+      "154:"  // Height 6: Multiply loop: Ragged operand read: Done
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0dc  // sdot v28.4s, v6.16b, v5.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0fd  // sdot v29.4s, v7.16b, v5.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0de  // sdot v30.4s, v6.16b, v5.4b[0]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0ff  // sdot v31.4s, v7.16b, v5.4b[0]\n"
+      "155:"  // Height 6: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 145b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "ldr q0, [x16, #0x0]\n"
+      "add v8.4s, v8.4s, v0.4s\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "ldr q1, [x16, #0x10]\n"
+      "add v12.4s, v12.4s, v0.4s\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "ldr q2, [x16, #0x20]\n"
+      "add v16.4s, v16.4s, v0.4s\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "ldr q3, [x16, #0x30]\n"
+      "add v20.4s, v20.4s, v0.4s\n"
+      "prfm pstl1keep, [x21, #0x0]\n"
+      "add x16, x16, #0x40\n"
+      "add v24.4s, v24.4s, v0.4s\n"
+      "add v28.4s, v28.4s, v0.4s\n"
+      "add v9.4s, v9.4s, v1.4s\n"
+      "add v10.4s, v10.4s, v2.4s\n"
+      "add v11.4s, v11.4s, v3.4s\n"
+      "add v13.4s, v13.4s, v1.4s\n"
+      "add v14.4s, v14.4s, v2.4s\n"
+      "add v15.4s, v15.4s, v3.4s\n"
+      "add v17.4s, v17.4s, v1.4s\n"
+      "add v18.4s, v18.4s, v2.4s\n"
+      "add v19.4s, v19.4s, v3.4s\n"
+      "add v21.4s, v21.4s, v1.4s\n"
+      "add v22.4s, v22.4s, v2.4s\n"
+      "add v23.4s, v23.4s, v3.4s\n"
+      "add v25.4s, v25.4s, v1.4s\n"
+      "add v26.4s, v26.4s, v2.4s\n"
+      "add v27.4s, v27.4s, v3.4s\n"
+      "add v29.4s, v29.4s, v1.4s\n"
+      "add v30.4s, v30.4s, v2.4s\n"
+      "add v31.4s, v31.4s, v3.4s\n"
+      "tbz %x[flags], #4, 156f\n"
+      "ldr q0, [x17, #0x0]\n"
+      "ldr q4, [x8, #0x0]\n"
+      "ldr q1, [x17, #0x10]\n"
+      "ldr q5, [x8, #0x10]\n"
+      "ldr q2, [x17, #0x20]\n"
+      "ldr q6, [x8, #0x20]\n"
+      "ldr q3, [x17, #0x30]\n"
+      "ldr q7, [x8, #0x30]\n"
+      "add x17, x17, #0x40\n"
+      "add x8, x8, #0x40\n"
+      "b 157f\n"
+      "156:"  // Height 6: per layer parameters
+      "add x19, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x19]\n"
+      "mov v1.16b, v0.16b\n"
+      "add x19, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x19]\n"
+      "mov v2.16b, v0.16b\n"
+      "mov v3.16b, v0.16b\n"
+      "mov v5.16b, v4.16b\n"
+      "mov v6.16b, v4.16b\n"
+      "mov v7.16b, v4.16b\n"
+      "157:"  // Height 6: parameters loaded
+      "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+      "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+      "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+      "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+      "sqrdmulh v12.4s, v12.4s, v4.4s\n"
+      "sqrdmulh v13.4s, v13.4s, v5.4s\n"
+      "sqrdmulh v14.4s, v14.4s, v6.4s\n"
+      "sqrdmulh v15.4s, v15.4s, v7.4s\n"
+      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v5.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v7.4s\n"
+      "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+      "sqrdmulh v21.4s, v21.4s, v5.4s\n"
+      "sqrdmulh v22.4s, v22.4s, v6.4s\n"
+      "sqrdmulh v23.4s, v23.4s, v7.4s\n"
+      "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+      "sqrdmulh v25.4s, v25.4s, v5.4s\n"
+      "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+      "sqrdmulh v27.4s, v27.4s, v7.4s\n"
+      "sqrdmulh v28.4s, v28.4s, v4.4s\n"
+      "sqrdmulh v29.4s, v29.4s, v5.4s\n"
+      "sqrdmulh v30.4s, v30.4s, v6.4s\n"
+      "sqrdmulh v31.4s, v31.4s, v7.4s\n"
+      "tbz %x[flags], #5, 158f\n"
+      "and v4.16b, v8.16b, v0.16b\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "and v5.16b, v9.16b, v1.16b\n"
+      "and v6.16b, v10.16b, v2.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "and v7.16b, v11.16b, v3.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v8.4s, v8.4s, v4.4s\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "and v4.16b, v12.16b, v0.16b\n"
+      "sqadd v9.4s, v9.4s, v5.4s\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sqadd v10.4s, v10.4s, v6.4s\n"
+      "and v5.16b, v13.16b, v1.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sqadd v11.4s, v11.4s, v7.4s\n"
+      "and v6.16b, v14.16b, v2.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v12.4s, v12.4s, v4.4s\n"
+      "and v7.16b, v15.16b, v3.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sqadd v13.4s, v13.4s, v5.4s\n"
+      "and v4.16b, v16.16b, v0.16b\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sqadd v14.4s, v14.4s, v6.4s\n"
+      "and v5.16b, v17.16b, v1.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sqadd v15.4s, v15.4s, v7.4s\n"
+      "and v6.16b, v18.16b, v2.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v4.4s\n"
+      "and v7.16b, v19.16b, v3.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sqadd v17.4s, v17.4s, v5.4s\n"
+      "and v4.16b, v20.16b, v0.16b\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sqadd v18.4s, v18.4s, v6.4s\n"
+      "and v5.16b, v21.16b, v1.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sqadd v19.4s, v19.4s, v7.4s\n"
+      "and v6.16b, v22.16b, v2.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v20.4s, v20.4s, v4.4s\n"
+      "and v7.16b, v23.16b, v3.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sqadd v21.4s, v21.4s, v5.4s\n"
+      "and v4.16b, v24.16b, v0.16b\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sqadd v22.4s, v22.4s, v6.4s\n"
+      "and v5.16b, v25.16b, v1.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sqadd v23.4s, v23.4s, v7.4s\n"
+      "and v6.16b, v26.16b, v2.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v24.4s, v24.4s, v4.4s\n"
+      "and v7.16b, v27.16b, v3.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sqadd v25.4s, v25.4s, v5.4s\n"
+      "and v4.16b, v28.16b, v0.16b\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sqadd v26.4s, v26.4s, v6.4s\n"
+      "and v5.16b, v29.16b, v1.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sqadd v27.4s, v27.4s, v7.4s\n"
+      "and v6.16b, v30.16b, v2.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v28.4s, v28.4s, v4.4s\n"
+      "and v7.16b, v31.16b, v3.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sqadd v29.4s, v29.4s, v5.4s\n"
+      "sqadd v30.4s, v30.4s, v6.4s\n"
+      "sqadd v31.4s, v31.4s, v7.4s\n"
+      "158:"  // Height 6: no shift correction
+      "srshl v8.4s, v8.4s, v0.4s\n"
+      "add x19, %x[qp], %[c_offset]\n"
+      "ld1r { v4.4s }, [x19]\n"
+      "srshl v9.4s, v9.4s, v1.4s\n"
+      "add x19, %x[qp], %[minval]\n"
+      "srshl v10.4s, v10.4s, v2.4s\n"
+      "ld1r { v5.4s }, [x19]\n"
+      "add x19, %x[qp], %[maxval]\n"
+      "srshl v11.4s, v11.4s, v3.4s\n"
+      "ld1r { v6.4s }, [x19]\n"
+      "cmp x15, #0x10\n"
+      "srshl v12.4s, v12.4s, v0.4s\n"
+      "srshl v13.4s, v13.4s, v1.4s\n"
+      "srshl v14.4s, v14.4s, v2.4s\n"
+      "srshl v15.4s, v15.4s, v3.4s\n"
+      "add v8.4s, v8.4s, v4.4s\n"
+      "add v9.4s, v9.4s, v4.4s\n"
+      "add v10.4s, v10.4s, v4.4s\n"
+      "smin v8.4s, v8.4s, v6.4s\n"
+      "smin v9.4s, v9.4s, v6.4s\n"
+      "smin v10.4s, v10.4s, v6.4s\n"
+      "smax v8.4s, v8.4s, v5.4s\n"
+      "smax v9.4s, v9.4s, v5.4s\n"
+      "smax v10.4s, v10.4s, v5.4s\n"
+      "add v11.4s, v11.4s, v4.4s\n"
+      "add v12.4s, v12.4s, v4.4s\n"
+      "add v13.4s, v13.4s, v4.4s\n"
+      "smin v11.4s, v11.4s, v6.4s\n"
+      "smin v12.4s, v12.4s, v6.4s\n"
+      "smin v13.4s, v13.4s, v6.4s\n"
+      "smax v11.4s, v11.4s, v5.4s\n"
+      "smax v12.4s, v12.4s, v5.4s\n"
+      "smax v13.4s, v13.4s, v5.4s\n"
+      "add v14.4s, v14.4s, v4.4s\n"
+      "add v15.4s, v15.4s, v4.4s\n"
+      "srshl v16.4s, v16.4s, v0.4s\n"
+      "smin v14.4s, v14.4s, v6.4s\n"
+      "smin v15.4s, v15.4s, v6.4s\n"
+      "srshl v17.4s, v17.4s, v1.4s\n"
+      "smax v14.4s, v14.4s, v5.4s\n"
+      "smax v15.4s, v15.4s, v5.4s\n"
+      "add v16.4s, v16.4s, v4.4s\n"
+      "add v17.4s, v17.4s, v4.4s\n"
+      "srshl v18.4s, v18.4s, v2.4s\n"
+      "smin v16.4s, v16.4s, v6.4s\n"
+      "smin v17.4s, v17.4s, v6.4s\n"
+      "srshl v19.4s, v19.4s, v3.4s\n"
+      "smax v16.4s, v16.4s, v5.4s\n"
+      "smax v17.4s, v17.4s, v5.4s\n"
+      "add v18.4s, v18.4s, v4.4s\n"
+      "add v19.4s, v19.4s, v4.4s\n"
+      "srshl v20.4s, v20.4s, v0.4s\n"
+      "smin v18.4s, v18.4s, v6.4s\n"
+      "smin v19.4s, v19.4s, v6.4s\n"
+      "srshl v21.4s, v21.4s, v1.4s\n"
+      "smax v18.4s, v18.4s, v5.4s\n"
+      "smax v19.4s, v19.4s, v5.4s\n"
+      "add v20.4s, v20.4s, v4.4s\n"
+      "add v21.4s, v21.4s, v4.4s\n"
+      "srshl v22.4s, v22.4s, v2.4s\n"
+      "smin v20.4s, v20.4s, v6.4s\n"
+      "smin v21.4s, v21.4s, v6.4s\n"
+      "srshl v23.4s, v23.4s, v3.4s\n"
+      "smax v20.4s, v20.4s, v5.4s\n"
+      "smax v21.4s, v21.4s, v5.4s\n"
+      "add v22.4s, v22.4s, v4.4s\n"
+      "add v23.4s, v23.4s, v4.4s\n"
+      "srshl v24.4s, v24.4s, v0.4s\n"
+      "smin v22.4s, v22.4s, v6.4s\n"
+      "smin v23.4s, v23.4s, v6.4s\n"
+      "srshl v25.4s, v25.4s, v1.4s\n"
+      "smax v22.4s, v22.4s, v5.4s\n"
+      "smax v23.4s, v23.4s, v5.4s\n"
+      "add v24.4s, v24.4s, v4.4s\n"
+      "add v25.4s, v25.4s, v4.4s\n"
+      "srshl v26.4s, v26.4s, v2.4s\n"
+      "smin v24.4s, v24.4s, v6.4s\n"
+      "smin v25.4s, v25.4s, v6.4s\n"
+      "srshl v27.4s, v27.4s, v3.4s\n"
+      "smax v24.4s, v24.4s, v5.4s\n"
+      "smax v25.4s, v25.4s, v5.4s\n"
+      "add v26.4s, v26.4s, v4.4s\n"
+      "add v27.4s, v27.4s, v4.4s\n"
+      "srshl v28.4s, v28.4s, v0.4s\n"
+      "smin v26.4s, v26.4s, v6.4s\n"
+      "smin v27.4s, v27.4s, v6.4s\n"
+      "srshl v29.4s, v29.4s, v1.4s\n"
+      "smax v26.4s, v26.4s, v5.4s\n"
+      "smax v27.4s, v27.4s, v5.4s\n"
+      "add v28.4s, v28.4s, v4.4s\n"
+      "add v29.4s, v29.4s, v4.4s\n"
+      "srshl v30.4s, v30.4s, v2.4s\n"
+      "smin v28.4s, v28.4s, v6.4s\n"
+      "smin v29.4s, v29.4s, v6.4s\n"
+      "srshl v31.4s, v31.4s, v3.4s\n"
+      "smax v28.4s, v28.4s, v5.4s\n"
+      "smax v29.4s, v29.4s, v5.4s\n"
+      "add v30.4s, v30.4s, v4.4s\n"
+      "add v31.4s, v31.4s, v4.4s\n"
+      "uzp1 v8.8h, v8.8h, v9.8h\n"
+      "smin v30.4s, v30.4s, v6.4s\n"
+      "smin v31.4s, v31.4s, v6.4s\n"
+      "uzp1 v9.8h, v10.8h, v11.8h\n"
+      "smax v30.4s, v30.4s, v5.4s\n"
+      "smax v31.4s, v31.4s, v5.4s\n"
+      "uzp1 v12.8h, v12.8h, v13.8h\n"
+      "uzp1 v13.8h, v14.8h, v15.8h\n"
+      "uzp1 v16.8h, v16.8h, v17.8h\n"
+      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v20.8h, v20.8h, v21.8h\n"
+      "uzp1 v21.8h, v22.8h, v23.8h\n"
+      "uzp1 v24.8h, v24.8h, v25.8h\n"
+      "uzp1 v25.8h, v26.8h, v27.8h\n"
+      "uzp1 v28.8h, v28.8h, v29.8h\n"
+      "uzp1 v29.8h, v30.8h, v31.8h\n"
+      "uzp1 v8.16b, v8.16b, v9.16b\n"
+      "uzp1 v12.16b, v12.16b, v13.16b\n"
+      "uzp1 v16.16b, v16.16b, v17.16b\n"
+      "uzp1 v20.16b, v20.16b, v21.16b\n"
+      "uzp1 v24.16b, v24.16b, v25.16b\n"
+      "uzp1 v28.16b, v28.16b, v29.16b\n"
+      "bge 167f\n"
+      "tbz x15, #3, 162f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "str d16, [x27], #0x8\n"
+      "str d20, [x25], #0x8\n"
+      "str d24, [x23], #0x8\n"
+      "str d28, [x21], #0x8\n"
+      "tbz x15, #2, 160f\n"
+      "st1 { v8.s }[2], [x13], #0x4\n"
+      "st1 { v12.s }[2], [x9], #0x4\n"
+      "st1 { v16.s }[2], [x27], #0x4\n"
+      "st1 { v20.s }[2], [x25], #0x4\n"
+      "st1 { v24.s }[2], [x23], #0x4\n"
+      "st1 { v28.s }[2], [x21], #0x4\n"
+      "tbz x15, #1, 159f\n"
+      "st1 { v8.h }[6], [x13], #0x2\n"
+      "st1 { v12.h }[6], [x9], #0x2\n"
+      "st1 { v16.h }[6], [x27], #0x2\n"
+      "st1 { v20.h }[6], [x25], #0x2\n"
+      "st1 { v24.h }[6], [x23], #0x2\n"
+      "st1 { v28.h }[6], [x21], #0x2\n"
+      "tbz x15, #0, 166f\n"
+      "st1 { v8.b }[14], [x13]\n"
+      "st1 { v12.b }[14], [x9]\n"
+      "st1 { v16.b }[14], [x27]\n"
+      "st1 { v20.b }[14], [x25]\n"
+      "st1 { v24.b }[14], [x23]\n"
+      "st1 { v28.b }[14], [x21]\n"
+      "b 166f\n"
+      "159:"  // Height 6: Partial direct writeback: partial_1_12
+      "tbz x15, #0, 166f\n"
+      "st1 { v8.b }[12], [x13]\n"
+      "st1 { v12.b }[12], [x9]\n"
+      "st1 { v16.b }[12], [x27]\n"
+      "st1 { v20.b }[12], [x25]\n"
+      "st1 { v24.b }[12], [x23]\n"
+      "st1 { v28.b }[12], [x21]\n"
+      "b 166f\n"
+      "160:"  // Height 6: Partial direct writeback: partial_2_8
+      "tbz x15, #1, 161f\n"
+      "st1 { v8.h }[4], [x13], #0x2\n"
+      "st1 { v12.h }[4], [x9], #0x2\n"
+      "st1 { v16.h }[4], [x27], #0x2\n"
+      "st1 { v20.h }[4], [x25], #0x2\n"
+      "st1 { v24.h }[4], [x23], #0x2\n"
+      "st1 { v28.h }[4], [x21], #0x2\n"
+      "tbz x15, #0, 166f\n"
+      "st1 { v8.b }[10], [x13]\n"
+      "st1 { v12.b }[10], [x9]\n"
+      "st1 { v16.b }[10], [x27]\n"
+      "st1 { v20.b }[10], [x25]\n"
+      "st1 { v24.b }[10], [x23]\n"
+      "st1 { v28.b }[10], [x21]\n"
+      "b 166f\n"
+      "161:"  // Height 6: Partial direct writeback: partial_1_8
+      "tbz x15, #0, 166f\n"
+      "st1 { v8.b }[8], [x13]\n"
+      "st1 { v12.b }[8], [x9]\n"
+      "st1 { v16.b }[8], [x27]\n"
+      "st1 { v20.b }[8], [x25]\n"
+      "st1 { v24.b }[8], [x23]\n"
+      "st1 { v28.b }[8], [x21]\n"
+      "b 166f\n"
+      "162:"  // Height 6: Partial direct writeback: partial_4_0
+      "tbz x15, #2, 164f\n"
+      "str s8, [x13], #0x4\n"
+      "str s12, [x9], #0x4\n"
+      "str s16, [x27], #0x4\n"
+      "str s20, [x25], #0x4\n"
+      "str s24, [x23], #0x4\n"
+      "str s28, [x21], #0x4\n"
+      "tbz x15, #1, 163f\n"
+      "st1 { v8.h }[2], [x13], #0x2\n"
+      "st1 { v12.h }[2], [x9], #0x2\n"
+      "st1 { v16.h }[2], [x27], #0x2\n"
+      "st1 { v20.h }[2], [x25], #0x2\n"
+      "st1 { v24.h }[2], [x23], #0x2\n"
+      "st1 { v28.h }[2], [x21], #0x2\n"
+      "tbz x15, #0, 166f\n"
+      "st1 { v8.b }[6], [x13]\n"
+      "st1 { v12.b }[6], [x9]\n"
+      "st1 { v16.b }[6], [x27]\n"
+      "st1 { v20.b }[6], [x25]\n"
+      "st1 { v24.b }[6], [x23]\n"
+      "st1 { v28.b }[6], [x21]\n"
+      "b 166f\n"
+      "163:"  // Height 6: Partial direct writeback: partial_1_4
+      "tbz x15, #0, 166f\n"
+      "st1 { v8.b }[4], [x13]\n"
+      "st1 { v12.b }[4], [x9]\n"
+      "st1 { v16.b }[4], [x27]\n"
+      "st1 { v20.b }[4], [x25]\n"
+      "st1 { v24.b }[4], [x23]\n"
+      "st1 { v28.b }[4], [x21]\n"
+      "b 166f\n"
+      "164:"  // Height 6: Partial direct writeback: partial_2_0
+      "tbz x15, #1, 165f\n"
+      "str h8, [x13], #0x2\n"
+      "str h12, [x9], #0x2\n"
+      "str h16, [x27], #0x2\n"
+      "str h20, [x25], #0x2\n"
+      "str h24, [x23], #0x2\n"
+      "str h28, [x21], #0x2\n"
+      "tbz x15, #0, 166f\n"
+      "st1 { v8.b }[2], [x13]\n"
+      "st1 { v12.b }[2], [x9]\n"
+      "st1 { v16.b }[2], [x27]\n"
+      "st1 { v20.b }[2], [x25]\n"
+      "st1 { v24.b }[2], [x23]\n"
+      "st1 { v28.b }[2], [x21]\n"
+      "b 166f\n"
+      "165:"  // Height 6: Partial direct writeback: partial_1_0
+      "str b8, [x13, #0x0]\n"
+      "str b12, [x9, #0x0]\n"
+      "str b16, [x27, #0x0]\n"
+      "str b20, [x25, #0x0]\n"
+      "str b24, [x23, #0x0]\n"
+      "str b28, [x21, #0x0]\n"
+      "166:"  // Height 6: Partial direct writeback: Done
+      "b 168f\n"
+      "167:"  // Height 6: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q12, [x9, #0x0]\n"
+      "str q16, [x27, #0x0]\n"
+      "str q20, [x25, #0x0]\n"
+      "str q24, [x23, #0x0]\n"
+      "str q28, [x21, #0x0]\n"
+      "add x13, x13, #0x10\n"
+      "add x9, x9, #0x10\n"
+      "add x27, x27, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "add x23, x23, #0x10\n"
+      "add x21, x21, #0x10\n"
+      "168:"  // Height 6: Writeback done
+      "subs x15, x15, #0x10\n"
+      "bgt 143b\n"
+      "subs %x[M], %x[M], #0x6\n"
+      "beq 170f\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "tbz %x[flags], #3, 169f\n"
+      "add x20, x20, #0x6\n"
+      "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "b 1b\n"
+      "169:"  // Update direct input
+      "mov x19, #0x6\n"
+      "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+      "b 1b\n"
+      "170:"  // Exit
+
+      : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+      : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+    );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
deleted file mode 100644
index 4a7cdc59a7..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
+++ /dev/null
@@ -1,2434 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include <cstdint>
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation , bool accumulate) {
-    const int K_stride = ((K + 3) / 4) * 4;
-    const long loops_count = ((K + 16) / 32) - 1;
-    K -= loops_count * 32;
-    const long regs_count = (K / 16) - 1;
-    K -= (regs_count + 1) * 16;
-    const long blocks_count = K / 4;
-    const long odds_count = K - (blocks_count * 4);
-
-    int rows_to_compute;
-
-    for (int y=0; y<M; y+=rows_to_compute) {
-        const int8_t * const a_ptr0_base = A + (y * lda);
-        const unsigned long ldab = lda * sizeof(int8_t);
-
-        int32_t *c_ptr0 = C + (y * ldc);
-
-        rows_to_compute = M-y;
-        if (rows_to_compute > 4) {
-            if (rows_to_compute % 4) {
-                rows_to_compute = 4 - 1;
-            } else {
-                rows_to_compute = 4;
-            }
-        }
-
-        for (int x0=0; x0<N; x0+=16ul) {
-            const long width = std::min((unsigned long)N-x0, 16ul);
-            long loops = loops_count;
-            long regs = regs_count;
-            long blocks = blocks_count;
-            long odds = odds_count;
-            const int8_t *a_ptr0 = a_ptr0_base;
-            const int8_t *b_ptr0 = B + (K_stride * x0);
-            const bool use_result_buffer = (width < 16);
-            int32_t result_buffer[64];
-            const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(int32_t);
-            int32_t *c_ptr_real = c_ptr0;
-            if (use_result_buffer && accumulate) {
-                for(int cy=0; cy<std::min(M-y, 4); cy++) {
-                    for(unsigned int cx=0; cx<width; cx++) {
-                        result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
-                    }
-                }
-            }
-            if (use_result_buffer) {
-                c_ptr0 = result_buffer;
-            }
-
-            switch(rows_to_compute) {
-                case 1:
-                    __asm __volatile (
-                        "temploadreg0 .req X0\n"
-                        "temploadreg1 .req X1\n"
-                        "temploadreg2 .req X2\n"
-                        "temploadreg3 .req X3\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "ldr d0, [%[a_ptr0], #-0x10]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "cbz %[regs], 4f\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "5:\n"
-                        "cbz %[blocks], 6f\n"
-                        "7:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "b.ne 7b\n"
-                        "6:\n"
-                        "cbz %[odds], 8f\n"
-                        "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[2], [%[a_ptr0]]\n"
-                        "9:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "8:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
-                    );
-                    break;
-                case 2:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "temploadreg0 .req X2\n"
-                        "temploadreg1 .req X3\n"
-                        "temploadreg2 .req X4\n"
-                        "temploadreg3 .req X5\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr d0, [%[a_ptr0], #-0x10]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
-                        ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr d1, [a_ptr1, #-0x10]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
-                        ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "cbz %[regs], 4f\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "5:\n"
-                        "cbz %[blocks], 6f\n"
-                        "7:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr s1, [a_ptr1]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "b.ne 7b\n"
-                        "6:\n"
-                        "cbz %[odds], 8f\n"
-                        "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
-                        "ld1 {v1.b}[0], [a_ptr1], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
-                        "ld1 {v1.b}[1], [a_ptr1], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[2], [%[a_ptr0]]\n"
-                        "ld1 {v1.b}[2], [a_ptr1]\n"
-                        "9:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "8:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
-                    );
-                    break;
-                case 3:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "temploadreg0 .req X4\n"
-                        "temploadreg1 .req X5\n"
-                        "temploadreg2 .req X6\n"
-                        "temploadreg3 .req X7\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v24.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "movi v25.4s, #0\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "movi v26.4s, #0\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "movi v27.4s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr d0, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
-                        ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr d1, [a_ptr1, #-0x10]\n"
-                        ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
-                        ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "ldr d2, [a_ptr2, #-0x10]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg2, [a_ptr2, #-0x8]\n"
-                        ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ins v2.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "cbz %[regs], 4f\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "5:\n"
-                        "cbz %[blocks], 6f\n"
-                        "7:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr s1, [a_ptr1]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr s2, [a_ptr2]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x4\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "b.ne 7b\n"
-                        "6:\n"
-                        "cbz %[odds], 8f\n"
-                        "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
-                        "ld1 {v1.b}[0], [a_ptr1], #1\n"
-                        "ld1 {v2.b}[0], [a_ptr2], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
-                        "ld1 {v1.b}[1], [a_ptr1], #1\n"
-                        "ld1 {v2.b}[1], [a_ptr2], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[2], [%[a_ptr0]]\n"
-                        "ld1 {v1.b}[2], [a_ptr1]\n"
-                        "ld1 {v2.b}[2], [a_ptr2]\n"
-                        "9:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "8:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "str q24, [c_ptr2]\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
-                    );
-                    break;
-                default:
-                case 4:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "temploadreg0 .req X6\n"
-                        "temploadreg1 .req X7\n"
-                        "temploadreg2 .req X8\n"
-                        "temploadreg3 .req X9\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v24.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v25.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "movi v26.4s, #0\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "movi v27.4s, #0\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "movi v28.4s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "movi v29.4s, #0\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "movi v30.4s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "movi v31.4s, #0\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "ldr q28, [c_ptr3]\n"
-                        "ldr q29, [c_ptr3, #0x10]\n"
-                        "ldr q30, [c_ptr3, #0x20]\n"
-                        "ldr q31, [c_ptr3, #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d7, [a_ptr3]\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
-                        ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ins v7.d[1], temploadreg3\n"
-                        ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr d0, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
-                        ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".inst 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ldr d1, [a_ptr1, #-0x10]\n"
-                        ".inst 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
-                        ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        ".inst 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr d2, [a_ptr2, #-0x10]\n"
-                        ".inst 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
-                        "ldr temploadreg2, [a_ptr2, #-0x8]\n"
-                        ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ins v2.d[1], temploadreg2\n"
-                        ".inst 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
-                        "ldr d3, [a_ptr3, #-0x10]\n"
-                        ".inst 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
-                        "ldr temploadreg3, [a_ptr3, #-0x8]\n"
-                        ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
-                        "ins v3.d[1], temploadreg3\n"
-                        ".inst 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".inst 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "prfm PSTL1KEEP, [c_ptr3]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "cbz %[regs], 4f\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr d7, [a_ptr3]\n"
-                        ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ins v7.d[1], temploadreg3\n"
-                        ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
-                        ".inst 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
-                        ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
-                        ".inst 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
-                        ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
-                        ".inst 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
-                        ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
-                        ".inst 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
-                        ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".inst 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
-                        ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".inst 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
-                        ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".inst 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
-                        ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
-                        ".inst 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
-                        ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
-                        ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
-                        ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
-                        ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
-                        ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
-                        ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
-                        ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
-                        "5:\n"
-                        "cbz %[blocks], 6f\n"
-                        "7:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr s1, [a_ptr1]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr s2, [a_ptr2]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x4\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr s3, [a_ptr3]\n"
-                        ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "add a_ptr3, a_ptr3, #0x4\n"
-                        ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
-                        ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
-                        "b.ne 7b\n"
-                        "6:\n"
-                        "cbz %[odds], 8f\n"
-                        "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
-                        "ld1 {v1.b}[0], [a_ptr1], #1\n"
-                        "ld1 {v2.b}[0], [a_ptr2], #1\n"
-                        "ld1 {v3.b}[0], [a_ptr3], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
-                        "ld1 {v1.b}[1], [a_ptr1], #1\n"
-                        "ld1 {v2.b}[1], [a_ptr2], #1\n"
-                        "ld1 {v3.b}[1], [a_ptr3], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[2], [%[a_ptr0]]\n"
-                        "ld1 {v1.b}[2], [a_ptr1]\n"
-                        "ld1 {v2.b}[2], [a_ptr2]\n"
-                        "ld1 {v3.b}[2], [a_ptr3]\n"
-                        "9:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
-                        "8:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "str q24, [c_ptr2]\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        "str q28, [c_ptr3]\n"
-                        "str q29, [c_ptr3, #0x10]\n"
-                        "str q30, [c_ptr3, #0x20]\n"
-                        "str q31, [c_ptr3, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
-                    );
-                    break;
-            }
-            if (use_result_buffer) {
-                for(int cy=0; cy<std::min(M-y, 4); cy++) {
-                    for(unsigned int cx=0; cx<width; cx++) {
-                        c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
-                    }
-                }
-            }
-        }
-    }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
deleted file mode 100644
index da39a32690..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
+++ /dev/null
@@ -1,1808 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include <cstdint>
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void a64_hybrid_s8s32_dot_16x4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation , bool accumulate) {
-    const int K_stride = ((K + 3) / 4) * 4;
-    const long loops_count = ((K + 16) / 32) - 1;
-    K -= loops_count * 32;
-    const long regs_count = (K / 16) - 1;
-    K -= (regs_count + 1) * 16;
-    const long blocks_count = K / 4;
-    const long odds_count = K - (blocks_count * 4);
-
-    int rows_to_compute;
-
-    for (int y=0; y<M; y+=rows_to_compute) {
-        const int8_t * const a_ptr0_base = A + (y * lda);
-        const unsigned long ldab = lda * sizeof(int8_t);
-
-        int32_t *c_ptr0 = C + (y * ldc);
-
-        rows_to_compute = M-y;
-        if (rows_to_compute > 4) {
-            if (rows_to_compute % 4) {
-                rows_to_compute = 4 - 1;
-            } else {
-                rows_to_compute = 4;
-            }
-        }
-
-        for (int x0=0; x0<N; x0+=16ul) {
-            const long width = std::min((unsigned long)N-x0, 16ul);
-            long loops = loops_count;
-            long regs = regs_count;
-            long blocks = blocks_count;
-            long odds = odds_count;
-            const int8_t *a_ptr0 = a_ptr0_base;
-            const int8_t *b_ptr0 = B + (K_stride * x0);
-            const bool use_result_buffer = (width < 16);
-            int32_t result_buffer[64];
-            const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(int32_t);
-            int32_t *c_ptr_real = c_ptr0;
-            if (use_result_buffer && accumulate) {
-                for(int cy=0; cy<std::min(M-y, 4); cy++) {
-                    for(unsigned int cx=0; cx<width; cx++) {
-                        result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
-                    }
-                }
-            }
-            if (use_result_buffer) {
-                c_ptr0 = result_buffer;
-            }
-
-            switch(rows_to_compute) {
-                case 1:
-                    __asm __volatile (
-                        "cbnz %[accumulate], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "cbz %[regs], 4f\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "5:\n"
-                        "cbz %[blocks], 6f\n"
-                        "7:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "b.ne 7b\n"
-                        "6:\n"
-                        "cbz %[odds], 8f\n"
-                        "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[2], [%[a_ptr0]]\n"
-                        "9:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "8:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
-                    );
-                    break;
-                case 2:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "cbz %[regs], 4f\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "5:\n"
-                        "cbz %[blocks], 6f\n"
-                        "7:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr s1, [a_ptr1]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "b.ne 7b\n"
-                        "6:\n"
-                        "cbz %[odds], 8f\n"
-                        "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
-                        "ld1 {v1.b}[0], [a_ptr1], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
-                        "ld1 {v1.b}[1], [a_ptr1], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[2], [%[a_ptr0]]\n"
-                        "ld1 {v1.b}[2], [a_ptr1]\n"
-                        "9:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "8:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
-                    );
-                    break;
-                case 3:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v24.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "movi v25.4s, #0\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "movi v26.4s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "movi v27.4s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q2, [a_ptr2, #-0x10]\n"
-                        ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "cbz %[regs], 4f\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
-                        ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
-                        ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
-                        ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
-                        ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "5:\n"
-                        "cbz %[blocks], 6f\n"
-                        "7:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr s1, [a_ptr1]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr s2, [a_ptr2]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x4\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "b.ne 7b\n"
-                        "6:\n"
-                        "cbz %[odds], 8f\n"
-                        "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
-                        "ld1 {v1.b}[0], [a_ptr1], #1\n"
-                        "ld1 {v2.b}[0], [a_ptr2], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
-                        "ld1 {v1.b}[1], [a_ptr1], #1\n"
-                        "ld1 {v2.b}[1], [a_ptr2], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[2], [%[a_ptr0]]\n"
-                        "ld1 {v1.b}[2], [a_ptr1]\n"
-                        "ld1 {v2.b}[2], [a_ptr2]\n"
-                        "9:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "8:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "str q24, [c_ptr2]\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
-                    );
-                    break;
-                default:
-                case 4:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v24.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v25.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "movi v26.4s, #0\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "movi v27.4s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "movi v28.4s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "movi v29.4s, #0\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "movi v30.4s, #0\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "movi v31.4s, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "ldr q28, [c_ptr3]\n"
-                        "ldr q29, [c_ptr3, #0x10]\n"
-                        "ldr q30, [c_ptr3, #0x20]\n"
-                        "ldr q31, [c_ptr3, #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q7, [a_ptr3]\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
-                        ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        ".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr q2, [a_ptr2, #-0x10]\n"
-                        ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q3, [a_ptr3, #-0x10]\n"
-                        ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
-                        ".inst 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
-                        ".inst 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
-                        ".inst 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
-                        ".inst 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
-                        ".inst 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
-                        ".inst 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
-                        ".inst 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
-                        ".inst 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
-                        ".inst 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
-                        ".inst 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
-                        ".inst 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
-                        ".inst 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".inst 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".inst 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".inst 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
-                        ".inst 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "prfm PSTL1KEEP, [c_ptr3]\n"
-                        "cbz %[regs], 4f\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr q7, [a_ptr3]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        ".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
-                        ".inst 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
-                        ".inst 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
-                        ".inst 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
-                        ".inst 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
-                        ".inst 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
-                        ".inst 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
-                        ".inst 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
-                        ".inst 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
-                        ".inst 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
-                        ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
-                        ".inst 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
-                        ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
-                        ".inst 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
-                        ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
-                        ".inst 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
-                        ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".inst 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
-                        ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".inst 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
-                        ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".inst 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
-                        ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
-                        ".inst 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        ".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
-                        ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
-                        ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
-                        ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
-                        ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
-                        ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
-                        ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
-                        ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
-                        "5:\n"
-                        "cbz %[blocks], 6f\n"
-                        "7:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr s1, [a_ptr1]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr s2, [a_ptr2]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x4\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr s3, [a_ptr3]\n"
-                        ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "add a_ptr3, a_ptr3, #0x4\n"
-                        ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
-                        ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
-                        "b.ne 7b\n"
-                        "6:\n"
-                        "cbz %[odds], 8f\n"
-                        "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
-                        "ld1 {v1.b}[0], [a_ptr1], #1\n"
-                        "ld1 {v2.b}[0], [a_ptr2], #1\n"
-                        "ld1 {v3.b}[0], [a_ptr3], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
-                        "ld1 {v1.b}[1], [a_ptr1], #1\n"
-                        "ld1 {v2.b}[1], [a_ptr2], #1\n"
-                        "ld1 {v3.b}[1], [a_ptr3], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[2], [%[a_ptr0]]\n"
-                        "ld1 {v1.b}[2], [a_ptr1]\n"
-                        "ld1 {v2.b}[2], [a_ptr2]\n"
-                        "ld1 {v3.b}[2], [a_ptr3]\n"
-                        "9:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
-                        "8:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "str q24, [c_ptr2]\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        "str q28, [c_ptr3]\n"
-                        "str q29, [c_ptr3, #0x10]\n"
-                        "str q30, [c_ptr3, #0x20]\n"
-                        "str q31, [c_ptr3, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
-                    );
-                    break;
-            }
-            if (use_result_buffer) {
-                for(int cy=0; cy<std::min(M-y, 4); cy++) {
-                    for(unsigned int cx=0; cx<width; cx++) {
-                        c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
-                    }
-                }
-            }
-        }
-    }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp
new file mode 100644
index 0000000000..16a6f9213a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+
+// Parameter-type list shared by the kernel prototype and the kern_type
+// function-pointer type, so both are always spelled identically. It is
+// removed again by the "#undef ARGLIST" at the end of this header.
+#define ARGLIST \
+   unsigned int, const unsigned int *, IndirectInputArg<int8_t>, \
+   size_t, size_t, const int8_t *, IndirectOutputArg<int32_t>, \
+   const int32_t *, Activation, bool
+
+namespace arm_gemm
+{
+
+// Generic kernel implementation; its parameter types are given by ARGLIST.
+void a64_hybrid_s8s32_dot_6x16( ARGLIST );
+
+// Descriptor for the a64_hybrid_s8s32_dot_6x16 kernel: publishes its operand
+// and result types, its blocking parameters, the matching transforms and a
+// pointer to the kernel function.
+class cls_a64_hybrid_s8s32_dot_6x16
+{
+public:
+    using operand_type = int8_t;  // input element type
+    using result_type = int32_t;  // output element type
+
+    // Function-pointer type with the same parameter list as the kernel.
+    using kern_type = void (*)( ARGLIST );
+
+    /* Kernel blocking parameters */
+    // Rows of the output tile (the "6" in 6x16).
+    static constexpr unsigned int out_height() { return 6; }
+
+    // Columns of the output tile (the "16" in 6x16). constexpr, like the other
+    // blocking parameters, so it is usable in constant expressions.
+    static constexpr unsigned int out_width() { return 16; }
+
+    // Unroll factor along the K dimension.
+    static constexpr unsigned int k_unroll() { return 4; }
+
+    // Advertises accumulate support (the kernel's final bool parameter).
+    static constexpr bool supports_accumulate() { return true; }
+
+    // The 6, 16, 4 arguments repeat the out_height(), out_width() and
+    // k_unroll() values above (presumably they must be kept in sync).
+    StdTransformsFixed<operand_type, result_type, 6, 16, 4> transforms = {};
+
+    // Default to the generic kernel
+    kern_type kernel = a64_hybrid_s8s32_dot_6x16;
+
+    // The CPUInfo pointer is accepted but not used.
+    cls_a64_hybrid_s8s32_dot_6x16(const CPUInfo *)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp
new file mode 100644
index 0000000000..3257986410
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp
@@ -0,0 +1,3335 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_hybrid_s8s32_dot_6x16 (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+    size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int32_t> output_arg,
+    const int32_t *, Activation, bool accumulate
+)
+{
+    struct KernelArgs {
+        unsigned int num_strings = {};
+        const unsigned int *string_lengths = {};
+        size_t N = {};
+        const int8_t *B_ptr = {};
+        size_t output_offset = {};
+        size_t input_initial_col = {};
+        size_t input_offset = {};
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    if (accumulate) {
+        flags |= 0x1;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    __asm__ __volatile__(
+
+      "1:"  // Row loop
+      "cmp %x[M], #0x6\n"
+      "bge 176f\n"
+      "cmp %x[M], #0x4\n"
+      "bgt 141f\n"
+      "beq 106f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 71f\n"
+      "beq 36f\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 2f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "b 3f\n"
+      "2:"  // Height 1: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "3:"  // Height 1: Column loop
+      "tbz %x[flags], #0, 13f\n"
+      "cmp x15, #0x10\n"
+      "bge 12f\n"
+      "tbz x15, #3, 7f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v9.4s }, [x13], #0x10\n"
+      "tbz x15, #2, 5f\n"
+      "ld1 { v10.4s }, [x13], #0x10\n"
+      "tbz x15, #1, 4f\n"
+      "mov x19, #0x38\n"
+      "ldr d11, [x13], #0x8\n"
+      "tbz x15, #0, 11f\n"
+      "ld1 { v11.s }[2], [x13]\n"
+      "b 11f\n"
+      "4:"  // Height 1: Partial accumulate: partial_1_12
+      "mov x19, #0x30\n"
+      "tbz x15, #0, 11f\n"
+      "ldr s11, [x13, #0x0]\n"
+      "b 11f\n"
+      "5:"  // Height 1: Partial accumulate: partial_2_8
+      "tbz x15, #1, 6f\n"
+      "ldr d10, [x13], #0x8\n"
+      "mov x19, #0x28\n"
+      "tbz x15, #0, 11f\n"
+      "ld1 { v10.s }[2], [x13]\n"
+      "b 11f\n"
+      "6:"  // Height 1: Partial accumulate: partial_1_8
+      "mov x19, #0x20\n"
+      "tbz x15, #0, 11f\n"
+      "ldr s10, [x13, #0x0]\n"
+      "b 11f\n"
+      "7:"  // Height 1: Partial accumulate: partial_4_0
+      "tbz x15, #2, 9f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "tbz x15, #1, 8f\n"
+      "mov x19, #0x18\n"
+      "ldr d9, [x13], #0x8\n"
+      "tbz x15, #0, 11f\n"
+      "ld1 { v9.s }[2], [x13]\n"
+      "b 11f\n"
+      "8:"  // Height 1: Partial accumulate: partial_1_4
+      "mov x19, #0x10\n"
+      "tbz x15, #0, 11f\n"
+      "ldr s9, [x13, #0x0]\n"
+      "b 11f\n"
+      "9:"  // Height 1: Partial accumulate: partial_2_0
+      "tbz x15, #1, 10f\n"
+      "ldr d8, [x13], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x15, #0, 11f\n"
+      "ld1 { v8.s }[2], [x13]\n"
+      "b 11f\n"
+      "10:"  // Height 1: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s8, [x13, #0x0]\n"
+      "11:"  // Height 1: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "b 14f\n"
+      "12:"  // Height 1: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "b 14f\n"
+      "13:"  // Height 1: no accumulate
+      "movi v8.4s, #0x0\n"
+      "movi v9.4s, #0x0\n"
+      "movi v10.4s, #0x0\n"
+      "movi v11.4s, #0x0\n"
+      "14:"  // Height 1: setup done
+      "mov x12, #0x0\n"
+      "15:"  // Height 1: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 16f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "cbnz x12, 17f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "b 17f\n"
+      "16:"  // Height 1: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "17:"  // Height 1: input setup done
+      "cmp x11, #0x10\n"
+      "blt 20f\n"
+      "cmp x11, #0x20\n"
+      "blt 19f\n"
+      "18:"  // Height 1: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      "sub x11, x11, #0x10\n"
+      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      "cmp x11, #0x20\n"
+      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      "bge 18b\n"
+      "19:"  // Height 1: Multiply loop: Single iteration only
+      "sub x11, x11, #0x10\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      "20:"  // Height 1: Multiply loop: Main loop skip
+      "cbz x11, 25f\n"
+      "cmp x11, #0x4\n"
+      "blt 22f\n"
+      "21:"  // Height 1: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "sub x11, x11, #0x4\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      "cmp x11, #0x4\n"
+      "bge 21b\n"
+      "cbz x11, 25f\n"
+      "22:"  // Height 1: Multiply loop: Skip odd blocks
+      "tbz x11, #1, 23f\n"
+      "ldr h0, [x10], #0x2\n"
+      "tbz x11, #0, 24f\n"
+      "ld1 { v0.b }[2], [x10]\n"
+      "b 24f\n"
+      "23:"  // Height 1: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b0, [x10, #0x0]\n"
+      "24:"  // Height 1: Multiply loop: Ragged operand read: Done
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      "25:"  // Height 1: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 15b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "cmp x15, #0x10\n"
+      "bge 34f\n"
+      "tbz x15, #3, 29f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v9.4s }, [x13], #0x10\n"
+      "tbz x15, #2, 27f\n"
+      "st1 { v10.4s }, [x13], #0x10\n"
+      "tbz x15, #1, 26f\n"
+      "str d11, [x13], #0x8\n"
+      "tbz x15, #0, 33f\n"
+      "st1 { v11.s }[2], [x13]\n"
+      "b 33f\n"
+      "26:"  // Height 1: Partial direct writeback: partial_1_12
+      "tbz x15, #0, 33f\n"
+      "str s11, [x13, #0x0]\n"
+      "b 33f\n"
+      "27:"  // Height 1: Partial direct writeback: partial_2_8
+      "tbz x15, #1, 28f\n"
+      "str d10, [x13], #0x8\n"
+      "tbz x15, #0, 33f\n"
+      "st1 { v10.s }[2], [x13]\n"
+      "b 33f\n"
+      "28:"  // Height 1: Partial direct writeback: partial_1_8
+      "tbz x15, #0, 33f\n"
+      "str s10, [x13, #0x0]\n"
+      "b 33f\n"
+      "29:"  // Height 1: Partial direct writeback: partial_4_0
+      "tbz x15, #2, 31f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "tbz x15, #1, 30f\n"
+      "str d9, [x13], #0x8\n"
+      "tbz x15, #0, 33f\n"
+      "st1 { v9.s }[2], [x13]\n"
+      "b 33f\n"
+      "30:"  // Height 1: Partial direct writeback: partial_1_4
+      "tbz x15, #0, 33f\n"
+      "str s9, [x13, #0x0]\n"
+      "b 33f\n"
+      "31:"  // Height 1: Partial direct writeback: partial_2_0
+      "tbz x15, #1, 32f\n"
+      "str d8, [x13], #0x8\n"
+      "tbz x15, #0, 33f\n"
+      "st1 { v8.s }[2], [x13]\n"
+      "b 33f\n"
+      "32:"  // Height 1: Partial direct writeback: partial_1_0
+      "str s8, [x13, #0x0]\n"
+      "33:"  // Height 1: Partial direct writeback: Done
+      "b 35f\n"
+      "34:"  // Height 1: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "35:"  // Height 1: Writeback done
+      "subs x15, x15, #0x10\n"
+      "bgt 3b\n"
+      "b 212f\n"
+      "36:"  // Height 2
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 37f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "b 38f\n"
+      "37:"  // Height 2: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "38:"  // Height 2: Column loop
+      "tbz %x[flags], #0, 48f\n"
+      "cmp x15, #0x10\n"
+      "bge 47f\n"
+      "tbz x15, #3, 42f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v9.4s }, [x13], #0x10\n"
+      "ld1 { v13.4s }, [x9], #0x10\n"
+      "tbz x15, #2, 40f\n"
+      "ld1 { v10.4s }, [x13], #0x10\n"
+      "ld1 { v14.4s }, [x9], #0x10\n"
+      "tbz x15, #1, 39f\n"
+      "mov x19, #0x38\n"
+      "ldr d11, [x13], #0x8\n"
+      "ldr d15, [x9], #0x8\n"
+      "tbz x15, #0, 46f\n"
+      "ld1 { v11.s }[2], [x13]\n"
+      "ld1 { v15.s }[2], [x9]\n"
+      "b 46f\n"
+      "39:"  // Height 2: Partial accumulate: partial_1_12
+      "mov x19, #0x30\n"
+      "tbz x15, #0, 46f\n"
+      "ldr s11, [x13, #0x0]\n"
+      "ldr s15, [x9, #0x0]\n"
+      "b 46f\n"
+      "40:"  // Height 2: Partial accumulate: partial_2_8
+      "tbz x15, #1, 41f\n"
+      "ldr d10, [x13], #0x8\n"
+      "ldr d14, [x9], #0x8\n"
+      "mov x19, #0x28\n"
+      "tbz x15, #0, 46f\n"
+      "ld1 { v10.s }[2], [x13]\n"
+      "ld1 { v14.s }[2], [x9]\n"
+      "b 46f\n"
+      "41:"  // Height 2: Partial accumulate: partial_1_8
+      "mov x19, #0x20\n"
+      "tbz x15, #0, 46f\n"
+      "ldr s10, [x13, #0x0]\n"
+      "ldr s14, [x9, #0x0]\n"
+      "b 46f\n"
+      "42:"  // Height 2: Partial accumulate: partial_4_0
+      "tbz x15, #2, 44f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "tbz x15, #1, 43f\n"
+      "mov x19, #0x18\n"
+      "ldr d9, [x13], #0x8\n"
+      "ldr d13, [x9], #0x8\n"
+      "tbz x15, #0, 46f\n"
+      "ld1 { v9.s }[2], [x13]\n"
+      "ld1 { v13.s }[2], [x9]\n"
+      "b 46f\n"
+      "43:"  // Height 2: Partial accumulate: partial_1_4
+      "mov x19, #0x10\n"
+      "tbz x15, #0, 46f\n"
+      "ldr s9, [x13, #0x0]\n"
+      "ldr s13, [x9, #0x0]\n"
+      "b 46f\n"
+      "44:"  // Height 2: Partial accumulate: partial_2_0
+      "tbz x15, #1, 45f\n"
+      "ldr d8, [x13], #0x8\n"
+      "ldr d12, [x9], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x15, #0, 46f\n"
+      "ld1 { v8.s }[2], [x13]\n"
+      "ld1 { v12.s }[2], [x9]\n"
+      "b 46f\n"
+      "45:"  // Height 2: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s8, [x13, #0x0]\n"
+      "ldr s12, [x9, #0x0]\n"
+      "46:"  // Height 2: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "sub x9, x9, x19\n"
+      "b 49f\n"
+      "47:"  // Height 2: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "ldr q12, [x9, #0x0]\n"
+      "ldr q13, [x9, #0x10]\n"
+      "ldr q14, [x9, #0x20]\n"
+      "ldr q15, [x9, #0x30]\n"
+      "b 49f\n"
+      "48:"  // Height 2: no accumulate
+      "movi v8.4s, #0x0\n"
+      "movi v9.4s, #0x0\n"
+      "movi v10.4s, #0x0\n"
+      "movi v11.4s, #0x0\n"
+      "movi v12.4s, #0x0\n"
+      "movi v13.4s, #0x0\n"
+      "movi v14.4s, #0x0\n"
+      "movi v15.4s, #0x0\n"
+      "49:"  // Height 2: setup done
+      "mov x12, #0x0\n"
+      "50:"  // Height 2: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 51f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "cbnz x12, 52f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "b 52f\n"
+      "51:"  // Height 2: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "52:"  // Height 2: input setup done
+      "cmp x11, #0x10\n"
+      "blt 55f\n"
+      "cmp x11, #0x20\n"
+      "blt 54f\n"
+      "53:"  // Height 2: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "sub x11, x11, #0x10\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      "cmp x11, #0x20\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      "bge 53b\n"
+      "54:"  // Height 2: Multiply loop: Single iteration only
+      "sub x11, x11, #0x10\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      "55:"  // Height 2: Multiply loop: Main loop skip
+      "cbz x11, 60f\n"
+      "cmp x11, #0x4\n"
+      "blt 57f\n"
+      "56:"  // Height 2: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      "sub x11, x11, #0x4\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "cmp x11, #0x4\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      "bge 56b\n"
+      "cbz x11, 60f\n"
+      "57:"  // Height 2: Multiply loop: Skip odd blocks
+      "tbz x11, #1, 58f\n"
+      "ldr h0, [x10], #0x2\n"
+      "ldr h1, [x28], #0x2\n"
+      "tbz x11, #0, 59f\n"
+      "ld1 { v0.b }[2], [x10]\n"
+      "ld1 { v1.b }[2], [x28]\n"
+      "b 59f\n"
+      "58:"  // Height 2: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b0, [x10, #0x0]\n"
+      "ldr b1, [x28, #0x0]\n"
+      "59:"  // Height 2: Multiply loop: Ragged operand read: Done
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      "60:"  // Height 2: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 50b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "cmp x15, #0x10\n"
+      "bge 69f\n"
+      "tbz x15, #3, 64f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v9.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v13.4s }, [x9], #0x10\n"
+      "tbz x15, #2, 62f\n"
+      "st1 { v10.4s }, [x13], #0x10\n"
+      "st1 { v14.4s }, [x9], #0x10\n"
+      "tbz x15, #1, 61f\n"
+      "str d11, [x13], #0x8\n"
+      "str d15, [x9], #0x8\n"
+      "tbz x15, #0, 68f\n"
+      "st1 { v11.s }[2], [x13]\n"
+      "st1 { v15.s }[2], [x9]\n"
+      "b 68f\n"
+      "61:"  // Height 2: Partial direct writeback: partial_1_12
+      "tbz x15, #0, 68f\n"
+      "str s11, [x13, #0x0]\n"
+      "str s15, [x9, #0x0]\n"
+      "b 68f\n"
+      "62:"  // Height 2: Partial direct writeback: partial_2_8
+      "tbz x15, #1, 63f\n"
+      "str d10, [x13], #0x8\n"
+      "str d14, [x9], #0x8\n"
+      "tbz x15, #0, 68f\n"
+      "st1 { v10.s }[2], [x13]\n"
+      "st1 { v14.s }[2], [x9]\n"
+      "b 68f\n"
+      "63:"  // Height 2: Partial direct writeback: partial_1_8
+      "tbz x15, #0, 68f\n"
+      "str s10, [x13, #0x0]\n"
+      "str s14, [x9, #0x0]\n"
+      "b 68f\n"
+      "64:"  // Height 2: Partial direct writeback: partial_4_0
+      "tbz x15, #2, 66f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "tbz x15, #1, 65f\n"
+      "str d9, [x13], #0x8\n"
+      "str d13, [x9], #0x8\n"
+      "tbz x15, #0, 68f\n"
+      "st1 { v9.s }[2], [x13]\n"
+      "st1 { v13.s }[2], [x9]\n"
+      "b 68f\n"
+      "65:"  // Height 2: Partial direct writeback: partial_1_4
+      "tbz x15, #0, 68f\n"
+      "str s9, [x13, #0x0]\n"
+      "str s13, [x9, #0x0]\n"
+      "b 68f\n"
+      "66:"  // Height 2: Partial direct writeback: partial_2_0
+      "tbz x15, #1, 67f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "tbz x15, #0, 68f\n"
+      "st1 { v8.s }[2], [x13]\n"
+      "st1 { v12.s }[2], [x9]\n"
+      "b 68f\n"
+      "67:"  // Height 2: Partial direct writeback: partial_1_0
+      "str s8, [x13, #0x0]\n"
+      "str s12, [x9, #0x0]\n"
+      "68:"  // Height 2: Partial direct writeback: Done
+      "b 70f\n"
+      "69:"  // Height 2: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "str q12, [x9, #0x0]\n"
+      "str q13, [x9, #0x10]\n"
+      "str q14, [x9, #0x20]\n"
+      "str q15, [x9, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "add x9, x9, #0x40\n"
+      "70:"  // Height 2: Writeback done
+      "subs x15, x15, #0x10\n"
+      "bgt 38b\n"
+      "b 212f\n"
+      "71:"  // Height 3
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 72f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "add x27, x27, x19, LSL #2\n"
+      "b 73f\n"
+      "72:"  // Height 3: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "73:"  // Height 3: Column loop
+      "tbz %x[flags], #0, 83f\n"
+      "cmp x15, #0x10\n"
+      "bge 82f\n"
+      "tbz x15, #3, 77f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "ld1 { v9.4s }, [x13], #0x10\n"
+      "ld1 { v13.4s }, [x9], #0x10\n"
+      "ld1 { v17.4s }, [x27], #0x10\n"
+      "tbz x15, #2, 75f\n"
+      "ld1 { v10.4s }, [x13], #0x10\n"
+      "ld1 { v14.4s }, [x9], #0x10\n"
+      "ld1 { v18.4s }, [x27], #0x10\n"
+      "tbz x15, #1, 74f\n"
+      "mov x19, #0x38\n"
+      "ldr d11, [x13], #0x8\n"
+      "ldr d15, [x9], #0x8\n"
+      "ldr d19, [x27], #0x8\n"
+      "tbz x15, #0, 81f\n"
+      "ld1 { v11.s }[2], [x13]\n"
+      "ld1 { v15.s }[2], [x9]\n"
+      "ld1 { v19.s }[2], [x27]\n"
+      "b 81f\n"
+      "74:"  // Height 3: Partial accumulate: partial_1_12
+      "mov x19, #0x30\n"
+      "tbz x15, #0, 81f\n"
+      "ldr s11, [x13, #0x0]\n"
+      "ldr s15, [x9, #0x0]\n"
+      "ldr s19, [x27, #0x0]\n"
+      "b 81f\n"
+      "75:"  // Height 3: Partial accumulate: partial_2_8
+      "tbz x15, #1, 76f\n"
+      "ldr d10, [x13], #0x8\n"
+      "ldr d14, [x9], #0x8\n"
+      "ldr d18, [x27], #0x8\n"
+      "mov x19, #0x28\n"
+      "tbz x15, #0, 81f\n"
+      "ld1 { v10.s }[2], [x13]\n"
+      "ld1 { v14.s }[2], [x9]\n"
+      "ld1 { v18.s }[2], [x27]\n"
+      "b 81f\n"
+      "76:"  // Height 3: Partial accumulate: partial_1_8
+      "mov x19, #0x20\n"
+      "tbz x15, #0, 81f\n"
+      "ldr s10, [x13, #0x0]\n"
+      "ldr s14, [x9, #0x0]\n"
+      "ldr s18, [x27, #0x0]\n"
+      "b 81f\n"
+      "77:"  // Height 3: Partial accumulate: partial_4_0
+      "tbz x15, #2, 79f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "tbz x15, #1, 78f\n"
+      "mov x19, #0x18\n"
+      "ldr d9, [x13], #0x8\n"
+      "ldr d13, [x9], #0x8\n"
+      "ldr d17, [x27], #0x8\n"
+      "tbz x15, #0, 81f\n"
+      "ld1 { v9.s }[2], [x13]\n"
+      "ld1 { v13.s }[2], [x9]\n"
+      "ld1 { v17.s }[2], [x27]\n"
+      "b 81f\n"
+      "78:"  // Height 3: Partial accumulate: partial_1_4
+      "mov x19, #0x10\n"
+      "tbz x15, #0, 81f\n"
+      "ldr s9, [x13, #0x0]\n"
+      "ldr s13, [x9, #0x0]\n"
+      "ldr s17, [x27, #0x0]\n"
+      "b 81f\n"
+      "79:"  // Height 3: Partial accumulate: partial_2_0
+      "tbz x15, #1, 80f\n"
+      "ldr d8, [x13], #0x8\n"
+      "ldr d12, [x9], #0x8\n"
+      "ldr d16, [x27], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x15, #0, 81f\n"
+      "ld1 { v8.s }[2], [x13]\n"
+      "ld1 { v12.s }[2], [x9]\n"
+      "ld1 { v16.s }[2], [x27]\n"
+      "b 81f\n"
+      "80:"  // Height 3: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s8, [x13, #0x0]\n"
+      "ldr s12, [x9, #0x0]\n"
+      "ldr s16, [x27, #0x0]\n"
+      "81:"  // Height 3: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "sub x9, x9, x19\n"
+      "sub x27, x27, x19\n"
+      "b 84f\n"
+      "82:"  // Height 3: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "ldr q12, [x9, #0x0]\n"
+      "ldr q13, [x9, #0x10]\n"
+      "ldr q14, [x9, #0x20]\n"
+      "ldr q15, [x9, #0x30]\n"
+      "ldr q16, [x27, #0x0]\n"
+      "ldr q17, [x27, #0x10]\n"
+      "ldr q18, [x27, #0x20]\n"
+      "ldr q19, [x27, #0x30]\n"
+      "b 84f\n"
+      "83:"  // Height 3: no accumulate
+      "movi v8.4s, #0x0\n"
+      "movi v9.4s, #0x0\n"
+      "movi v10.4s, #0x0\n"
+      "movi v11.4s, #0x0\n"
+      "movi v12.4s, #0x0\n"
+      "movi v13.4s, #0x0\n"
+      "movi v14.4s, #0x0\n"
+      "movi v15.4s, #0x0\n"
+      "movi v16.4s, #0x0\n"
+      "movi v17.4s, #0x0\n"
+      "movi v18.4s, #0x0\n"
+      "movi v19.4s, #0x0\n"
+      "84:"  // Height 3: setup done
+      "mov x12, #0x0\n"
+      "85:"  // Height 3: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 86f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "cbnz x12, 87f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "add x26, x26, x19\n"
+      "b 87f\n"
+      "86:"  // Height 3: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "add x26, x28, x19\n"
+      "87:"  // Height 3: input setup done
+      "cmp x11, #0x10\n"
+      "blt 90f\n"
+      "cmp x11, #0x20\n"
+      "blt 89f\n"
+      "88:"  // Height 3: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "add x28, x28, #0x10\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "sub x11, x11, #0x10\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      "cmp x11, #0x20\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
+      "bge 88b\n"
+      "89:"  // Height 3: Multiply loop: Single iteration only
+      "sub x11, x11, #0x10\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "add x28, x28, #0x10\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
+      "90:"  // Height 3: Multiply loop: Main loop skip
+      "cbz x11, 95f\n"
+      "cmp x11, #0x4\n"
+      "blt 92f\n"
+      "91:"  // Height 3: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "sub x11, x11, #0x4\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      "cmp x11, #0x4\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      "bge 91b\n"
+      "cbz x11, 95f\n"
+      "92:"  // Height 3: Multiply loop: Skip odd blocks
+      "tbz x11, #1, 93f\n"
+      "ldr h0, [x10], #0x2\n"
+      "ldr h1, [x28], #0x2\n"
+      "ldr h2, [x26], #0x2\n"
+      "tbz x11, #0, 94f\n"
+      "ld1 { v0.b }[2], [x10]\n"
+      "ld1 { v1.b }[2], [x28]\n"
+      "ld1 { v2.b }[2], [x26]\n"
+      "b 94f\n"
+      "93:"  // Height 3: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b0, [x10, #0x0]\n"
+      "ldr b1, [x28, #0x0]\n"
+      "ldr b2, [x26, #0x0]\n"
+      "94:"  // Height 3: Multiply loop: Ragged operand read: Done
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      "95:"  // Height 3: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 85b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "cmp x15, #0x10\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "bge 104f\n"
+      "tbz x15, #3, 99f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v9.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v13.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "st1 { v17.4s }, [x27], #0x10\n"
+      "tbz x15, #2, 97f\n"
+      "st1 { v10.4s }, [x13], #0x10\n"
+      "st1 { v14.4s }, [x9], #0x10\n"
+      "st1 { v18.4s }, [x27], #0x10\n"
+      "tbz x15, #1, 96f\n"
+      "str d11, [x13], #0x8\n"
+      "str d15, [x9], #0x8\n"
+      "str d19, [x27], #0x8\n"
+      "tbz x15, #0, 103f\n"
+      "st1 { v11.s }[2], [x13]\n"
+      "st1 { v15.s }[2], [x9]\n"
+      "st1 { v19.s }[2], [x27]\n"
+      "b 103f\n"
+      "96:"  // Height 3: Partial direct writeback: partial_1_12
+      "tbz x15, #0, 103f\n"
+      "str s11, [x13, #0x0]\n"
+      "str s15, [x9, #0x0]\n"
+      "str s19, [x27, #0x0]\n"
+      "b 103f\n"
+      "97:"  // Height 3: Partial direct writeback: partial_2_8
+      "tbz x15, #1, 98f\n"
+      "str d10, [x13], #0x8\n"
+      "str d14, [x9], #0x8\n"
+      "str d18, [x27], #0x8\n"
+      "tbz x15, #0, 103f\n"
+      "st1 { v10.s }[2], [x13]\n"
+      "st1 { v14.s }[2], [x9]\n"
+      "st1 { v18.s }[2], [x27]\n"
+      "b 103f\n"
+      "98:"  // Height 3: Partial direct writeback: partial_1_8
+      "tbz x15, #0, 103f\n"
+      "str s10, [x13, #0x0]\n"
+      "str s14, [x9, #0x0]\n"
+      "str s18, [x27, #0x0]\n"
+      "b 103f\n"
+      "99:"  // Height 3: Partial direct writeback: partial_4_0
+      "tbz x15, #2, 101f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "tbz x15, #1, 100f\n"
+      "str d9, [x13], #0x8\n"
+      "str d13, [x9], #0x8\n"
+      "str d17, [x27], #0x8\n"
+      "tbz x15, #0, 103f\n"
+      "st1 { v9.s }[2], [x13]\n"
+      "st1 { v13.s }[2], [x9]\n"
+      "st1 { v17.s }[2], [x27]\n"
+      "b 103f\n"
+      "100:"  // Height 3: Partial direct writeback: partial_1_4
+      "tbz x15, #0, 103f\n"
+      "str s9, [x13, #0x0]\n"
+      "str s13, [x9, #0x0]\n"
+      "str s17, [x27, #0x0]\n"
+      "b 103f\n"
+      "101:"  // Height 3: Partial direct writeback: partial_2_0
+      "tbz x15, #1, 102f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "str d16, [x27], #0x8\n"
+      "tbz x15, #0, 103f\n"
+      "st1 { v8.s }[2], [x13]\n"
+      "st1 { v12.s }[2], [x9]\n"
+      "st1 { v16.s }[2], [x27]\n"
+      "b 103f\n"
+      "102:"  // Height 3: Partial direct writeback: partial_1_0
+      "str s8, [x13, #0x0]\n"
+      "str s12, [x9, #0x0]\n"
+      "str s16, [x27, #0x0]\n"
+      "103:"  // Height 3: Partial direct writeback: Done
+      "b 105f\n"
+      "104:"  // Height 3: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "str q12, [x9, #0x0]\n"
+      "str q13, [x9, #0x10]\n"
+      "str q14, [x9, #0x20]\n"
+      "str q15, [x9, #0x30]\n"
+      "str q16, [x27, #0x0]\n"
+      "str q17, [x27, #0x10]\n"
+      "str q18, [x27, #0x20]\n"
+      "str q19, [x27, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "add x9, x9, #0x40\n"
+      "add x27, x27, #0x40\n"
+      "105:"  // Height 3: Writeback done
+      "subs x15, x15, #0x10\n"
+      "bgt 73b\n"
+      "b 212f\n"
+      "106:"  // Height 4
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 107f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "add x27, x27, x19, LSL #2\n"
+      "add x25, x25, x19, LSL #2\n"
+      "b 108f\n"
+      "107:"  // Height 4: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "108:"  // Height 4: Column loop
+      "tbz %x[flags], #0, 118f\n"
+      "cmp x15, #0x10\n"
+      "bge 117f\n"
+      "tbz x15, #3, 112f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "ld1 { v20.4s }, [x25], #0x10\n"
+      "ld1 { v9.4s }, [x13], #0x10\n"
+      "ld1 { v13.4s }, [x9], #0x10\n"
+      "ld1 { v17.4s }, [x27], #0x10\n"
+      "ld1 { v21.4s }, [x25], #0x10\n"
+      "tbz x15, #2, 110f\n"
+      "ld1 { v10.4s }, [x13], #0x10\n"
+      "ld1 { v14.4s }, [x9], #0x10\n"
+      "ld1 { v18.4s }, [x27], #0x10\n"
+      "ld1 { v22.4s }, [x25], #0x10\n"
+      "tbz x15, #1, 109f\n"
+      "mov x19, #0x38\n"
+      "ldr d11, [x13], #0x8\n"
+      "ldr d15, [x9], #0x8\n"
+      "ldr d19, [x27], #0x8\n"
+      "ldr d23, [x25], #0x8\n"
+      "tbz x15, #0, 116f\n"
+      "ld1 { v11.s }[2], [x13]\n"
+      "ld1 { v15.s }[2], [x9]\n"
+      "ld1 { v19.s }[2], [x27]\n"
+      "ld1 { v23.s }[2], [x25]\n"
+      "b 116f\n"
+      "109:"  // Height 4: Partial accumulate: partial_1_12
+      "mov x19, #0x30\n"
+      "tbz x15, #0, 116f\n"
+      "ldr s11, [x13, #0x0]\n"
+      "ldr s15, [x9, #0x0]\n"
+      "ldr s19, [x27, #0x0]\n"
+      "ldr s23, [x25, #0x0]\n"
+      "b 116f\n"
+      "110:"  // Height 4: Partial accumulate: partial_2_8
+      "tbz x15, #1, 111f\n"
+      "ldr d10, [x13], #0x8\n"
+      "ldr d14, [x9], #0x8\n"
+      "ldr d18, [x27], #0x8\n"
+      "ldr d22, [x25], #0x8\n"
+      "mov x19, #0x28\n"
+      "tbz x15, #0, 116f\n"
+      "ld1 { v10.s }[2], [x13]\n"
+      "ld1 { v14.s }[2], [x9]\n"
+      "ld1 { v18.s }[2], [x27]\n"
+      "ld1 { v22.s }[2], [x25]\n"
+      "b 116f\n"
+      "111:"  // Height 4: Partial accumulate: partial_1_8
+      "mov x19, #0x20\n"
+      "tbz x15, #0, 116f\n"
+      "ldr s10, [x13, #0x0]\n"
+      "ldr s14, [x9, #0x0]\n"
+      "ldr s18, [x27, #0x0]\n"
+      "ldr s22, [x25, #0x0]\n"
+      "b 116f\n"
+      "112:"  // Height 4: Partial accumulate: partial_4_0
+      "tbz x15, #2, 114f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "ld1 { v20.4s }, [x25], #0x10\n"
+      "tbz x15, #1, 113f\n"
+      "mov x19, #0x18\n"
+      "ldr d9, [x13], #0x8\n"
+      "ldr d13, [x9], #0x8\n"
+      "ldr d17, [x27], #0x8\n"
+      "ldr d21, [x25], #0x8\n"
+      "tbz x15, #0, 116f\n"
+      "ld1 { v9.s }[2], [x13]\n"
+      "ld1 { v13.s }[2], [x9]\n"
+      "ld1 { v17.s }[2], [x27]\n"
+      "ld1 { v21.s }[2], [x25]\n"
+      "b 116f\n"
+      "113:"  // Height 4: Partial accumulate: partial_1_4
+      "mov x19, #0x10\n"
+      "tbz x15, #0, 116f\n"
+      "ldr s9, [x13, #0x0]\n"
+      "ldr s13, [x9, #0x0]\n"
+      "ldr s17, [x27, #0x0]\n"
+      "ldr s21, [x25, #0x0]\n"
+      "b 116f\n"
+      "114:"  // Height 4: Partial accumulate: partial_2_0
+      "tbz x15, #1, 115f\n"
+      "ldr d8, [x13], #0x8\n"
+      "ldr d12, [x9], #0x8\n"
+      "ldr d16, [x27], #0x8\n"
+      "ldr d20, [x25], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x15, #0, 116f\n"
+      "ld1 { v8.s }[2], [x13]\n"
+      "ld1 { v12.s }[2], [x9]\n"
+      "ld1 { v16.s }[2], [x27]\n"
+      "ld1 { v20.s }[2], [x25]\n"
+      "b 116f\n"
+      "115:"  // Height 4: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s8, [x13, #0x0]\n"
+      "ldr s12, [x9, #0x0]\n"
+      "ldr s16, [x27, #0x0]\n"
+      "ldr s20, [x25, #0x0]\n"
+      "116:"  // Height 4: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "sub x9, x9, x19\n"
+      "sub x27, x27, x19\n"
+      "sub x25, x25, x19\n"
+      "b 119f\n"
+      "117:"  // Height 4: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "ldr q12, [x9, #0x0]\n"
+      "ldr q13, [x9, #0x10]\n"
+      "ldr q14, [x9, #0x20]\n"
+      "ldr q15, [x9, #0x30]\n"
+      "ldr q16, [x27, #0x0]\n"
+      "ldr q17, [x27, #0x10]\n"
+      "ldr q18, [x27, #0x20]\n"
+      "ldr q19, [x27, #0x30]\n"
+      "ldr q20, [x25, #0x0]\n"
+      "ldr q21, [x25, #0x10]\n"
+      "ldr q22, [x25, #0x20]\n"
+      "ldr q23, [x25, #0x30]\n"
+      "b 119f\n"
+      "118:"  // Height 4: no accumulate
+      "movi v8.4s, #0x0\n"
+      "movi v9.4s, #0x0\n"
+      "movi v10.4s, #0x0\n"
+      "movi v11.4s, #0x0\n"
+      "movi v12.4s, #0x0\n"
+      "movi v13.4s, #0x0\n"
+      "movi v14.4s, #0x0\n"
+      "movi v15.4s, #0x0\n"
+      "movi v16.4s, #0x0\n"
+      "movi v17.4s, #0x0\n"
+      "movi v18.4s, #0x0\n"
+      "movi v19.4s, #0x0\n"
+      "movi v20.4s, #0x0\n"
+      "movi v21.4s, #0x0\n"
+      "movi v22.4s, #0x0\n"
+      "movi v23.4s, #0x0\n"
+      "119:"  // Height 4: setup done
+      "mov x12, #0x0\n"
+      "120:"  // Height 4: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 121f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "cbnz x12, 122f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "b 122f\n"
+      "121:"  // Height 4: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "add x26, x28, x19\n"
+      "add x24, x26, x19\n"
+      "122:"  // Height 4: input setup done
+      "cmp x11, #0x10\n"
+      "blt 125f\n"
+      "cmp x11, #0x20\n"
+      "blt 124f\n"
+      "123:"  // Height 4: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "add x26, x26, #0x10\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "sub x11, x11, #0x10\n"
+      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "cmp x11, #0x20\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
+      "bge 123b\n"
+      "124:"  // Height 4: Multiply loop: Single iteration only
+      "sub x11, x11, #0x10\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "add x26, x26, #0x10\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
+      "125:"  // Height 4: Multiply loop: Main loop skip
+      "cbz x11, 130f\n"
+      "cmp x11, #0x4\n"
+      "blt 127f\n"
+      "126:"  // Height 4: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
+      "ldr s3, [x24], #0x4\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "sub x11, x11, #0x4\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      "cmp x11, #0x4\n"
+      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      "bge 126b\n"
+      "cbz x11, 130f\n"
+      "127:"  // Height 4: Multiply loop: Skip odd blocks
+      "tbz x11, #1, 128f\n"
+      "ldr h0, [x10], #0x2\n"
+      "ldr h1, [x28], #0x2\n"
+      "ldr h2, [x26], #0x2\n"
+      "ldr h3, [x24], #0x2\n"
+      "tbz x11, #0, 129f\n"
+      "ld1 { v0.b }[2], [x10]\n"
+      "ld1 { v1.b }[2], [x28]\n"
+      "ld1 { v2.b }[2], [x26]\n"
+      "ld1 { v3.b }[2], [x24]\n"
+      "b 129f\n"
+      "128:"  // Height 4: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b0, [x10, #0x0]\n"
+      "ldr b1, [x28, #0x0]\n"
+      "ldr b2, [x26, #0x0]\n"
+      "ldr b3, [x24, #0x0]\n"
+      "129:"  // Height 4: Multiply loop: Ragged operand read: Done
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      "130:"  // Height 4: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 120b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "cmp x15, #0x10\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "bge 139f\n"
+      "tbz x15, #3, 134f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v9.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v13.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "st1 { v17.4s }, [x27], #0x10\n"
+      "st1 { v20.4s }, [x25], #0x10\n"
+      "st1 { v21.4s }, [x25], #0x10\n"
+      "tbz x15, #2, 132f\n"
+      "st1 { v10.4s }, [x13], #0x10\n"
+      "st1 { v14.4s }, [x9], #0x10\n"
+      "st1 { v18.4s }, [x27], #0x10\n"
+      "st1 { v22.4s }, [x25], #0x10\n"
+      "tbz x15, #1, 131f\n"
+      "str d11, [x13], #0x8\n"
+      "str d15, [x9], #0x8\n"
+      "str d19, [x27], #0x8\n"
+      "str d23, [x25], #0x8\n"
+      "tbz x15, #0, 138f\n"
+      "st1 { v11.s }[2], [x13]\n"
+      "st1 { v15.s }[2], [x9]\n"
+      "st1 { v19.s }[2], [x27]\n"
+      "st1 { v23.s }[2], [x25]\n"
+      "b 138f\n"
+      "131:"  // Height 4: Partial direct writeback: partial_1_12
+      "tbz x15, #0, 138f\n"
+      "str s11, [x13, #0x0]\n"
+      "str s15, [x9, #0x0]\n"
+      "str s19, [x27, #0x0]\n"
+      "str s23, [x25, #0x0]\n"
+      "b 138f\n"
+      "132:"  // Height 4: Partial direct writeback: partial_2_8
+      "tbz x15, #1, 133f\n"
+      "str d10, [x13], #0x8\n"
+      "str d14, [x9], #0x8\n"
+      "str d18, [x27], #0x8\n"
+      "str d22, [x25], #0x8\n"
+      "tbz x15, #0, 138f\n"
+      "st1 { v10.s }[2], [x13]\n"
+      "st1 { v14.s }[2], [x9]\n"
+      "st1 { v18.s }[2], [x27]\n"
+      "st1 { v22.s }[2], [x25]\n"
+      "b 138f\n"
+      "133:"  // Height 4: Partial direct writeback: partial_1_8
+      "tbz x15, #0, 138f\n"
+      "str s10, [x13, #0x0]\n"
+      "str s14, [x9, #0x0]\n"
+      "str s18, [x27, #0x0]\n"
+      "str s22, [x25, #0x0]\n"
+      "b 138f\n"
+      "134:"  // Height 4: Partial direct writeback: partial_4_0
+      "tbz x15, #2, 136f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "st1 { v20.4s }, [x25], #0x10\n"
+      "tbz x15, #1, 135f\n"
+      "str d9, [x13], #0x8\n"
+      "str d13, [x9], #0x8\n"
+      "str d17, [x27], #0x8\n"
+      "str d21, [x25], #0x8\n"
+      "tbz x15, #0, 138f\n"
+      "st1 { v9.s }[2], [x13]\n"
+      "st1 { v13.s }[2], [x9]\n"
+      "st1 { v17.s }[2], [x27]\n"
+      "st1 { v21.s }[2], [x25]\n"
+      "b 138f\n"
+      "135:"  // Height 4: Partial direct writeback: partial_1_4
+      "tbz x15, #0, 138f\n"
+      "str s9, [x13, #0x0]\n"
+      "str s13, [x9, #0x0]\n"
+      "str s17, [x27, #0x0]\n"
+      "str s21, [x25, #0x0]\n"
+      "b 138f\n"
+      "136:"  // Height 4: Partial direct writeback: partial_2_0
+      "tbz x15, #1, 137f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "str d16, [x27], #0x8\n"
+      "str d20, [x25], #0x8\n"
+      "tbz x15, #0, 138f\n"
+      "st1 { v8.s }[2], [x13]\n"
+      "st1 { v12.s }[2], [x9]\n"
+      "st1 { v16.s }[2], [x27]\n"
+      "st1 { v20.s }[2], [x25]\n"
+      "b 138f\n"
+      "137:"  // Height 4: Partial direct writeback: partial_1_0
+      "str s8, [x13, #0x0]\n"
+      "str s12, [x9, #0x0]\n"
+      "str s16, [x27, #0x0]\n"
+      "str s20, [x25, #0x0]\n"
+      "138:"  // Height 4: Partial direct writeback: Done
+      "b 140f\n"
+      "139:"  // Height 4: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "str q12, [x9, #0x0]\n"
+      "str q13, [x9, #0x10]\n"
+      "str q14, [x9, #0x20]\n"
+      "str q15, [x9, #0x30]\n"
+      "str q16, [x27, #0x0]\n"
+      "str q17, [x27, #0x10]\n"
+      "str q18, [x27, #0x20]\n"
+      "str q19, [x27, #0x30]\n"
+      "str q20, [x25, #0x0]\n"
+      "str q21, [x25, #0x10]\n"
+      "str q22, [x25, #0x20]\n"
+      "str q23, [x25, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "add x9, x9, #0x40\n"
+      "add x27, x27, #0x40\n"
+      "add x25, x25, #0x40\n"
+      "140:"  // Height 4: Writeback done
+      "subs x15, x15, #0x10\n"
+      "bgt 108b\n"
+      "b 212f\n"
+      "141:"  // Height 5
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 142f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "ldr x23, [%x[output_ptr], #0x20]\n"
+      "add x27, x27, x19, LSL #2\n"
+      "add x25, x25, x19, LSL #2\n"
+      "add x23, x23, x19, LSL #2\n"
+      "b 143f\n"
+      "142:"  // Height 5: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "add x23, x25, x19, LSL #2\n"
+      "143:"  // Height 5: Column loop
+      "tbz %x[flags], #0, 153f\n"
+      "cmp x15, #0x10\n"
+      "bge 152f\n"
+      "tbz x15, #3, 147f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "ld1 { v20.4s }, [x25], #0x10\n"
+      "ld1 { v24.4s }, [x23], #0x10\n"
+      "ld1 { v9.4s }, [x13], #0x10\n"
+      "ld1 { v13.4s }, [x9], #0x10\n"
+      "ld1 { v17.4s }, [x27], #0x10\n"
+      "ld1 { v21.4s }, [x25], #0x10\n"
+      "ld1 { v25.4s }, [x23], #0x10\n"
+      "tbz x15, #2, 145f\n"
+      "ld1 { v10.4s }, [x13], #0x10\n"
+      "ld1 { v14.4s }, [x9], #0x10\n"
+      "ld1 { v18.4s }, [x27], #0x10\n"
+      "ld1 { v22.4s }, [x25], #0x10\n"
+      "ld1 { v26.4s }, [x23], #0x10\n"
+      "tbz x15, #1, 144f\n"
+      "mov x19, #0x38\n"
+      "ldr d11, [x13], #0x8\n"
+      "ldr d15, [x9], #0x8\n"
+      "ldr d19, [x27], #0x8\n"
+      "ldr d23, [x25], #0x8\n"
+      "ldr d27, [x23], #0x8\n"
+      "tbz x15, #0, 151f\n"
+      "ld1 { v11.s }[2], [x13]\n"
+      "ld1 { v15.s }[2], [x9]\n"
+      "ld1 { v19.s }[2], [x27]\n"
+      "ld1 { v23.s }[2], [x25]\n"
+      "ld1 { v27.s }[2], [x23]\n"
+      "b 151f\n"
+      "144:"  // Height 5: Partial accumulate: partial_1_12
+      "mov x19, #0x30\n"
+      "tbz x15, #0, 151f\n"
+      "ldr s11, [x13, #0x0]\n"
+      "ldr s15, [x9, #0x0]\n"
+      "ldr s19, [x27, #0x0]\n"
+      "ldr s23, [x25, #0x0]\n"
+      "ldr s27, [x23, #0x0]\n"
+      "b 151f\n"
+      "145:"  // Height 5: Partial accumulate: partial_2_8
+      "tbz x15, #1, 146f\n"
+      "ldr d10, [x13], #0x8\n"
+      "ldr d14, [x9], #0x8\n"
+      "ldr d18, [x27], #0x8\n"
+      "ldr d22, [x25], #0x8\n"
+      "ldr d26, [x23], #0x8\n"
+      "mov x19, #0x28\n"
+      "tbz x15, #0, 151f\n"
+      "ld1 { v10.s }[2], [x13]\n"
+      "ld1 { v14.s }[2], [x9]\n"
+      "ld1 { v18.s }[2], [x27]\n"
+      "ld1 { v22.s }[2], [x25]\n"
+      "ld1 { v26.s }[2], [x23]\n"
+      "b 151f\n"
+      "146:"  // Height 5: Partial accumulate: partial_1_8
+      "mov x19, #0x20\n"
+      "tbz x15, #0, 151f\n"
+      "ldr s10, [x13, #0x0]\n"
+      "ldr s14, [x9, #0x0]\n"
+      "ldr s18, [x27, #0x0]\n"
+      "ldr s22, [x25, #0x0]\n"
+      "ldr s26, [x23, #0x0]\n"
+      "b 151f\n"
+      "147:"  // Height 5: Partial accumulate: partial_4_0
+      "tbz x15, #2, 149f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "ld1 { v20.4s }, [x25], #0x10\n"
+      "ld1 { v24.4s }, [x23], #0x10\n"
+      "tbz x15, #1, 148f\n"
+      "mov x19, #0x18\n"
+      "ldr d9, [x13], #0x8\n"
+      "ldr d13, [x9], #0x8\n"
+      "ldr d17, [x27], #0x8\n"
+      "ldr d21, [x25], #0x8\n"
+      "ldr d25, [x23], #0x8\n"
+      "tbz x15, #0, 151f\n"
+      "ld1 { v9.s }[2], [x13]\n"
+      "ld1 { v13.s }[2], [x9]\n"
+      "ld1 { v17.s }[2], [x27]\n"
+      "ld1 { v21.s }[2], [x25]\n"
+      "ld1 { v25.s }[2], [x23]\n"
+      "b 151f\n"
+      "148:"  // Height 5: Partial accumulate: partial_1_4
+      "mov x19, #0x10\n"
+      "tbz x15, #0, 151f\n"
+      "ldr s9, [x13, #0x0]\n"
+      "ldr s13, [x9, #0x0]\n"
+      "ldr s17, [x27, #0x0]\n"
+      "ldr s21, [x25, #0x0]\n"
+      "ldr s25, [x23, #0x0]\n"
+      "b 151f\n"
+      "149:"  // Height 5: Partial accumulate: partial_2_0
+      "tbz x15, #1, 150f\n"
+      "ldr d8, [x13], #0x8\n"
+      "ldr d12, [x9], #0x8\n"
+      "ldr d16, [x27], #0x8\n"
+      "ldr d20, [x25], #0x8\n"
+      "ldr d24, [x23], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x15, #0, 151f\n"
+      "ld1 { v8.s }[2], [x13]\n"
+      "ld1 { v12.s }[2], [x9]\n"
+      "ld1 { v16.s }[2], [x27]\n"
+      "ld1 { v20.s }[2], [x25]\n"
+      "ld1 { v24.s }[2], [x23]\n"
+      "b 151f\n"
+      "150:"  // Height 5: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s8, [x13, #0x0]\n"
+      "ldr s12, [x9, #0x0]\n"
+      "ldr s16, [x27, #0x0]\n"
+      "ldr s20, [x25, #0x0]\n"
+      "ldr s24, [x23, #0x0]\n"
+      "151:"  // Height 5: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "sub x9, x9, x19\n"
+      "sub x27, x27, x19\n"
+      "sub x25, x25, x19\n"
+      "sub x23, x23, x19\n"
+      "b 154f\n"
+      "152:"  // Height 5: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "ldr q12, [x9, #0x0]\n"
+      "ldr q13, [x9, #0x10]\n"
+      "ldr q14, [x9, #0x20]\n"
+      "ldr q15, [x9, #0x30]\n"
+      "ldr q16, [x27, #0x0]\n"
+      "ldr q17, [x27, #0x10]\n"
+      "ldr q18, [x27, #0x20]\n"
+      "ldr q19, [x27, #0x30]\n"
+      "ldr q20, [x25, #0x0]\n"
+      "ldr q21, [x25, #0x10]\n"
+      "ldr q22, [x25, #0x20]\n"
+      "ldr q23, [x25, #0x30]\n"
+      "ldr q24, [x23, #0x0]\n"
+      "ldr q25, [x23, #0x10]\n"
+      "ldr q26, [x23, #0x20]\n"
+      "ldr q27, [x23, #0x30]\n"
+      "b 154f\n"
+      "153:"  // Height 5: no accumulate
+      "movi v8.4s, #0x0\n"
+      "movi v9.4s, #0x0\n"
+      "movi v10.4s, #0x0\n"
+      "movi v11.4s, #0x0\n"
+      "movi v12.4s, #0x0\n"
+      "movi v13.4s, #0x0\n"
+      "movi v14.4s, #0x0\n"
+      "movi v15.4s, #0x0\n"
+      "movi v16.4s, #0x0\n"
+      "movi v17.4s, #0x0\n"
+      "movi v18.4s, #0x0\n"
+      "movi v19.4s, #0x0\n"
+      "movi v20.4s, #0x0\n"
+      "movi v21.4s, #0x0\n"
+      "movi v22.4s, #0x0\n"
+      "movi v23.4s, #0x0\n"
+      "movi v24.4s, #0x0\n"
+      "movi v25.4s, #0x0\n"
+      "movi v26.4s, #0x0\n"
+      "movi v27.4s, #0x0\n"
+      "154:"  // Height 5: setup done
+      "mov x12, #0x0\n"
+      "155:"  // Height 5: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 156f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "cbnz x12, 157f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "add x22, x22, x19\n"
+      "b 157f\n"
+      "156:"  // Height 5: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "add x26, x28, x19\n"
+      "add x24, x26, x19\n"
+      "add x22, x24, x19\n"
+      "157:"  // Height 5: input setup done
+      "cmp x11, #0x10\n"
+      "blt 160f\n"
+      "cmp x11, #0x20\n"
+      "blt 159f\n"
+      "158:"  // Height 5: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q4, [x22, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "add x24, x24, #0x10\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      "add x22, x22, #0x10\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "sub x11, x11, #0x10\n"
+      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
+      "cmp x11, #0x20\n"
+      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0d8  // sdot v24.4s, v6.16b, v4.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0f9  // sdot v25.4s, v7.16b, v4.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0da  // sdot v26.4s, v6.16b, v4.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0fb  // sdot v27.4s, v7.16b, v4.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8d8  // sdot v24.4s, v6.16b, v4.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8f9  // sdot v25.4s, v7.16b, v4.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8da  // sdot v26.4s, v6.16b, v4.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8fb  // sdot v27.4s, v7.16b, v4.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8d8  // sdot v24.4s, v6.16b, v4.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8f9  // sdot v25.4s, v7.16b, v4.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8da  // sdot v26.4s, v6.16b, v4.4b[3]\n"
+      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8fb  // sdot v27.4s, v7.16b, v4.4b[3]\n"
+      "bge 158b\n"
+      "159:"  // Height 5: Multiply loop: Single iteration only
+      "sub x11, x11, #0x10\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q4, [x22, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "add x24, x24, #0x10\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      "add x22, x22, #0x10\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0d8  // sdot v24.4s, v6.16b, v4.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0f9  // sdot v25.4s, v7.16b, v4.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0da  // sdot v26.4s, v6.16b, v4.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0fb  // sdot v27.4s, v7.16b, v4.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8d8  // sdot v24.4s, v6.16b, v4.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8f9  // sdot v25.4s, v7.16b, v4.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8da  // sdot v26.4s, v6.16b, v4.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8fb  // sdot v27.4s, v7.16b, v4.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8d8  // sdot v24.4s, v6.16b, v4.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8f9  // sdot v25.4s, v7.16b, v4.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8da  // sdot v26.4s, v6.16b, v4.4b[3]\n"
+      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8fb  // sdot v27.4s, v7.16b, v4.4b[3]\n"
+      "160:"  // Height 5: Multiply loop: Main loop skip
+      "cbz x11, 165f\n"
+      "cmp x11, #0x4\n"
+      "blt 162f\n"
+      "161:"  // Height 5: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
+      "ldr s3, [x24], #0x4\n"
+      "ldr s4, [x22], #0x4\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "sub x11, x11, #0x4\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      "cmp x11, #0x4\n"
+      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
+      "bge 161b\n"
+      "cbz x11, 165f\n"
+      "162:"  // Height 5: Multiply loop: Skip odd blocks
+      "tbz x11, #1, 163f\n"
+      "ldr h0, [x10], #0x2\n"
+      "ldr h1, [x28], #0x2\n"
+      "ldr h2, [x26], #0x2\n"
+      "ldr h3, [x24], #0x2\n"
+      "ldr h4, [x22], #0x2\n"
+      "tbz x11, #0, 164f\n"
+      "ld1 { v0.b }[2], [x10]\n"
+      "ld1 { v1.b }[2], [x28]\n"
+      "ld1 { v2.b }[2], [x26]\n"
+      "ld1 { v3.b }[2], [x24]\n"
+      "ld1 { v4.b }[2], [x22]\n"
+      "b 164f\n"
+      "163:"  // Height 5: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b0, [x10, #0x0]\n"
+      "ldr b1, [x28, #0x0]\n"
+      "ldr b2, [x26, #0x0]\n"
+      "ldr b3, [x24, #0x0]\n"
+      "ldr b4, [x22, #0x0]\n"
+      "164:"  // Height 5: Multiply loop: Ragged operand read: Done
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
+      "165:"  // Height 5: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 155b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "cmp x15, #0x10\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "bge 174f\n"
+      "tbz x15, #3, 169f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v9.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v13.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "st1 { v17.4s }, [x27], #0x10\n"
+      "st1 { v20.4s }, [x25], #0x10\n"
+      "st1 { v21.4s }, [x25], #0x10\n"
+      "st1 { v24.4s }, [x23], #0x10\n"
+      "st1 { v25.4s }, [x23], #0x10\n"
+      "tbz x15, #2, 167f\n"
+      "st1 { v10.4s }, [x13], #0x10\n"
+      "st1 { v14.4s }, [x9], #0x10\n"
+      "st1 { v18.4s }, [x27], #0x10\n"
+      "st1 { v22.4s }, [x25], #0x10\n"
+      "st1 { v26.4s }, [x23], #0x10\n"
+      "tbz x15, #1, 166f\n"
+      "str d11, [x13], #0x8\n"
+      "str d15, [x9], #0x8\n"
+      "str d19, [x27], #0x8\n"
+      "str d23, [x25], #0x8\n"
+      "str d27, [x23], #0x8\n"
+      "tbz x15, #0, 173f\n"
+      "st1 { v11.s }[2], [x13]\n"
+      "st1 { v15.s }[2], [x9]\n"
+      "st1 { v19.s }[2], [x27]\n"
+      "st1 { v23.s }[2], [x25]\n"
+      "st1 { v27.s }[2], [x23]\n"
+      "b 173f\n"
+      "166:"  // Height 5: Partial direct writeback: partial_1_12
+      "tbz x15, #0, 173f\n"
+      "str s11, [x13, #0x0]\n"
+      "str s15, [x9, #0x0]\n"
+      "str s19, [x27, #0x0]\n"
+      "str s23, [x25, #0x0]\n"
+      "str s27, [x23, #0x0]\n"
+      "b 173f\n"
+      "167:"  // Height 5: Partial direct writeback: partial_2_8
+      "tbz x15, #1, 168f\n"
+      "str d10, [x13], #0x8\n"
+      "str d14, [x9], #0x8\n"
+      "str d18, [x27], #0x8\n"
+      "str d22, [x25], #0x8\n"
+      "str d26, [x23], #0x8\n"
+      "tbz x15, #0, 173f\n"
+      "st1 { v10.s }[2], [x13]\n"
+      "st1 { v14.s }[2], [x9]\n"
+      "st1 { v18.s }[2], [x27]\n"
+      "st1 { v22.s }[2], [x25]\n"
+      "st1 { v26.s }[2], [x23]\n"
+      "b 173f\n"
+      "168:"  // Height 5: Partial direct writeback: partial_1_8
+      "tbz x15, #0, 173f\n"
+      "str s10, [x13, #0x0]\n"
+      "str s14, [x9, #0x0]\n"
+      "str s18, [x27, #0x0]\n"
+      "str s22, [x25, #0x0]\n"
+      "str s26, [x23, #0x0]\n"
+      "b 173f\n"
+      "169:"  // Height 5: Partial direct writeback: partial_4_0
+      "tbz x15, #2, 171f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "st1 { v20.4s }, [x25], #0x10\n"
+      "st1 { v24.4s }, [x23], #0x10\n"
+      "tbz x15, #1, 170f\n"
+      "str d9, [x13], #0x8\n"
+      "str d13, [x9], #0x8\n"
+      "str d17, [x27], #0x8\n"
+      "str d21, [x25], #0x8\n"
+      "str d25, [x23], #0x8\n"
+      "tbz x15, #0, 173f\n"
+      "st1 { v9.s }[2], [x13]\n"
+      "st1 { v13.s }[2], [x9]\n"
+      "st1 { v17.s }[2], [x27]\n"
+      "st1 { v21.s }[2], [x25]\n"
+      "st1 { v25.s }[2], [x23]\n"
+      "b 173f\n"
+      "170:"  // Height 5: Partial direct writeback: partial_1_4
+      "tbz x15, #0, 173f\n"
+      "str s9, [x13, #0x0]\n"
+      "str s13, [x9, #0x0]\n"
+      "str s17, [x27, #0x0]\n"
+      "str s21, [x25, #0x0]\n"
+      "str s25, [x23, #0x0]\n"
+      "b 173f\n"
+      "171:"  // Height 5: Partial direct writeback: partial_2_0
+      "tbz x15, #1, 172f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "str d16, [x27], #0x8\n"
+      "str d20, [x25], #0x8\n"
+      "str d24, [x23], #0x8\n"
+      "tbz x15, #0, 173f\n"
+      "st1 { v8.s }[2], [x13]\n"
+      "st1 { v12.s }[2], [x9]\n"
+      "st1 { v16.s }[2], [x27]\n"
+      "st1 { v20.s }[2], [x25]\n"
+      "st1 { v24.s }[2], [x23]\n"
+      "b 173f\n"
+      "172:"  // Height 5: Partial direct writeback: partial_1_0
+      "str s8, [x13, #0x0]\n"
+      "str s12, [x9, #0x0]\n"
+      "str s16, [x27, #0x0]\n"
+      "str s20, [x25, #0x0]\n"
+      "str s24, [x23, #0x0]\n"
+      "173:"  // Height 5: Partial direct writeback: Done
+      "b 175f\n"
+      "174:"  // Height 5: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "str q12, [x9, #0x0]\n"
+      "str q13, [x9, #0x10]\n"
+      "str q14, [x9, #0x20]\n"
+      "str q15, [x9, #0x30]\n"
+      "str q16, [x27, #0x0]\n"
+      "str q17, [x27, #0x10]\n"
+      "str q18, [x27, #0x20]\n"
+      "str q19, [x27, #0x30]\n"
+      "str q20, [x25, #0x0]\n"
+      "str q21, [x25, #0x10]\n"
+      "str q22, [x25, #0x20]\n"
+      "str q23, [x25, #0x30]\n"
+      "str q24, [x23, #0x0]\n"
+      "str q25, [x23, #0x10]\n"
+      "str q26, [x23, #0x20]\n"
+      "str q27, [x23, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "add x9, x9, #0x40\n"
+      "add x27, x27, #0x40\n"
+      "add x25, x25, #0x40\n"
+      "add x23, x23, #0x40\n"
+      "175:"  // Height 5: Writeback done
+      "subs x15, x15, #0x10\n"
+      "bgt 143b\n"
+      "b 212f\n"
+      "176:"  // Height 6
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 177f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "ldr x23, [%x[output_ptr], #0x20]\n"
+      "add x27, x27, x19, LSL #2\n"
+      "ldr x21, [%x[output_ptr], #0x28]\n"
+      "add %x[output_ptr], %x[output_ptr], #0x30\n"
+      "add x25, x25, x19, LSL #2\n"
+      "add x23, x23, x19, LSL #2\n"
+      "add x21, x21, x19, LSL #2\n"
+      "b 178f\n"
+      "177:"  // Height 6: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "add x23, x25, x19, LSL #2\n"
+      "add x21, x23, x19, LSL #2\n"
+      "add %x[output_ptr], x21, x19, LSL #2\n"
+      "178:"  // Height 6: Column loop
+      "tbz %x[flags], #0, 188f\n"
+      "cmp x15, #0x10\n"
+      "bge 187f\n"
+      "tbz x15, #3, 182f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "ld1 { v20.4s }, [x25], #0x10\n"
+      "ld1 { v24.4s }, [x23], #0x10\n"
+      "ld1 { v28.4s }, [x21], #0x10\n"
+      "ld1 { v9.4s }, [x13], #0x10\n"
+      "ld1 { v13.4s }, [x9], #0x10\n"
+      "ld1 { v17.4s }, [x27], #0x10\n"
+      "ld1 { v21.4s }, [x25], #0x10\n"
+      "ld1 { v25.4s }, [x23], #0x10\n"
+      "ld1 { v29.4s }, [x21], #0x10\n"
+      "tbz x15, #2, 180f\n"
+      "ld1 { v10.4s }, [x13], #0x10\n"
+      "ld1 { v14.4s }, [x9], #0x10\n"
+      "ld1 { v18.4s }, [x27], #0x10\n"
+      "ld1 { v22.4s }, [x25], #0x10\n"
+      "ld1 { v26.4s }, [x23], #0x10\n"
+      "ld1 { v30.4s }, [x21], #0x10\n"
+      "tbz x15, #1, 179f\n"
+      "mov x19, #0x38\n"
+      "ldr d11, [x13], #0x8\n"
+      "ldr d15, [x9], #0x8\n"
+      "ldr d19, [x27], #0x8\n"
+      "ldr d23, [x25], #0x8\n"
+      "ldr d27, [x23], #0x8\n"
+      "ldr d31, [x21], #0x8\n"
+      "tbz x15, #0, 186f\n"
+      "ld1 { v11.s }[2], [x13]\n"
+      "ld1 { v15.s }[2], [x9]\n"
+      "ld1 { v19.s }[2], [x27]\n"
+      "ld1 { v23.s }[2], [x25]\n"
+      "ld1 { v27.s }[2], [x23]\n"
+      "ld1 { v31.s }[2], [x21]\n"
+      "b 186f\n"
+      "179:"  // Height 6: Partial accumulate: partial_1_12
+      "mov x19, #0x30\n"
+      "tbz x15, #0, 186f\n"
+      "ldr s11, [x13, #0x0]\n"
+      "ldr s15, [x9, #0x0]\n"
+      "ldr s19, [x27, #0x0]\n"
+      "ldr s23, [x25, #0x0]\n"
+      "ldr s27, [x23, #0x0]\n"
+      "ldr s31, [x21, #0x0]\n"
+      "b 186f\n"
+      "180:"  // Height 6: Partial accumulate: partial_2_8
+      "tbz x15, #1, 181f\n"
+      "ldr d10, [x13], #0x8\n"
+      "ldr d14, [x9], #0x8\n"
+      "ldr d18, [x27], #0x8\n"
+      "ldr d22, [x25], #0x8\n"
+      "ldr d26, [x23], #0x8\n"
+      "ldr d30, [x21], #0x8\n"
+      "mov x19, #0x28\n"
+      "tbz x15, #0, 186f\n"
+      "ld1 { v10.s }[2], [x13]\n"
+      "ld1 { v14.s }[2], [x9]\n"
+      "ld1 { v18.s }[2], [x27]\n"
+      "ld1 { v22.s }[2], [x25]\n"
+      "ld1 { v26.s }[2], [x23]\n"
+      "ld1 { v30.s }[2], [x21]\n"
+      "b 186f\n"
+      "181:"  // Height 6: Partial accumulate: partial_1_8
+      "mov x19, #0x20\n"
+      "tbz x15, #0, 186f\n"
+      "ldr s10, [x13, #0x0]\n"
+      "ldr s14, [x9, #0x0]\n"
+      "ldr s18, [x27, #0x0]\n"
+      "ldr s22, [x25, #0x0]\n"
+      "ldr s26, [x23, #0x0]\n"
+      "ldr s30, [x21, #0x0]\n"
+      "b 186f\n"
+      "182:"  // Height 6: Partial accumulate: partial_4_0
+      "tbz x15, #2, 184f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "ld1 { v20.4s }, [x25], #0x10\n"
+      "ld1 { v24.4s }, [x23], #0x10\n"
+      "ld1 { v28.4s }, [x21], #0x10\n"
+      "tbz x15, #1, 183f\n"
+      "mov x19, #0x18\n"
+      "ldr d9, [x13], #0x8\n"
+      "ldr d13, [x9], #0x8\n"
+      "ldr d17, [x27], #0x8\n"
+      "ldr d21, [x25], #0x8\n"
+      "ldr d25, [x23], #0x8\n"
+      "ldr d29, [x21], #0x8\n"
+      "tbz x15, #0, 186f\n"
+      "ld1 { v9.s }[2], [x13]\n"
+      "ld1 { v13.s }[2], [x9]\n"
+      "ld1 { v17.s }[2], [x27]\n"
+      "ld1 { v21.s }[2], [x25]\n"
+      "ld1 { v25.s }[2], [x23]\n"
+      "ld1 { v29.s }[2], [x21]\n"
+      "b 186f\n"
+      "183:"  // Height 6: Partial accumulate: partial_1_4
+      "mov x19, #0x10\n"
+      "tbz x15, #0, 186f\n"
+      "ldr s9, [x13, #0x0]\n"
+      "ldr s13, [x9, #0x0]\n"
+      "ldr s17, [x27, #0x0]\n"
+      "ldr s21, [x25, #0x0]\n"
+      "ldr s25, [x23, #0x0]\n"
+      "ldr s29, [x21, #0x0]\n"
+      "b 186f\n"
+      "184:"  // Height 6: Partial accumulate: partial_2_0
+      "tbz x15, #1, 185f\n"
+      "ldr d8, [x13], #0x8\n"
+      "ldr d12, [x9], #0x8\n"
+      "ldr d16, [x27], #0x8\n"
+      "ldr d20, [x25], #0x8\n"
+      "ldr d24, [x23], #0x8\n"
+      "ldr d28, [x21], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x15, #0, 186f\n"
+      "ld1 { v8.s }[2], [x13]\n"
+      "ld1 { v12.s }[2], [x9]\n"
+      "ld1 { v16.s }[2], [x27]\n"
+      "ld1 { v20.s }[2], [x25]\n"
+      "ld1 { v24.s }[2], [x23]\n"
+      "ld1 { v28.s }[2], [x21]\n"
+      "b 186f\n"
+      "185:"  // Height 6: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s8, [x13, #0x0]\n"
+      "ldr s12, [x9, #0x0]\n"
+      "ldr s16, [x27, #0x0]\n"
+      "ldr s20, [x25, #0x0]\n"
+      "ldr s24, [x23, #0x0]\n"
+      "ldr s28, [x21, #0x0]\n"
+      "186:"  // Height 6: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "sub x9, x9, x19\n"
+      "sub x27, x27, x19\n"
+      "sub x25, x25, x19\n"
+      "sub x23, x23, x19\n"
+      "sub x21, x21, x19\n"
+      "b 189f\n"
+      "187:"  // Height 6: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "ldr q12, [x9, #0x0]\n"
+      "ldr q13, [x9, #0x10]\n"
+      "ldr q14, [x9, #0x20]\n"
+      "ldr q15, [x9, #0x30]\n"
+      "ldr q16, [x27, #0x0]\n"
+      "ldr q17, [x27, #0x10]\n"
+      "ldr q18, [x27, #0x20]\n"
+      "ldr q19, [x27, #0x30]\n"
+      "ldr q20, [x25, #0x0]\n"
+      "ldr q21, [x25, #0x10]\n"
+      "ldr q22, [x25, #0x20]\n"
+      "ldr q23, [x25, #0x30]\n"
+      "ldr q24, [x23, #0x0]\n"
+      "ldr q25, [x23, #0x10]\n"
+      "ldr q26, [x23, #0x20]\n"
+      "ldr q27, [x23, #0x30]\n"
+      "ldr q28, [x21, #0x0]\n"
+      "ldr q29, [x21, #0x10]\n"
+      "ldr q30, [x21, #0x20]\n"
+      "ldr q31, [x21, #0x30]\n"
+      "b 189f\n"
+      "188:"  // Height 6: no accumulate
+      "movi v8.4s, #0x0\n"
+      "movi v9.4s, #0x0\n"
+      "movi v10.4s, #0x0\n"
+      "movi v11.4s, #0x0\n"
+      "movi v12.4s, #0x0\n"
+      "movi v13.4s, #0x0\n"
+      "movi v14.4s, #0x0\n"
+      "movi v15.4s, #0x0\n"
+      "movi v16.4s, #0x0\n"
+      "movi v17.4s, #0x0\n"
+      "movi v18.4s, #0x0\n"
+      "movi v19.4s, #0x0\n"
+      "movi v20.4s, #0x0\n"
+      "movi v21.4s, #0x0\n"
+      "movi v22.4s, #0x0\n"
+      "movi v23.4s, #0x0\n"
+      "movi v24.4s, #0x0\n"
+      "movi v25.4s, #0x0\n"
+      "movi v26.4s, #0x0\n"
+      "movi v27.4s, #0x0\n"
+      "movi v28.4s, #0x0\n"
+      "movi v29.4s, #0x0\n"
+      "movi v30.4s, #0x0\n"
+      "movi v31.4s, #0x0\n"
+      "189:"  // Height 6: setup done
+      "mov x12, #0x0\n"
+      "190:"  // Height 6: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 191f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x20, [x20, #0x28]\n"
+      "cbnz x12, 192f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "add x22, x22, x19\n"
+      "add x20, x20, x19\n"
+      "b 192f\n"
+      "191:"  // Height 6: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "add x26, x28, x19\n"
+      "add x24, x26, x19\n"
+      "add x22, x24, x19\n"
+      "add x20, x22, x19\n"
+      "192:"  // Height 6: input setup done
+      "cmp x11, #0x10\n"
+      "blt 195f\n"
+      "cmp x11, #0x20\n"
+      "blt 194f\n"
+      "193:"  // Height 6: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q4, [x22, #0x0]\n"
+      "ldr q5, [x20, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x4f85e0dc  // sdot v28.4s, v6.16b, v5.4b[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "add x22, x22, #0x10\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      "add x20, x20, #0x10\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "sub x11, x11, #0x10\n"
+      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
+      "cmp x11, #0x20\n"
+      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0fd  // sdot v29.4s, v7.16b, v5.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0de  // sdot v30.4s, v6.16b, v5.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0ff  // sdot v31.4s, v7.16b, v5.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0d8  // sdot v24.4s, v6.16b, v4.4b[1]\n"
+      ".inst 0x4fa5e0dc  // sdot v28.4s, v6.16b, v5.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0f9  // sdot v25.4s, v7.16b, v4.4b[1]\n"
+      ".inst 0x4fa5e0fd  // sdot v29.4s, v7.16b, v5.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0da  // sdot v26.4s, v6.16b, v4.4b[1]\n"
+      ".inst 0x4fa5e0de  // sdot v30.4s, v6.16b, v5.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0fb  // sdot v27.4s, v7.16b, v4.4b[1]\n"
+      ".inst 0x4fa5e0ff  // sdot v31.4s, v7.16b, v5.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8d8  // sdot v24.4s, v6.16b, v4.4b[2]\n"
+      ".inst 0x4f85e8dc  // sdot v28.4s, v6.16b, v5.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8f9  // sdot v25.4s, v7.16b, v4.4b[2]\n"
+      ".inst 0x4f85e8fd  // sdot v29.4s, v7.16b, v5.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8da  // sdot v26.4s, v6.16b, v4.4b[2]\n"
+      ".inst 0x4f85e8de  // sdot v30.4s, v6.16b, v5.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8fb  // sdot v27.4s, v7.16b, v4.4b[2]\n"
+      ".inst 0x4f85e8ff  // sdot v31.4s, v7.16b, v5.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8d8  // sdot v24.4s, v6.16b, v4.4b[3]\n"
+      ".inst 0x4fa5e8dc  // sdot v28.4s, v6.16b, v5.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8f9  // sdot v25.4s, v7.16b, v4.4b[3]\n"
+      ".inst 0x4fa5e8fd  // sdot v29.4s, v7.16b, v5.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8da  // sdot v26.4s, v6.16b, v4.4b[3]\n"
+      ".inst 0x4fa5e8de  // sdot v30.4s, v6.16b, v5.4b[3]\n"
+      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8fb  // sdot v27.4s, v7.16b, v4.4b[3]\n"
+      ".inst 0x4fa5e8ff  // sdot v31.4s, v7.16b, v5.4b[3]\n"
+      "bge 193b\n"
+      "194:"  // Height 6: Multiply loop: Single iteration only
+      "sub x11, x11, #0x10\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q4, [x22, #0x0]\n"
+      "ldr q5, [x20, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x4f85e0dc  // sdot v28.4s, v6.16b, v5.4b[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      "add x22, x22, #0x10\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      "add x20, x20, #0x10\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0fd  // sdot v29.4s, v7.16b, v5.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0de  // sdot v30.4s, v6.16b, v5.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0ff  // sdot v31.4s, v7.16b, v5.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x4fa0e0c8  // sdot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0cc  // sdot v12.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d0  // sdot v16.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0d4  // sdot v20.4s, v6.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0d8  // sdot v24.4s, v6.16b, v4.4b[1]\n"
+      ".inst 0x4fa5e0dc  // sdot v28.4s, v6.16b, v5.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x4fa0e0e9  // sdot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ed  // sdot v13.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f1  // sdot v17.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0f5  // sdot v21.4s, v7.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0f9  // sdot v25.4s, v7.16b, v4.4b[1]\n"
+      ".inst 0x4fa5e0fd  // sdot v29.4s, v7.16b, v5.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x4fa0e0ca  // sdot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ce  // sdot v14.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0d2  // sdot v18.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0d6  // sdot v22.4s, v6.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0da  // sdot v26.4s, v6.16b, v4.4b[1]\n"
+      ".inst 0x4fa5e0de  // sdot v30.4s, v6.16b, v5.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x4fa0e0eb  // sdot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x4fa1e0ef  // sdot v15.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x4fa2e0f3  // sdot v19.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x4fa3e0f7  // sdot v23.4s, v7.16b, v3.4b[1]\n"
+      ".inst 0x4fa4e0fb  // sdot v27.4s, v7.16b, v4.4b[1]\n"
+      ".inst 0x4fa5e0ff  // sdot v31.4s, v7.16b, v5.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x4f80e8c8  // sdot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8cc  // sdot v12.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d0  // sdot v16.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8d4  // sdot v20.4s, v6.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8d8  // sdot v24.4s, v6.16b, v4.4b[2]\n"
+      ".inst 0x4f85e8dc  // sdot v28.4s, v6.16b, v5.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x4f80e8e9  // sdot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ed  // sdot v13.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f1  // sdot v17.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8f5  // sdot v21.4s, v7.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8f9  // sdot v25.4s, v7.16b, v4.4b[2]\n"
+      ".inst 0x4f85e8fd  // sdot v29.4s, v7.16b, v5.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x4f80e8ca  // sdot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ce  // sdot v14.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8d2  // sdot v18.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8d6  // sdot v22.4s, v6.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8da  // sdot v26.4s, v6.16b, v4.4b[2]\n"
+      ".inst 0x4f85e8de  // sdot v30.4s, v6.16b, v5.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x4f80e8eb  // sdot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x4f81e8ef  // sdot v15.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x4f82e8f3  // sdot v19.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x4f83e8f7  // sdot v23.4s, v7.16b, v3.4b[2]\n"
+      ".inst 0x4f84e8fb  // sdot v27.4s, v7.16b, v4.4b[2]\n"
+      ".inst 0x4f85e8ff  // sdot v31.4s, v7.16b, v5.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x4fa0e8c8  // sdot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8cc  // sdot v12.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d0  // sdot v16.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8d4  // sdot v20.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8d8  // sdot v24.4s, v6.16b, v4.4b[3]\n"
+      ".inst 0x4fa5e8dc  // sdot v28.4s, v6.16b, v5.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x4fa0e8e9  // sdot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f1  // sdot v17.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8f5  // sdot v21.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8f9  // sdot v25.4s, v7.16b, v4.4b[3]\n"
+      ".inst 0x4fa5e8fd  // sdot v29.4s, v7.16b, v5.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      ".inst 0x4fa0e8ca  // sdot v10.4s, v6.16b, v0.4b[3]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x4fa1e8ce  // sdot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8d2  // sdot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8d6  // sdot v22.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8da  // sdot v26.4s, v6.16b, v4.4b[3]\n"
+      ".inst 0x4fa5e8de  // sdot v30.4s, v6.16b, v5.4b[3]\n"
+      ".inst 0x4fa0e8eb  // sdot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x4fa1e8ef  // sdot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x4fa2e8f3  // sdot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x4fa3e8f7  // sdot v23.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x4fa4e8fb  // sdot v27.4s, v7.16b, v4.4b[3]\n"
+      ".inst 0x4fa5e8ff  // sdot v31.4s, v7.16b, v5.4b[3]\n"
+      "195:"  // Height 6: Multiply loop: Main loop skip
+      "cbz x11, 200f\n"
+      "cmp x11, #0x4\n"
+      "blt 197f\n"
+      "196:"  // Height 6: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
+      "ldr s3, [x24], #0x4\n"
+      "ldr s4, [x22], #0x4\n"
+      "ldr s5, [x20], #0x4\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      "sub x11, x11, #0x4\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      "cmp x11, #0x4\n"
+      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0dc  // sdot v28.4s, v6.16b, v5.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0fd  // sdot v29.4s, v7.16b, v5.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0de  // sdot v30.4s, v6.16b, v5.4b[0]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0ff  // sdot v31.4s, v7.16b, v5.4b[0]\n"
+      "bge 196b\n"
+      "cbz x11, 200f\n"
+      "197:"  // Height 6: Multiply loop: Skip odd blocks
+      "tbz x11, #1, 198f\n"
+      "ldr h0, [x10], #0x2\n"
+      "ldr h1, [x28], #0x2\n"
+      "ldr h2, [x26], #0x2\n"
+      "ldr h3, [x24], #0x2\n"
+      "ldr h4, [x22], #0x2\n"
+      "ldr h5, [x20], #0x2\n"
+      "tbz x11, #0, 199f\n"
+      "ld1 { v0.b }[2], [x10]\n"
+      "ld1 { v1.b }[2], [x28]\n"
+      "ld1 { v2.b }[2], [x26]\n"
+      "ld1 { v3.b }[2], [x24]\n"
+      "ld1 { v4.b }[2], [x22]\n"
+      "ld1 { v5.b }[2], [x20]\n"
+      "b 199f\n"
+      "198:"  // Height 6: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b0, [x10, #0x0]\n"
+      "ldr b1, [x28, #0x0]\n"
+      "ldr b2, [x26, #0x0]\n"
+      "ldr b3, [x24, #0x0]\n"
+      "ldr b4, [x22, #0x0]\n"
+      "ldr b5, [x20, #0x0]\n"
+      "199:"  // Height 6: Multiply loop: Ragged operand read: Done
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x4f80e0c8  // sdot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d0  // sdot v16.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d4  // sdot v20.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0d8  // sdot v24.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0dc  // sdot v28.4s, v6.16b, v5.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x4f80e0e9  // sdot v9.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ed  // sdot v13.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f1  // sdot v17.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f5  // sdot v21.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0f9  // sdot v25.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0fd  // sdot v29.4s, v7.16b, v5.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x4f80e0ca  // sdot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x4f81e0ce  // sdot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0d2  // sdot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0d6  // sdot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0da  // sdot v26.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0de  // sdot v30.4s, v6.16b, v5.4b[0]\n"
+      ".inst 0x4f80e0eb  // sdot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x4f81e0ef  // sdot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x4f82e0f3  // sdot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x4f83e0f7  // sdot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x4f84e0fb  // sdot v27.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x4f85e0ff  // sdot v31.4s, v7.16b, v5.4b[0]\n"
+      "200:"  // Height 6: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 190b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "cmp x15, #0x10\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "prfm pstl1keep, [x21, #0x0]\n"
+      "bge 209f\n"
+      "tbz x15, #3, 204f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v9.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v13.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "st1 { v17.4s }, [x27], #0x10\n"
+      "st1 { v20.4s }, [x25], #0x10\n"
+      "st1 { v21.4s }, [x25], #0x10\n"
+      "st1 { v24.4s }, [x23], #0x10\n"
+      "st1 { v25.4s }, [x23], #0x10\n"
+      "st1 { v28.4s }, [x21], #0x10\n"
+      "st1 { v29.4s }, [x21], #0x10\n"
+      "tbz x15, #2, 202f\n"
+      "st1 { v10.4s }, [x13], #0x10\n"
+      "st1 { v14.4s }, [x9], #0x10\n"
+      "st1 { v18.4s }, [x27], #0x10\n"
+      "st1 { v22.4s }, [x25], #0x10\n"
+      "st1 { v26.4s }, [x23], #0x10\n"
+      "st1 { v30.4s }, [x21], #0x10\n"
+      "tbz x15, #1, 201f\n"
+      "str d11, [x13], #0x8\n"
+      "str d15, [x9], #0x8\n"
+      "str d19, [x27], #0x8\n"
+      "str d23, [x25], #0x8\n"
+      "str d27, [x23], #0x8\n"
+      "str d31, [x21], #0x8\n"
+      "tbz x15, #0, 208f\n"
+      "st1 { v11.s }[2], [x13]\n"
+      "st1 { v15.s }[2], [x9]\n"
+      "st1 { v19.s }[2], [x27]\n"
+      "st1 { v23.s }[2], [x25]\n"
+      "st1 { v27.s }[2], [x23]\n"
+      "st1 { v31.s }[2], [x21]\n"
+      "b 208f\n"
+      "201:"  // Height 6: Partial direct writeback: partial_1_12
+      "tbz x15, #0, 208f\n"
+      "str s11, [x13, #0x0]\n"
+      "str s15, [x9, #0x0]\n"
+      "str s19, [x27, #0x0]\n"
+      "str s23, [x25, #0x0]\n"
+      "str s27, [x23, #0x0]\n"
+      "str s31, [x21, #0x0]\n"
+      "b 208f\n"
+      "202:"  // Height 6: Partial direct writeback: partial_2_8
+      "tbz x15, #1, 203f\n"
+      "str d10, [x13], #0x8\n"
+      "str d14, [x9], #0x8\n"
+      "str d18, [x27], #0x8\n"
+      "str d22, [x25], #0x8\n"
+      "str d26, [x23], #0x8\n"
+      "str d30, [x21], #0x8\n"
+      "tbz x15, #0, 208f\n"
+      "st1 { v10.s }[2], [x13]\n"
+      "st1 { v14.s }[2], [x9]\n"
+      "st1 { v18.s }[2], [x27]\n"
+      "st1 { v22.s }[2], [x25]\n"
+      "st1 { v26.s }[2], [x23]\n"
+      "st1 { v30.s }[2], [x21]\n"
+      "b 208f\n"
+      "203:"  // Height 6: Partial direct writeback: partial_1_8
+      "tbz x15, #0, 208f\n"
+      "str s10, [x13, #0x0]\n"
+      "str s14, [x9, #0x0]\n"
+      "str s18, [x27, #0x0]\n"
+      "str s22, [x25, #0x0]\n"
+      "str s26, [x23, #0x0]\n"
+      "str s30, [x21, #0x0]\n"
+      "b 208f\n"
+      "204:"  // Height 6: Partial direct writeback: partial_4_0
+      "tbz x15, #2, 206f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "st1 { v20.4s }, [x25], #0x10\n"
+      "st1 { v24.4s }, [x23], #0x10\n"
+      "st1 { v28.4s }, [x21], #0x10\n"
+      "tbz x15, #1, 205f\n"
+      "str d9, [x13], #0x8\n"
+      "str d13, [x9], #0x8\n"
+      "str d17, [x27], #0x8\n"
+      "str d21, [x25], #0x8\n"
+      "str d25, [x23], #0x8\n"
+      "str d29, [x21], #0x8\n"
+      "tbz x15, #0, 208f\n"
+      "st1 { v9.s }[2], [x13]\n"
+      "st1 { v13.s }[2], [x9]\n"
+      "st1 { v17.s }[2], [x27]\n"
+      "st1 { v21.s }[2], [x25]\n"
+      "st1 { v25.s }[2], [x23]\n"
+      "st1 { v29.s }[2], [x21]\n"
+      "b 208f\n"
+      "205:"  // Height 6: Partial direct writeback: partial_1_4
+      "tbz x15, #0, 208f\n"
+      "str s9, [x13, #0x0]\n"
+      "str s13, [x9, #0x0]\n"
+      "str s17, [x27, #0x0]\n"
+      "str s21, [x25, #0x0]\n"
+      "str s25, [x23, #0x0]\n"
+      "str s29, [x21, #0x0]\n"
+      "b 208f\n"
+      "206:"  // Height 6: Partial direct writeback: partial_2_0
+      "tbz x15, #1, 207f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "str d16, [x27], #0x8\n"
+      "str d20, [x25], #0x8\n"
+      "str d24, [x23], #0x8\n"
+      "str d28, [x21], #0x8\n"
+      "tbz x15, #0, 208f\n"
+      "st1 { v8.s }[2], [x13]\n"
+      "st1 { v12.s }[2], [x9]\n"
+      "st1 { v16.s }[2], [x27]\n"
+      "st1 { v20.s }[2], [x25]\n"
+      "st1 { v24.s }[2], [x23]\n"
+      "st1 { v28.s }[2], [x21]\n"
+      "b 208f\n"
+      "207:"  // Height 6: Partial direct writeback: partial_1_0
+      "str s8, [x13, #0x0]\n"
+      "str s12, [x9, #0x0]\n"
+      "str s16, [x27, #0x0]\n"
+      "str s20, [x25, #0x0]\n"
+      "str s24, [x23, #0x0]\n"
+      "str s28, [x21, #0x0]\n"
+      "208:"  // Height 6: Partial direct writeback: Done
+      "b 210f\n"
+      "209:"  // Height 6: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "str q12, [x9, #0x0]\n"
+      "str q13, [x9, #0x10]\n"
+      "str q14, [x9, #0x20]\n"
+      "str q15, [x9, #0x30]\n"
+      "str q16, [x27, #0x0]\n"
+      "str q17, [x27, #0x10]\n"
+      "str q18, [x27, #0x20]\n"
+      "str q19, [x27, #0x30]\n"
+      "str q20, [x25, #0x0]\n"
+      "str q21, [x25, #0x10]\n"
+      "str q22, [x25, #0x20]\n"
+      "str q23, [x25, #0x30]\n"
+      "str q24, [x23, #0x0]\n"
+      "str q25, [x23, #0x10]\n"
+      "str q26, [x23, #0x20]\n"
+      "str q27, [x23, #0x30]\n"
+      "str q28, [x21, #0x0]\n"
+      "str q29, [x21, #0x10]\n"
+      "str q30, [x21, #0x20]\n"
+      "str q31, [x21, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "add x9, x9, #0x40\n"
+      "add x27, x27, #0x40\n"
+      "add x25, x25, #0x40\n"
+      "add x23, x23, #0x40\n"
+      "add x21, x21, #0x40\n"
+      "210:"  // Height 6: Writeback done
+      "subs x15, x15, #0x10\n"
+      "bgt 178b\n"
+      "subs %x[M], %x[M], #0x6\n"
+      "beq 212f\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "tbz %x[flags], #3, 211f\n"
+      "add x20, x20, #0x6\n"
+      "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "b 1b\n"
+      "211:"  // Update direct input
+      "mov x19, #0x6\n"
+      "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+      "b 1b\n"
+      "212:"  // Exit
+
+      : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+      : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+    );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
index e5a88b4519..5b4a7f3e86 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,38 +10,43 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
  */
 #pragma once
-
 #ifdef __aarch64__
 
-#include <cstdint>
 #include "../std_transforms_fixed.hpp"
 
+#define ARGLIST  \
+   unsigned int, const unsigned int *, /* num_strings, string_lengths */ \
+   IndirectInputArg<uint8_t>, /* A_arg: direct or indirect input */ \
+   size_t, size_t, /* M, N */ \
+   const uint8_t *, /* B_ptr */ \
+   IndirectOutputArg<uint8_t>, /* output_arg: direct or indirect output */ \
+   const Requantize32 *, const int32_t *, unsigned int /* qp, col_bias, (unnamed in the kernel definition) */
+
 namespace arm_gemm
 {
 
 // Actual kernel implementations
-void a64_hybrid_u8u32_dot_16x4(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
-void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+void a64_hybrid_u8qa_dot_4x16( ARGLIST );
 
-class hybrid_u8u32_dot_16x4
+class cls_a64_hybrid_u8qa_dot_4x16
 {
 public:
     typedef uint8_t operand_type;
-    typedef uint32_t result_type;
+    typedef uint8_t result_type;
 
-    typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+    typedef void (*kern_type)( ARGLIST );
 
     /* Kernel blocking parameters */
     static constexpr unsigned int out_height()
@@ -61,32 +66,20 @@ public:
 
     static constexpr bool supports_accumulate()
     {
-        return true;
-    }
-
-    static constexpr bool supports_bias()
-    {
-        return false;
-    }
-
-    static constexpr bool supports_activation()
-    {
         return false;
     }
 
     StdTransformsFixed<operand_type, result_type, 4, 16, 4> transforms = {};
 
     // Default to the generic kernel
-    kern_type kernel=a64_hybrid_u8u32_dot_16x4;
+    kern_type kernel=a64_hybrid_u8qa_dot_4x16;
 
-    hybrid_u8u32_dot_16x4(const CPUInfo *ci)
+    cls_a64_hybrid_u8qa_dot_4x16(const CPUInfo *)
     {
-        if (ci->get_cpu_model() == CPUModel::A55r1) {
-            kernel = a64_hybrid_u8u32_dot_16x4_a55;
-        }
     }
 };
 
 } // namespace arm_gemm
 
+#undef ARGLIST
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp
new file mode 100644
index 0000000000..ff12472063
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp
@@ -0,0 +1,2072 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_hybrid_u8qa_dot_4x16 (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+    size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint8_t> output_arg,
+    const Requantize32 *qp, const int32_t *col_bias, unsigned int
+)
+{
+    struct KernelArgs {
+        unsigned int num_strings = {};
+        const unsigned int *string_lengths = {};
+        size_t N = {};
+        const uint8_t *B_ptr = {};
+        size_t output_offset = {};
+        size_t input_initial_col = {};
+        size_t input_offset = {};
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    if (qp->c_offset > qp->minval) {
+        flags |= 0x20;
+    }
+    __asm__ __volatile__(
+
+      "1:"  // Row loop
+      "cmp %x[M], #0x4\n"
+      "bge 94f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 63f\n"
+      "beq 32f\n"
+      "movi v11.4s, #0x0\n"
+      "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+      "movi v12.4s, #0x0\n"
+      "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x10, %x[col_bias]\n"
+      "movi v13.4s, #0x0\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "bic %x[flags], %x[flags], #0x80000000\n"
+      "movi v14.4s, #0x0\n"
+      "movi v15.16b, #0x1\n"
+      "tbz %x[flags], #2, 2f\n"
+      "ldr x9, [%x[output_ptr], #0x0]\n"
+      "add x9, x9, x19\n"
+      "b 3f\n"
+      "2:"  // Height 1: setup direct output
+      "mov x9, %x[output_ptr]\n"
+      "3:"  // Height 1: Column loop
+      "movi v16.4s, #0x0\n"
+      "movi v17.4s, #0x0\n"
+      "movi v18.4s, #0x0\n"
+      "movi v19.4s, #0x0\n"
+      "4:"  // Height 1: setup done
+      "mov x28, #0x0\n"
+      "5:"  // Height 1: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w27, [x20, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 6f\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "cbnz x28, 7f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "b 7f\n"
+      "6:"  // Height 1: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "7:"  // Height 1: input setup done
+      "cmp x27, #0x10\n"
+      "blt 12f\n"
+      "cmp x27, #0x20\n"
+      "blt 10f\n"
+      "8:"  // Height 1: Multiply loop: Main loop head
+      "ldr q0, [x26, #0x0]\n"
+      "ldr q4, [x11, #0x0]\n"
+      ".inst 0x6f80e090  // udot v16.4s, v4.16b, v0.4b[0]\n"
+      "ldr q5, [x11, #0x10]\n"
+      "ldr q6, [x11, #0x20]\n"
+      ".inst 0x6f80e0b1  // udot v17.4s, v5.16b, v0.4b[0]\n"
+      "ldr q7, [x11, #0x30]\n"
+      ".inst 0x6f80e0d2  // udot v18.4s, v6.16b, v0.4b[0]\n"
+      "ldr q8, [x11, #0x40]\n"
+      "ldr q9, [x11, #0x50]\n"
+      ".inst 0x6f80e0f3  // udot v19.4s, v7.16b, v0.4b[0]\n"
+      "ldr q10, [x11, #0x60]\n"
+      "ldr q4, [x11, #0x70]\n"
+      ".inst 0x6fa0e110  // udot v16.4s, v8.16b, v0.4b[1]\n"
+      ".inst 0x6fa0e131  // udot v17.4s, v9.16b, v0.4b[1]\n"
+      "ldr q5, [x11, #0x80]\n"
+      "ldr q6, [x11, #0x90]\n"
+      ".inst 0x6fa0e152  // udot v18.4s, v10.16b, v0.4b[1]\n"
+      "ldr q7, [x11, #0xa0]\n"
+      ".inst 0x6fa0e093  // udot v19.4s, v4.16b, v0.4b[1]\n"
+      "ldr q8, [x11, #0xb0]\n"
+      "ldr q9, [x11, #0xc0]\n"
+      ".inst 0x6f80e8b0  // udot v16.4s, v5.16b, v0.4b[2]\n"
+      ".inst 0x6f80e8d1  // udot v17.4s, v6.16b, v0.4b[2]\n"
+      "ldr q10, [x11, #0xd0]\n"
+      "ldr q4, [x11, #0xe0]\n"
+      ".inst 0x6f80e8f2  // udot v18.4s, v7.16b, v0.4b[2]\n"
+      "ldr q5, [x11, #0xf0]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x6f80e913  // udot v19.4s, v8.16b, v0.4b[2]\n"
+      ".inst 0x6fa0e930  // udot v16.4s, v9.16b, v0.4b[3]\n"
+      "add x11, x11, #0x100\n"
+      ".inst 0x6fa0e951  // udot v17.4s, v10.16b, v0.4b[3]\n"
+      ".inst 0x6fa0e892  // udot v18.4s, v4.16b, v0.4b[3]\n"
+      ".inst 0x6fa0e8b3  // udot v19.4s, v5.16b, v0.4b[3]\n"
+      "tbnz %x[flags], #31, 9f\n"
+      ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
+      "9:"  // Height 1: Multiply loop: unique 1: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "sub x27, x27, #0x10\n"
+      "cmp x27, #0x20\n"
+      "bge 8b\n"
+      "10:"  // Height 1: Multiply loop: Single iteration only
+      "sub x27, x27, #0x10\n"
+      "ldr q0, [x26, #0x0]\n"
+      "ldr q6, [x11, #0x0]\n"
+      ".inst 0x6f80e0d0  // udot v16.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x11, #0x10]\n"
+      "ldr q8, [x11, #0x20]\n"
+      ".inst 0x6f80e0f1  // udot v17.4s, v7.16b, v0.4b[0]\n"
+      "ldr q9, [x11, #0x30]\n"
+      ".inst 0x6f80e112  // udot v18.4s, v8.16b, v0.4b[0]\n"
+      "ldr q10, [x11, #0x40]\n"
+      "ldr q4, [x11, #0x50]\n"
+      ".inst 0x6f80e133  // udot v19.4s, v9.16b, v0.4b[0]\n"
+      "ldr q5, [x11, #0x60]\n"
+      "ldr q6, [x11, #0x70]\n"
+      ".inst 0x6fa0e150  // udot v16.4s, v10.16b, v0.4b[1]\n"
+      ".inst 0x6fa0e091  // udot v17.4s, v4.16b, v0.4b[1]\n"
+      "ldr q7, [x11, #0x80]\n"
+      "ldr q8, [x11, #0x90]\n"
+      ".inst 0x6fa0e0b2  // udot v18.4s, v5.16b, v0.4b[1]\n"
+      "ldr q9, [x11, #0xa0]\n"
+      ".inst 0x6fa0e0d3  // udot v19.4s, v6.16b, v0.4b[1]\n"
+      "ldr q10, [x11, #0xb0]\n"
+      "ldr q4, [x11, #0xc0]\n"
+      ".inst 0x6f80e8f0  // udot v16.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x6f80e911  // udot v17.4s, v8.16b, v0.4b[2]\n"
+      "ldr q5, [x11, #0xd0]\n"
+      "ldr q6, [x11, #0xe0]\n"
+      ".inst 0x6f80e932  // udot v18.4s, v9.16b, v0.4b[2]\n"
+      "ldr q7, [x11, #0xf0]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x6f80e953  // udot v19.4s, v10.16b, v0.4b[2]\n"
+      ".inst 0x6fa0e890  // udot v16.4s, v4.16b, v0.4b[3]\n"
+      "add x11, x11, #0x100\n"
+      ".inst 0x6fa0e8b1  // udot v17.4s, v5.16b, v0.4b[3]\n"
+      ".inst 0x6fa0e8d2  // udot v18.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x6fa0e8f3  // udot v19.4s, v7.16b, v0.4b[3]\n"
+      "tbnz %x[flags], #31, 11f\n"
+      ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
+      "11:"  // Height 1: Multiply loop: unique 2: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "12:"  // Height 1: Multiply loop: Main loop skip
+      "cbz x27, 19f\n"
+      "cmp x27, #0x4\n"
+      "blt 15f\n"
+      "13:"  // Height 1: Multiply loop: Odd block loop
+      "ldr s0, [x26], #0x4\n"
+      "tbnz %x[flags], #31, 14f\n"
+      ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
+      "14:"  // Height 1: Multiply loop: unique 3: skip row sum
+      "ldr q8, [x11, #0x0]\n"
+      ".inst 0x6f80e110  // udot v16.4s, v8.16b, v0.4b[0]\n"
+      "ldr q9, [x11, #0x10]\n"
+      "ldr q10, [x11, #0x20]\n"
+      ".inst 0x6f80e131  // udot v17.4s, v9.16b, v0.4b[0]\n"
+      "ldr q4, [x11, #0x30]\n"
+      ".inst 0x6f80e152  // udot v18.4s, v10.16b, v0.4b[0]\n"
+      "sub x27, x27, #0x4\n"
+      "add x11, x11, #0x40\n"
+      ".inst 0x6f80e093  // udot v19.4s, v4.16b, v0.4b[0]\n"
+      "cmp x27, #0x4\n"
+      "bge 13b\n"
+      "cbz x27, 19f\n"
+      "15:"  // Height 1: Multiply loop: Skip odd blocks
+      "tbz x27, #1, 16f\n"
+      "ldr h0, [x26], #0x2\n"
+      "tbz x27, #0, 17f\n"
+      "ld1 { v0.b }[2], [x26]\n"
+      "b 17f\n"
+      "16:"  // Height 1: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b0, [x26, #0x0]\n"
+      "17:"  // Height 1: Multiply loop: Ragged operand read: Done
+      "tbnz %x[flags], #31, 18f\n"
+      ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
+      "18:"  // Height 1: Multiply loop: unique 4: skip row sum
+      "ldr q5, [x11, #0x0]\n"
+      ".inst 0x6f80e0b0  // udot v16.4s, v5.16b, v0.4b[0]\n"
+      "ldr q6, [x11, #0x10]\n"
+      "ldr q7, [x11, #0x20]\n"
+      ".inst 0x6f80e0d1  // udot v17.4s, v6.16b, v0.4b[0]\n"
+      "ldr q8, [x11, #0x30]\n"
+      ".inst 0x6f80e0f2  // udot v18.4s, v7.16b, v0.4b[0]\n"
+      "add x11, x11, #0x40\n"
+      ".inst 0x6f80e113  // udot v19.4s, v8.16b, v0.4b[0]\n"
+      "19:"  // Height 1: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x28, x28, #0x1\n"
+      "cmp x28, x19\n"
+      "bne 5b\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "tbnz %x[flags], #31, 20f\n"
+      "addp v11.4s, v11.4s, v11.4s\n"
+      "add x19, %x[qp], %[b_offset]\n"
+      "addp v11.4s, v11.4s, v11.4s\n"
+      "ld1r { v1.4s }, [x19]\n"
+      "neg v1.4s, v1.4s\n"
+      "mul v11.4s, v11.4s, v1.4s\n"
+      "20:"  // Height 1: skip row sum fixup
+      "add v16.4s, v16.4s, v11.4s\n"
+      "orr %x[flags], %x[flags], #0x80000000\n"
+      "add v17.4s, v17.4s, v11.4s\n"
+      "ldr q0, [x10, #0x0]\n"
+      "add v18.4s, v18.4s, v11.4s\n"
+      "ldr q1, [x10, #0x10]\n"
+      "add v19.4s, v19.4s, v11.4s\n"
+      "ldr q2, [x10, #0x20]\n"
+      "ldr q3, [x10, #0x30]\n"
+      "add v16.4s, v16.4s, v0.4s\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add v17.4s, v17.4s, v1.4s\n"
+      "add x19, %x[qp], %[per_layer_mul]\n"
+      "add v18.4s, v18.4s, v2.4s\n"
+      "ld1r { v4.4s }, [x19]\n"
+      "add x10, x10, #0x40\n"
+      "add v19.4s, v19.4s, v3.4s\n"
+      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+      "tbz %x[flags], #5, 21f\n"
+      "and v4.16b, v16.16b, v0.16b\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "and v5.16b, v17.16b, v0.16b\n"
+      "and v6.16b, v18.16b, v0.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "and v7.16b, v19.16b, v0.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v4.4s\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sqadd v17.4s, v17.4s, v5.4s\n"
+      "sqadd v18.4s, v18.4s, v6.4s\n"
+      "sqadd v19.4s, v19.4s, v7.4s\n"
+      "21:"  // Height 1: no shift correction
+      "srshl v16.4s, v16.4s, v0.4s\n"
+      "add x19, %x[qp], %[c_offset]\n"
+      "ld1r { v4.4s }, [x19]\n"
+      "srshl v17.4s, v17.4s, v0.4s\n"
+      "add x19, %x[qp], %[minval]\n"
+      "srshl v18.4s, v18.4s, v0.4s\n"
+      "ld1r { v5.4s }, [x19]\n"
+      "add x19, %x[qp], %[maxval]\n"
+      "srshl v19.4s, v19.4s, v0.4s\n"
+      "ld1r { v6.4s }, [x19]\n"
+      "cmp x12, #0x10\n"
+      "add v16.4s, v16.4s, v4.4s\n"
+      "add v17.4s, v17.4s, v4.4s\n"
+      "add v18.4s, v18.4s, v4.4s\n"
+      "add v19.4s, v19.4s, v4.4s\n"
+      "smin v16.4s, v16.4s, v6.4s\n"
+      "smin v17.4s, v17.4s, v6.4s\n"
+      "smin v18.4s, v18.4s, v6.4s\n"
+      "smax v16.4s, v16.4s, v5.4s\n"
+      "smax v17.4s, v17.4s, v5.4s\n"
+      "smax v18.4s, v18.4s, v5.4s\n"
+      "smin v19.4s, v19.4s, v6.4s\n"
+      "uzp1 v16.8h, v16.8h, v17.8h\n"
+      "smax v19.4s, v19.4s, v5.4s\n"
+      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "uzp1 v16.16b, v16.16b, v17.16b\n"
+      "bge 30f\n"
+      "tbz x12, #3, 25f\n"
+      "str d16, [x9], #0x8\n"
+      "tbz x12, #2, 23f\n"
+      "st1 { v16.s }[2], [x9], #0x4\n"
+      "tbz x12, #1, 22f\n"
+      "st1 { v16.h }[6], [x9], #0x2\n"
+      "tbz x12, #0, 29f\n"
+      "st1 { v16.b }[14], [x9]\n"
+      "b 29f\n"
+      "22:"  // Height 1: Partial direct writeback: partial_1_12
+      "tbz x12, #0, 29f\n"
+      "st1 { v16.b }[12], [x9]\n"
+      "b 29f\n"
+      "23:"  // Height 1: Partial direct writeback: partial_2_8
+      "tbz x12, #1, 24f\n"
+      "st1 { v16.h }[4], [x9], #0x2\n"
+      "tbz x12, #0, 29f\n"
+      "st1 { v16.b }[10], [x9]\n"
+      "b 29f\n"
+      "24:"  // Height 1: Partial direct writeback: partial_1_8
+      "tbz x12, #0, 29f\n"
+      "st1 { v16.b }[8], [x9]\n"
+      "b 29f\n"
+      "25:"  // Height 1: Partial direct writeback: partial_4_0
+      "tbz x12, #2, 27f\n"
+      "str s16, [x9], #0x4\n"
+      "tbz x12, #1, 26f\n"
+      "st1 { v16.h }[2], [x9], #0x2\n"
+      "tbz x12, #0, 29f\n"
+      "st1 { v16.b }[6], [x9]\n"
+      "b 29f\n"
+      "26:"  // Height 1: Partial direct writeback: partial_1_4
+      "tbz x12, #0, 29f\n"
+      "st1 { v16.b }[4], [x9]\n"
+      "b 29f\n"
+      "27:"  // Height 1: Partial direct writeback: partial_2_0
+      "tbz x12, #1, 28f\n"
+      "str h16, [x9], #0x2\n"
+      "tbz x12, #0, 29f\n"
+      "st1 { v16.b }[2], [x9]\n"
+      "b 29f\n"
+      "28:"  // Height 1: Partial direct writeback: partial_1_0
+      "str b16, [x9, #0x0]\n"
+      "29:"  // Height 1: Partial direct writeback: Done
+      "b 31f\n"
+      "30:"  // Height 1: Full writeback
+      "str q16, [x9, #0x0]\n"
+      "add x9, x9, #0x10\n"
+      "31:"  // Height 1: Writeback done
+      "subs x12, x12, #0x10\n"
+      "bgt 3b\n"
+      "b 126f\n"
+      "32:"  // Height 2
+      "movi v11.4s, #0x0\n"
+      "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x10, %x[col_bias]\n"
+      "movi v12.4s, #0x0\n"
+      "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "bic %x[flags], %x[flags], #0x80000000\n"
+      "movi v13.4s, #0x0\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "movi v14.4s, #0x0\n"
+      "movi v15.16b, #0x1\n"
+      "tbz %x[flags], #2, 33f\n"
+      "ldr x9, [%x[output_ptr], #0x0]\n"
+      "ldr x25, [%x[output_ptr], #0x8]\n"
+      "add x9, x9, x19\n"
+      "add x25, x25, x19\n"
+      "b 34f\n"
+      "33:"  // Height 2: setup direct output
+      "mov x9, %x[output_ptr]\n"
+      "add x25, x9, x19\n"
+      "34:"  // Height 2: Column loop
+      "movi v16.4s, #0x0\n"
+      "movi v17.4s, #0x0\n"
+      "movi v18.4s, #0x0\n"
+      "movi v19.4s, #0x0\n"
+      "movi v20.4s, #0x0\n"
+      "movi v21.4s, #0x0\n"
+      "movi v22.4s, #0x0\n"
+      "movi v23.4s, #0x0\n"
+      "35:"  // Height 2: setup done
+      "mov x28, #0x0\n"
+      "36:"  // Height 2: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w27, [x20, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 37f\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x24, [x20, #0x8]\n"
+      "cbnz x28, 38f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "b 38f\n"
+      "37:"  // Height 2: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "add x24, x26, x19\n"
+      "38:"  // Height 2: input setup done
+      "cmp x27, #0x10\n"
+      "blt 43f\n"
+      "cmp x27, #0x20\n"
+      "blt 41f\n"
+      "39:"  // Height 2: Multiply loop: Main loop head
+      "ldr q0, [x26, #0x0]\n"
+      "ldr q1, [x24, #0x0]\n"
+      "ldr q4, [x11, #0x0]\n"
+      ".inst 0x6f80e090  // udot v16.4s, v4.16b, v0.4b[0]\n"
+      "ldr q5, [x11, #0x10]\n"
+      ".inst 0x6f81e094  // udot v20.4s, v4.16b, v1.4b[0]\n"
+      "ldr q6, [x11, #0x20]\n"
+      "ldr q7, [x11, #0x30]\n"
+      ".inst 0x6f80e0b1  // udot v17.4s, v5.16b, v0.4b[0]\n"
+      "ldr q8, [x11, #0x40]\n"
+      ".inst 0x6f81e0b5  // udot v21.4s, v5.16b, v1.4b[0]\n"
+      "ldr q9, [x11, #0x50]\n"
+      ".inst 0x6f80e0d2  // udot v18.4s, v6.16b, v0.4b[0]\n"
+      "ldr q10, [x11, #0x60]\n"
+      ".inst 0x6f81e0d6  // udot v22.4s, v6.16b, v1.4b[0]\n"
+      "ldr q4, [x11, #0x70]\n"
+      ".inst 0x6f80e0f3  // udot v19.4s, v7.16b, v0.4b[0]\n"
+      "ldr q5, [x11, #0x80]\n"
+      ".inst 0x6f81e0f7  // udot v23.4s, v7.16b, v1.4b[0]\n"
+      "ldr q6, [x11, #0x90]\n"
+      ".inst 0x6fa0e110  // udot v16.4s, v8.16b, v0.4b[1]\n"
+      "ldr q7, [x11, #0xa0]\n"
+      ".inst 0x6fa1e114  // udot v20.4s, v8.16b, v1.4b[1]\n"
+      "ldr q8, [x11, #0xb0]\n"
+      ".inst 0x6fa0e131  // udot v17.4s, v9.16b, v0.4b[1]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x6fa1e135  // udot v21.4s, v9.16b, v1.4b[1]\n"
+      "ldr q9, [x11, #0xc0]\n"
+      ".inst 0x6fa0e152  // udot v18.4s, v10.16b, v0.4b[1]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x6fa1e156  // udot v22.4s, v10.16b, v1.4b[1]\n"
+      "ldr q10, [x11, #0xd0]\n"
+      ".inst 0x6fa0e093  // udot v19.4s, v4.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e097  // udot v23.4s, v4.16b, v1.4b[1]\n"
+      "ldr q4, [x11, #0xe0]\n"
+      ".inst 0x6f80e8b0  // udot v16.4s, v5.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8b4  // udot v20.4s, v5.16b, v1.4b[2]\n"
+      "ldr q5, [x11, #0xf0]\n"
+      ".inst 0x6f80e8d1  // udot v17.4s, v6.16b, v0.4b[2]\n"
+      "add x11, x11, #0x100\n"
+      ".inst 0x6f81e8d5  // udot v21.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x6f80e8f2  // udot v18.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8f6  // udot v22.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x6f80e913  // udot v19.4s, v8.16b, v0.4b[2]\n"
+      ".inst 0x6f81e917  // udot v23.4s, v8.16b, v1.4b[2]\n"
+      ".inst 0x6fa0e930  // udot v16.4s, v9.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e934  // udot v20.4s, v9.16b, v1.4b[3]\n"
+      ".inst 0x6fa0e951  // udot v17.4s, v10.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e955  // udot v21.4s, v10.16b, v1.4b[3]\n"
+      ".inst 0x6fa0e892  // udot v18.4s, v4.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e896  // udot v22.4s, v4.16b, v1.4b[3]\n"
+      ".inst 0x6fa0e8b3  // udot v19.4s, v5.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8b7  // udot v23.4s, v5.16b, v1.4b[3]\n"
+      "tbnz %x[flags], #31, 40f\n"
+      ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
+      ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
+      "40:"  // Height 2: Multiply loop: unique 5: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "sub x27, x27, #0x10\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "cmp x27, #0x20\n"
+      "bge 39b\n"
+      "41:"  // Height 2: Multiply loop: Single iteration only
+      "sub x27, x27, #0x10\n"
+      "ldr q0, [x26, #0x0]\n"
+      "ldr q1, [x24, #0x0]\n"
+      "ldr q6, [x11, #0x0]\n"
+      ".inst 0x6f80e0d0  // udot v16.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x11, #0x10]\n"
+      ".inst 0x6f81e0d4  // udot v20.4s, v6.16b, v1.4b[0]\n"
+      "ldr q8, [x11, #0x20]\n"
+      "ldr q9, [x11, #0x30]\n"
+      ".inst 0x6f80e0f1  // udot v17.4s, v7.16b, v0.4b[0]\n"
+      "ldr q10, [x11, #0x40]\n"
+      ".inst 0x6f81e0f5  // udot v21.4s, v7.16b, v1.4b[0]\n"
+      "ldr q4, [x11, #0x50]\n"
+      ".inst 0x6f80e112  // udot v18.4s, v8.16b, v0.4b[0]\n"
+      "ldr q5, [x11, #0x60]\n"
+      ".inst 0x6f81e116  // udot v22.4s, v8.16b, v1.4b[0]\n"
+      "ldr q6, [x11, #0x70]\n"
+      ".inst 0x6f80e133  // udot v19.4s, v9.16b, v0.4b[0]\n"
+      "ldr q7, [x11, #0x80]\n"
+      ".inst 0x6f81e137  // udot v23.4s, v9.16b, v1.4b[0]\n"
+      "ldr q8, [x11, #0x90]\n"
+      ".inst 0x6fa0e150  // udot v16.4s, v10.16b, v0.4b[1]\n"
+      "ldr q9, [x11, #0xa0]\n"
+      ".inst 0x6fa1e154  // udot v20.4s, v10.16b, v1.4b[1]\n"
+      "ldr q10, [x11, #0xb0]\n"
+      ".inst 0x6fa0e091  // udot v17.4s, v4.16b, v0.4b[1]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x6fa1e095  // udot v21.4s, v4.16b, v1.4b[1]\n"
+      "ldr q4, [x11, #0xc0]\n"
+      ".inst 0x6fa0e0b2  // udot v18.4s, v5.16b, v0.4b[1]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x6fa1e0b6  // udot v22.4s, v5.16b, v1.4b[1]\n"
+      "ldr q5, [x11, #0xd0]\n"
+      ".inst 0x6fa0e0d3  // udot v19.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0d7  // udot v23.4s, v6.16b, v1.4b[1]\n"
+      "ldr q6, [x11, #0xe0]\n"
+      ".inst 0x6f80e8f0  // udot v16.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8f4  // udot v20.4s, v7.16b, v1.4b[2]\n"
+      "ldr q7, [x11, #0xf0]\n"
+      ".inst 0x6f80e911  // udot v17.4s, v8.16b, v0.4b[2]\n"
+      "add x11, x11, #0x100\n"
+      ".inst 0x6f81e915  // udot v21.4s, v8.16b, v1.4b[2]\n"
+      ".inst 0x6f80e932  // udot v18.4s, v9.16b, v0.4b[2]\n"
+      ".inst 0x6f81e936  // udot v22.4s, v9.16b, v1.4b[2]\n"
+      ".inst 0x6f80e953  // udot v19.4s, v10.16b, v0.4b[2]\n"
+      ".inst 0x6f81e957  // udot v23.4s, v10.16b, v1.4b[2]\n"
+      ".inst 0x6fa0e890  // udot v16.4s, v4.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e894  // udot v20.4s, v4.16b, v1.4b[3]\n"
+      ".inst 0x6fa0e8b1  // udot v17.4s, v5.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8b5  // udot v21.4s, v5.16b, v1.4b[3]\n"
+      ".inst 0x6fa0e8d2  // udot v18.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8d6  // udot v22.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x6fa0e8f3  // udot v19.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8f7  // udot v23.4s, v7.16b, v1.4b[3]\n"
+      "tbnz %x[flags], #31, 42f\n"
+      ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
+      ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
+      "42:"  // Height 2: Multiply loop: unique 6: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "43:"  // Height 2: Multiply loop: Main loop skip
+      "cbz x27, 50f\n"
+      "cmp x27, #0x4\n"
+      "blt 46f\n"
+      "44:"  // Height 2: Multiply loop: Odd block loop
+      "ldr s0, [x26], #0x4\n"
+      "ldr s1, [x24], #0x4\n"
+      "tbnz %x[flags], #31, 45f\n"
+      ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
+      ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
+      "45:"  // Height 2: Multiply loop: unique 7: skip row sum
+      "ldr q8, [x11, #0x0]\n"
+      ".inst 0x6f80e110  // udot v16.4s, v8.16b, v0.4b[0]\n"
+      "ldr q9, [x11, #0x10]\n"
+      ".inst 0x6f81e114  // udot v20.4s, v8.16b, v1.4b[0]\n"
+      "ldr q10, [x11, #0x20]\n"
+      "ldr q4, [x11, #0x30]\n"
+      ".inst 0x6f80e131  // udot v17.4s, v9.16b, v0.4b[0]\n"
+      "sub x27, x27, #0x4\n"
+      ".inst 0x6f81e135  // udot v21.4s, v9.16b, v1.4b[0]\n"
+      "cmp x27, #0x4\n"
+      ".inst 0x6f80e152  // udot v18.4s, v10.16b, v0.4b[0]\n"
+      "add x11, x11, #0x40\n"
+      ".inst 0x6f81e156  // udot v22.4s, v10.16b, v1.4b[0]\n"
+      ".inst 0x6f80e093  // udot v19.4s, v4.16b, v0.4b[0]\n"
+      ".inst 0x6f81e097  // udot v23.4s, v4.16b, v1.4b[0]\n"
+      "bge 44b\n"
+      "cbz x27, 50f\n"
+      "46:"  // Height 2: Multiply loop: Skip odd blocks
+      "tbz x27, #1, 47f\n"
+      "ldr h0, [x26], #0x2\n"
+      "ldr h1, [x24], #0x2\n"
+      "tbz x27, #0, 48f\n"
+      "ld1 { v0.b }[2], [x26]\n"
+      "ld1 { v1.b }[2], [x24]\n"
+      "b 48f\n"
+      "47:"  // Height 2: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b0, [x26, #0x0]\n"
+      "ldr b1, [x24, #0x0]\n"
+      "48:"  // Height 2: Multiply loop: Ragged operand read: Done
+      "tbnz %x[flags], #31, 49f\n"
+      ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
+      ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
+      "49:"  // Height 2: Multiply loop: unique 8: skip row sum
+      "ldr q5, [x11, #0x0]\n"
+      ".inst 0x6f80e0b0  // udot v16.4s, v5.16b, v0.4b[0]\n"
+      "ldr q6, [x11, #0x10]\n"
+      ".inst 0x6f81e0b4  // udot v20.4s, v5.16b, v1.4b[0]\n"
+      "ldr q7, [x11, #0x20]\n"
+      "ldr q8, [x11, #0x30]\n"
+      ".inst 0x6f80e0d1  // udot v17.4s, v6.16b, v0.4b[0]\n"
+      "add x11, x11, #0x40\n"
+      ".inst 0x6f81e0d5  // udot v21.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f80e0f2  // udot v18.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0f6  // udot v22.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f80e113  // udot v19.4s, v8.16b, v0.4b[0]\n"
+      ".inst 0x6f81e117  // udot v23.4s, v8.16b, v1.4b[0]\n"
+      "50:"  // Height 2: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x28, x28, #0x1\n"
+      "cmp x28, x19\n"
+      "bne 36b\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "tbnz %x[flags], #31, 51f\n"
+      "addp v11.4s, v11.4s, v11.4s\n"
+      "add x19, %x[qp], %[b_offset]\n"
+      "ld1r { v2.4s }, [x19]\n"
+      "addp v12.4s, v12.4s, v12.4s\n"
+      "addp v11.4s, v11.4s, v11.4s\n"
+      "addp v12.4s, v12.4s, v12.4s\n"
+      "neg v2.4s, v2.4s\n"
+      "mul v11.4s, v11.4s, v2.4s\n"
+      "mul v12.4s, v12.4s, v2.4s\n"
+      "51:"  // Height 2: skip row sum fixup
+      "add v16.4s, v16.4s, v11.4s\n"
+      "orr %x[flags], %x[flags], #0x80000000\n"
+      "add v17.4s, v17.4s, v11.4s\n"
+      "ldr q0, [x10, #0x0]\n"
+      "add v18.4s, v18.4s, v11.4s\n"
+      "ldr q1, [x10, #0x10]\n"
+      "add v19.4s, v19.4s, v11.4s\n"
+      "ldr q2, [x10, #0x20]\n"
+      "add v20.4s, v20.4s, v12.4s\n"
+      "ldr q3, [x10, #0x30]\n"
+      "add v21.4s, v21.4s, v12.4s\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "add v22.4s, v22.4s, v12.4s\n"
+      "add x19, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x19]\n"
+      "add v23.4s, v23.4s, v12.4s\n"
+      "add x10, x10, #0x40\n"
+      "add v16.4s, v16.4s, v0.4s\n"
+      "add v17.4s, v17.4s, v1.4s\n"
+      "add v18.4s, v18.4s, v2.4s\n"
+      "add v19.4s, v19.4s, v3.4s\n"
+      "add v20.4s, v20.4s, v0.4s\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add v21.4s, v21.4s, v1.4s\n"
+      "add v22.4s, v22.4s, v2.4s\n"
+      "add v23.4s, v23.4s, v3.4s\n"
+      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+      "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+      "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+      "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+      "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+      "tbz %x[flags], #5, 52f\n"
+      "and v4.16b, v16.16b, v0.16b\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "and v5.16b, v17.16b, v0.16b\n"
+      "and v6.16b, v18.16b, v0.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "and v7.16b, v19.16b, v0.16b\n"
+      "and v8.16b, v20.16b, v0.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "and v9.16b, v21.16b, v0.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v4.4s\n"
+      "and v10.16b, v22.16b, v0.16b\n"
+      "sshr v8.4s, v8.4s, #0x1f\n"
+      "and v4.16b, v23.16b, v0.16b\n"
+      "sshr v9.4s, v9.4s, #0x1f\n"
+      "sqadd v17.4s, v17.4s, v5.4s\n"
+      "sshr v10.4s, v10.4s, #0x1f\n"
+      "sqadd v18.4s, v18.4s, v6.4s\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sqadd v19.4s, v19.4s, v7.4s\n"
+      "sqadd v20.4s, v20.4s, v8.4s\n"
+      "sqadd v21.4s, v21.4s, v9.4s\n"
+      "sqadd v22.4s, v22.4s, v10.4s\n"
+      "sqadd v23.4s, v23.4s, v4.4s\n"
+      "52:"  // Height 2: no shift correction
+      "srshl v16.4s, v16.4s, v0.4s\n"
+      "add x19, %x[qp], %[c_offset]\n"
+      "ld1r { v4.4s }, [x19]\n"
+      "srshl v17.4s, v17.4s, v0.4s\n"
+      "add x19, %x[qp], %[minval]\n"
+      "srshl v18.4s, v18.4s, v0.4s\n"
+      "ld1r { v5.4s }, [x19]\n"
+      "add x19, %x[qp], %[maxval]\n"
+      "srshl v19.4s, v19.4s, v0.4s\n"
+      "ld1r { v6.4s }, [x19]\n"
+      "cmp x12, #0x10\n"
+      "srshl v20.4s, v20.4s, v0.4s\n"
+      "srshl v21.4s, v21.4s, v0.4s\n"
+      "srshl v22.4s, v22.4s, v0.4s\n"
+      "srshl v23.4s, v23.4s, v0.4s\n"
+      "add v16.4s, v16.4s, v4.4s\n"
+      "add v17.4s, v17.4s, v4.4s\n"
+      "add v18.4s, v18.4s, v4.4s\n"
+      "smin v16.4s, v16.4s, v6.4s\n"
+      "smin v17.4s, v17.4s, v6.4s\n"
+      "smin v18.4s, v18.4s, v6.4s\n"
+      "smax v16.4s, v16.4s, v5.4s\n"
+      "smax v17.4s, v17.4s, v5.4s\n"
+      "smax v18.4s, v18.4s, v5.4s\n"
+      "add v19.4s, v19.4s, v4.4s\n"
+      "add v20.4s, v20.4s, v4.4s\n"
+      "add v21.4s, v21.4s, v4.4s\n"
+      "smin v19.4s, v19.4s, v6.4s\n"
+      "smin v20.4s, v20.4s, v6.4s\n"
+      "smin v21.4s, v21.4s, v6.4s\n"
+      "smax v19.4s, v19.4s, v5.4s\n"
+      "smax v20.4s, v20.4s, v5.4s\n"
+      "smax v21.4s, v21.4s, v5.4s\n"
+      "add v22.4s, v22.4s, v4.4s\n"
+      "add v23.4s, v23.4s, v4.4s\n"
+      "uzp1 v16.8h, v16.8h, v17.8h\n"
+      "smin v22.4s, v22.4s, v6.4s\n"
+      "smin v23.4s, v23.4s, v6.4s\n"
+      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "smax v22.4s, v22.4s, v5.4s\n"
+      "smax v23.4s, v23.4s, v5.4s\n"
+      "uzp1 v20.8h, v20.8h, v21.8h\n"
+      "uzp1 v16.16b, v16.16b, v17.16b\n"
+      "uzp1 v21.8h, v22.8h, v23.8h\n"
+      "uzp1 v20.16b, v20.16b, v21.16b\n"
+      "bge 61f\n"
+      "tbz x12, #3, 56f\n"
+      "str d16, [x9], #0x8\n"
+      "str d20, [x25], #0x8\n"
+      "tbz x12, #2, 54f\n"
+      "st1 { v16.s }[2], [x9], #0x4\n"
+      "st1 { v20.s }[2], [x25], #0x4\n"
+      "tbz x12, #1, 53f\n"
+      "st1 { v16.h }[6], [x9], #0x2\n"
+      "st1 { v20.h }[6], [x25], #0x2\n"
+      "tbz x12, #0, 60f\n"
+      "st1 { v16.b }[14], [x9]\n"
+      "st1 { v20.b }[14], [x25]\n"
+      "b 60f\n"
+      "53:"  // Height 2: Partial direct writeback: partial_1_12
+      "tbz x12, #0, 60f\n"
+      "st1 { v16.b }[12], [x9]\n"
+      "st1 { v20.b }[12], [x25]\n"
+      "b 60f\n"
+      "54:"  // Height 2: Partial direct writeback: partial_2_8
+      "tbz x12, #1, 55f\n"
+      "st1 { v16.h }[4], [x9], #0x2\n"
+      "st1 { v20.h }[4], [x25], #0x2\n"
+      "tbz x12, #0, 60f\n"
+      "st1 { v16.b }[10], [x9]\n"
+      "st1 { v20.b }[10], [x25]\n"
+      "b 60f\n"
+      "55:"  // Height 2: Partial direct writeback: partial_1_8
+      "tbz x12, #0, 60f\n"
+      "st1 { v16.b }[8], [x9]\n"
+      "st1 { v20.b }[8], [x25]\n"
+      "b 60f\n"
+      "56:"  // Height 2: Partial direct writeback: partial_4_0
+      "tbz x12, #2, 58f\n"
+      "str s16, [x9], #0x4\n"
+      "str s20, [x25], #0x4\n"
+      "tbz x12, #1, 57f\n"
+      "st1 { v16.h }[2], [x9], #0x2\n"
+      "st1 { v20.h }[2], [x25], #0x2\n"
+      "tbz x12, #0, 60f\n"
+      "st1 { v16.b }[6], [x9]\n"
+      "st1 { v20.b }[6], [x25]\n"
+      "b 60f\n"
+      "57:"  // Height 2: Partial direct writeback: partial_1_4
+      "tbz x12, #0, 60f\n"
+      "st1 { v16.b }[4], [x9]\n"
+      "st1 { v20.b }[4], [x25]\n"
+      "b 60f\n"
+      "58:"  // Height 2: Partial direct writeback: partial_2_0
+      "tbz x12, #1, 59f\n"
+      "str h16, [x9], #0x2\n"
+      "str h20, [x25], #0x2\n"
+      "tbz x12, #0, 60f\n"
+      "st1 { v16.b }[2], [x9]\n"
+      "st1 { v20.b }[2], [x25]\n"
+      "b 60f\n"
+      "59:"  // Height 2: Partial direct writeback: partial_1_0
+      "str b16, [x9, #0x0]\n"
+      "str b20, [x25, #0x0]\n"
+      "60:"  // Height 2: Partial direct writeback: Done
+      "b 62f\n"
+      "61:"  // Height 2: Full writeback
+      "str q16, [x9, #0x0]\n"
+      "str q20, [x25, #0x0]\n"
+      "add x9, x9, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "62:"  // Height 2: Writeback done
+      "subs x12, x12, #0x10\n"
+      "bgt 34b\n"
+      "b 126f\n"
+      "63:"  // Height 3
+      "movi v11.4s, #0x0\n"
+      "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x10, %x[col_bias]\n"
+      "movi v12.4s, #0x0\n"
+      "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "bic %x[flags], %x[flags], #0x80000000\n"
+      "movi v13.4s, #0x0\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "movi v14.4s, #0x0\n"
+      "movi v15.16b, #0x1\n"
+      "tbz %x[flags], #2, 64f\n"
+      "ldr x9, [%x[output_ptr], #0x0]\n"
+      "ldr x25, [%x[output_ptr], #0x8]\n"
+      "add x9, x9, x19\n"
+      "ldr x23, [%x[output_ptr], #0x10]\n"
+      "add x25, x25, x19\n"
+      "add x23, x23, x19\n"
+      "b 65f\n"
+      "64:"  // Height 3: setup direct output
+      "mov x9, %x[output_ptr]\n"
+      "add x25, x9, x19\n"
+      "add x23, x25, x19\n"
+      "65:"  // Height 3: Column loop
+      "movi v16.4s, #0x0\n"
+      "movi v17.4s, #0x0\n"
+      "movi v18.4s, #0x0\n"
+      "movi v19.4s, #0x0\n"
+      "movi v20.4s, #0x0\n"
+      "movi v21.4s, #0x0\n"
+      "movi v22.4s, #0x0\n"
+      "movi v23.4s, #0x0\n"
+      "movi v24.4s, #0x0\n"
+      "movi v25.4s, #0x0\n"
+      "movi v26.4s, #0x0\n"
+      "movi v27.4s, #0x0\n"
+      "66:"  // Height 3: setup done
+      "mov x28, #0x0\n"
+      "67:"  // Height 3: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w27, [x20, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 68f\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x24, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
+      "cbnz x28, 69f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "add x22, x22, x19\n"
+      "b 69f\n"
+      "68:"  // Height 3: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "add x24, x26, x19\n"
+      "add x22, x24, x19\n"
+      "69:"  // Height 3: input setup done
+      "cmp x27, #0x10\n"
+      "blt 74f\n"
+      "cmp x27, #0x20\n"
+      "blt 72f\n"
+      "70:"  // Height 3: Multiply loop: Main loop head
+      "ldr q0, [x26, #0x0]\n"
+      "ldr q1, [x24, #0x0]\n"
+      "ldr q2, [x22, #0x0]\n"
+      "ldr q4, [x11, #0x0]\n"
+      ".inst 0x6f80e090  // udot v16.4s, v4.16b, v0.4b[0]\n"
+      "ldr q5, [x11, #0x10]\n"
+      ".inst 0x6f81e094  // udot v20.4s, v4.16b, v1.4b[0]\n"
+      "ldr q6, [x11, #0x20]\n"
+      ".inst 0x6f82e098  // udot v24.4s, v4.16b, v2.4b[0]\n"
+      "ldr q7, [x11, #0x30]\n"
+      "ldr q8, [x11, #0x40]\n"
+      ".inst 0x6f80e0b1  // udot v17.4s, v5.16b, v0.4b[0]\n"
+      "ldr q9, [x11, #0x50]\n"
+      ".inst 0x6f81e0b5  // udot v21.4s, v5.16b, v1.4b[0]\n"
+      "ldr q10, [x11, #0x60]\n"
+      ".inst 0x6f82e0b9  // udot v25.4s, v5.16b, v2.4b[0]\n"
+      "ldr q4, [x11, #0x70]\n"
+      ".inst 0x6f80e0d2  // udot v18.4s, v6.16b, v0.4b[0]\n"
+      "ldr q5, [x11, #0x80]\n"
+      ".inst 0x6f81e0d6  // udot v22.4s, v6.16b, v1.4b[0]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x6f82e0da  // udot v26.4s, v6.16b, v2.4b[0]\n"
+      "ldr q6, [x11, #0x90]\n"
+      ".inst 0x6f80e0f3  // udot v19.4s, v7.16b, v0.4b[0]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x6f81e0f7  // udot v23.4s, v7.16b, v1.4b[0]\n"
+      "add x22, x22, #0x10\n"
+      ".inst 0x6f82e0fb  // udot v27.4s, v7.16b, v2.4b[0]\n"
+      "ldr q7, [x11, #0xa0]\n"
+      ".inst 0x6fa0e110  // udot v16.4s, v8.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e114  // udot v20.4s, v8.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e118  // udot v24.4s, v8.16b, v2.4b[1]\n"
+      "ldr q8, [x11, #0xb0]\n"
+      ".inst 0x6fa0e131  // udot v17.4s, v9.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e135  // udot v21.4s, v9.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e139  // udot v25.4s, v9.16b, v2.4b[1]\n"
+      "ldr q9, [x11, #0xc0]\n"
+      ".inst 0x6fa0e152  // udot v18.4s, v10.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e156  // udot v22.4s, v10.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e15a  // udot v26.4s, v10.16b, v2.4b[1]\n"
+      "ldr q10, [x11, #0xd0]\n"
+      ".inst 0x6fa0e093  // udot v19.4s, v4.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e097  // udot v23.4s, v4.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e09b  // udot v27.4s, v4.16b, v2.4b[1]\n"
+      "ldr q4, [x11, #0xe0]\n"
+      ".inst 0x6f80e8b0  // udot v16.4s, v5.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8b4  // udot v20.4s, v5.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8b8  // udot v24.4s, v5.16b, v2.4b[2]\n"
+      "ldr q5, [x11, #0xf0]\n"
+      ".inst 0x6f80e8d1  // udot v17.4s, v6.16b, v0.4b[2]\n"
+      "add x11, x11, #0x100\n"
+      ".inst 0x6f81e8d5  // udot v21.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8d9  // udot v25.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x6f80e8f2  // udot v18.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8f6  // udot v22.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8fa  // udot v26.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x6f80e913  // udot v19.4s, v8.16b, v0.4b[2]\n"
+      ".inst 0x6f81e917  // udot v23.4s, v8.16b, v1.4b[2]\n"
+      ".inst 0x6f82e91b  // udot v27.4s, v8.16b, v2.4b[2]\n"
+      ".inst 0x6fa0e930  // udot v16.4s, v9.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e934  // udot v20.4s, v9.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e938  // udot v24.4s, v9.16b, v2.4b[3]\n"
+      ".inst 0x6fa0e951  // udot v17.4s, v10.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e955  // udot v21.4s, v10.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e959  // udot v25.4s, v10.16b, v2.4b[3]\n"
+      ".inst 0x6fa0e892  // udot v18.4s, v4.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e896  // udot v22.4s, v4.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e89a  // udot v26.4s, v4.16b, v2.4b[3]\n"
+      ".inst 0x6fa0e8b3  // udot v19.4s, v5.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8b7  // udot v23.4s, v5.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8bb  // udot v27.4s, v5.16b, v2.4b[3]\n"
+      "tbnz %x[flags], #31, 71f\n"
+      ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
+      ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
+      ".inst 0x6e8f944d  // udot v13.4s, v2.16b, v15.16b\n"
+      "71:"  // Height 3: Multiply loop: unique 9: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "sub x27, x27, #0x10\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "cmp x27, #0x20\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "bge 70b\n"
+      "72:"  // Height 3: Multiply loop: Single iteration only
+      "sub x27, x27, #0x10\n"
+      "ldr q0, [x26, #0x0]\n"
+      "ldr q1, [x24, #0x0]\n"
+      "ldr q2, [x22, #0x0]\n"
+      "ldr q6, [x11, #0x0]\n"
+      ".inst 0x6f80e0d0  // udot v16.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x11, #0x10]\n"
+      ".inst 0x6f81e0d4  // udot v20.4s, v6.16b, v1.4b[0]\n"
+      "ldr q8, [x11, #0x20]\n"
+      ".inst 0x6f82e0d8  // udot v24.4s, v6.16b, v2.4b[0]\n"
+      "ldr q9, [x11, #0x30]\n"
+      "ldr q10, [x11, #0x40]\n"
+      ".inst 0x6f80e0f1  // udot v17.4s, v7.16b, v0.4b[0]\n"
+      "ldr q4, [x11, #0x50]\n"
+      ".inst 0x6f81e0f5  // udot v21.4s, v7.16b, v1.4b[0]\n"
+      "ldr q5, [x11, #0x60]\n"
+      ".inst 0x6f82e0f9  // udot v25.4s, v7.16b, v2.4b[0]\n"
+      "ldr q6, [x11, #0x70]\n"
+      ".inst 0x6f80e112  // udot v18.4s, v8.16b, v0.4b[0]\n"
+      "ldr q7, [x11, #0x80]\n"
+      ".inst 0x6f81e116  // udot v22.4s, v8.16b, v1.4b[0]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x6f82e11a  // udot v26.4s, v8.16b, v2.4b[0]\n"
+      "ldr q8, [x11, #0x90]\n"
+      ".inst 0x6f80e133  // udot v19.4s, v9.16b, v0.4b[0]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x6f81e137  // udot v23.4s, v9.16b, v1.4b[0]\n"
+      "add x22, x22, #0x10\n"
+      ".inst 0x6f82e13b  // udot v27.4s, v9.16b, v2.4b[0]\n"
+      "ldr q9, [x11, #0xa0]\n"
+      ".inst 0x6fa0e150  // udot v16.4s, v10.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e154  // udot v20.4s, v10.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e158  // udot v24.4s, v10.16b, v2.4b[1]\n"
+      "ldr q10, [x11, #0xb0]\n"
+      ".inst 0x6fa0e091  // udot v17.4s, v4.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e095  // udot v21.4s, v4.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e099  // udot v25.4s, v4.16b, v2.4b[1]\n"
+      "ldr q4, [x11, #0xc0]\n"
+      ".inst 0x6fa0e0b2  // udot v18.4s, v5.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0b6  // udot v22.4s, v5.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0ba  // udot v26.4s, v5.16b, v2.4b[1]\n"
+      "ldr q5, [x11, #0xd0]\n"
+      ".inst 0x6fa0e0d3  // udot v19.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0d7  // udot v23.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0db  // udot v27.4s, v6.16b, v2.4b[1]\n"
+      "ldr q6, [x11, #0xe0]\n"
+      ".inst 0x6f80e8f0  // udot v16.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8f4  // udot v20.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8f8  // udot v24.4s, v7.16b, v2.4b[2]\n"
+      "ldr q7, [x11, #0xf0]\n"
+      ".inst 0x6f80e911  // udot v17.4s, v8.16b, v0.4b[2]\n"
+      "add x11, x11, #0x100\n"
+      ".inst 0x6f81e915  // udot v21.4s, v8.16b, v1.4b[2]\n"
+      ".inst 0x6f82e919  // udot v25.4s, v8.16b, v2.4b[2]\n"
+      ".inst 0x6f80e932  // udot v18.4s, v9.16b, v0.4b[2]\n"
+      ".inst 0x6f81e936  // udot v22.4s, v9.16b, v1.4b[2]\n"
+      ".inst 0x6f82e93a  // udot v26.4s, v9.16b, v2.4b[2]\n"
+      ".inst 0x6f80e953  // udot v19.4s, v10.16b, v0.4b[2]\n"
+      ".inst 0x6f81e957  // udot v23.4s, v10.16b, v1.4b[2]\n"
+      ".inst 0x6f82e95b  // udot v27.4s, v10.16b, v2.4b[2]\n"
+      ".inst 0x6fa0e890  // udot v16.4s, v4.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e894  // udot v20.4s, v4.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e898  // udot v24.4s, v4.16b, v2.4b[3]\n"
+      ".inst 0x6fa0e8b1  // udot v17.4s, v5.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8b5  // udot v21.4s, v5.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8b9  // udot v25.4s, v5.16b, v2.4b[3]\n"
+      ".inst 0x6fa0e8d2  // udot v18.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8d6  // udot v22.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8da  // udot v26.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x6fa0e8f3  // udot v19.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8f7  // udot v23.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8fb  // udot v27.4s, v7.16b, v2.4b[3]\n"
+      "tbnz %x[flags], #31, 73f\n"
+      ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
+      ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
+      ".inst 0x6e8f944d  // udot v13.4s, v2.16b, v15.16b\n"
+      "73:"  // Height 3: Multiply loop: unique 10: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "74:"  // Height 3: Multiply loop: Main loop skip
+      "cbz x27, 81f\n"
+      "cmp x27, #0x4\n"
+      "blt 77f\n"
+      "75:"  // Height 3: Multiply loop: Odd block loop
+      "ldr s0, [x26], #0x4\n"
+      "ldr s1, [x24], #0x4\n"
+      "ldr s2, [x22], #0x4\n"
+      "tbnz %x[flags], #31, 76f\n"
+      ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
+      ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
+      ".inst 0x6e8f944d  // udot v13.4s, v2.16b, v15.16b\n"
+      "76:"  // Height 3: Multiply loop: unique 11: skip row sum
+      "ldr q8, [x11, #0x0]\n"
+      ".inst 0x6f80e110  // udot v16.4s, v8.16b, v0.4b[0]\n"
+      "ldr q9, [x11, #0x10]\n"
+      ".inst 0x6f81e114  // udot v20.4s, v8.16b, v1.4b[0]\n"
+      "ldr q10, [x11, #0x20]\n"
+      ".inst 0x6f82e118  // udot v24.4s, v8.16b, v2.4b[0]\n"
+      "ldr q4, [x11, #0x30]\n"
+      "sub x27, x27, #0x4\n"
+      ".inst 0x6f80e131  // udot v17.4s, v9.16b, v0.4b[0]\n"
+      "cmp x27, #0x4\n"
+      ".inst 0x6f81e135  // udot v21.4s, v9.16b, v1.4b[0]\n"
+      "add x11, x11, #0x40\n"
+      ".inst 0x6f82e139  // udot v25.4s, v9.16b, v2.4b[0]\n"
+      ".inst 0x6f80e152  // udot v18.4s, v10.16b, v0.4b[0]\n"
+      ".inst 0x6f81e156  // udot v22.4s, v10.16b, v1.4b[0]\n"
+      ".inst 0x6f82e15a  // udot v26.4s, v10.16b, v2.4b[0]\n"
+      ".inst 0x6f80e093  // udot v19.4s, v4.16b, v0.4b[0]\n"
+      ".inst 0x6f81e097  // udot v23.4s, v4.16b, v1.4b[0]\n"
+      ".inst 0x6f82e09b  // udot v27.4s, v4.16b, v2.4b[0]\n"
+      "bge 75b\n"
+      "cbz x27, 81f\n"
+      "77:"  // Height 3: Multiply loop: Skip odd blocks
+      "tbz x27, #1, 78f\n"
+      "ldr h0, [x26], #0x2\n"
+      "ldr h1, [x24], #0x2\n"
+      "ldr h2, [x22], #0x2\n"
+      "tbz x27, #0, 79f\n"
+      "ld1 { v0.b }[2], [x26]\n"
+      "ld1 { v1.b }[2], [x24]\n"
+      "ld1 { v2.b }[2], [x22]\n"
+      "b 79f\n"
+      "78:"  // Height 3: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b0, [x26, #0x0]\n"
+      "ldr b1, [x24, #0x0]\n"
+      "ldr b2, [x22, #0x0]\n"
+      "79:"  // Height 3: Multiply loop: Ragged operand read: Done
+      "tbnz %x[flags], #31, 80f\n"
+      ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
+      ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
+      ".inst 0x6e8f944d  // udot v13.4s, v2.16b, v15.16b\n"
+      "80:"  // Height 3: Multiply loop: unique 12: skip row sum
+      "ldr q5, [x11, #0x0]\n"
+      ".inst 0x6f80e0b0  // udot v16.4s, v5.16b, v0.4b[0]\n"
+      "ldr q6, [x11, #0x10]\n"
+      ".inst 0x6f81e0b4  // udot v20.4s, v5.16b, v1.4b[0]\n"
+      "ldr q7, [x11, #0x20]\n"
+      ".inst 0x6f82e0b8  // udot v24.4s, v5.16b, v2.4b[0]\n"
+      "ldr q8, [x11, #0x30]\n"
+      "add x11, x11, #0x40\n"
+      ".inst 0x6f80e0d1  // udot v17.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0d5  // udot v21.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0d9  // udot v25.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x6f80e0f2  // udot v18.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0f6  // udot v22.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0fa  // udot v26.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f80e113  // udot v19.4s, v8.16b, v0.4b[0]\n"
+      ".inst 0x6f81e117  // udot v23.4s, v8.16b, v1.4b[0]\n"
+      ".inst 0x6f82e11b  // udot v27.4s, v8.16b, v2.4b[0]\n"
+      "81:"  // Height 3: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x28, x28, #0x1\n"
+      "cmp x28, x19\n"
+      "bne 67b\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "tbnz %x[flags], #31, 82f\n"
+      "addp v11.4s, v11.4s, v11.4s\n"
+      "add x19, %x[qp], %[b_offset]\n"
+      "ld1r { v3.4s }, [x19]\n"
+      "addp v12.4s, v12.4s, v12.4s\n"
+      "addp v13.4s, v13.4s, v13.4s\n"
+      "addp v11.4s, v11.4s, v11.4s\n"
+      "addp v12.4s, v12.4s, v12.4s\n"
+      "addp v13.4s, v13.4s, v13.4s\n"
+      "neg v3.4s, v3.4s\n"
+      "mul v11.4s, v11.4s, v3.4s\n"
+      "mul v12.4s, v12.4s, v3.4s\n"
+      "mul v13.4s, v13.4s, v3.4s\n"
+      "82:"  // Height 3: skip row sum fixup
+      "add v16.4s, v16.4s, v11.4s\n"
+      "orr %x[flags], %x[flags], #0x80000000\n"
+      "add v17.4s, v17.4s, v11.4s\n"
+      "ldr q0, [x10, #0x0]\n"
+      "add v18.4s, v18.4s, v11.4s\n"
+      "ldr q1, [x10, #0x10]\n"
+      "add v19.4s, v19.4s, v11.4s\n"
+      "ldr q2, [x10, #0x20]\n"
+      "add v20.4s, v20.4s, v12.4s\n"
+      "ldr q3, [x10, #0x30]\n"
+      "add v21.4s, v21.4s, v12.4s\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "add v22.4s, v22.4s, v12.4s\n"
+      "add x19, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x19]\n"
+      "add v23.4s, v23.4s, v12.4s\n"
+      "add x10, x10, #0x40\n"
+      "add v24.4s, v24.4s, v13.4s\n"
+      "add v25.4s, v25.4s, v13.4s\n"
+      "add v26.4s, v26.4s, v13.4s\n"
+      "add v27.4s, v27.4s, v13.4s\n"
+      "add v16.4s, v16.4s, v0.4s\n"
+      "add v17.4s, v17.4s, v1.4s\n"
+      "add v18.4s, v18.4s, v2.4s\n"
+      "add v19.4s, v19.4s, v3.4s\n"
+      "add v20.4s, v20.4s, v0.4s\n"
+      "add v21.4s, v21.4s, v1.4s\n"
+      "add v22.4s, v22.4s, v2.4s\n"
+      "add v23.4s, v23.4s, v3.4s\n"
+      "add v24.4s, v24.4s, v0.4s\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add v25.4s, v25.4s, v1.4s\n"
+      "add v26.4s, v26.4s, v2.4s\n"
+      "add v27.4s, v27.4s, v3.4s\n"
+      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+      "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+      "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+      "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+      "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+      "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+      "sqrdmulh v25.4s, v25.4s, v4.4s\n"
+      "sqrdmulh v26.4s, v26.4s, v4.4s\n"
+      "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+      "tbz %x[flags], #5, 83f\n"
+      "and v4.16b, v16.16b, v0.16b\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "and v5.16b, v17.16b, v0.16b\n"
+      "and v6.16b, v18.16b, v0.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "and v7.16b, v19.16b, v0.16b\n"
+      "and v8.16b, v20.16b, v0.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "and v9.16b, v21.16b, v0.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v4.4s\n"
+      "and v10.16b, v22.16b, v0.16b\n"
+      "sshr v8.4s, v8.4s, #0x1f\n"
+      "and v4.16b, v23.16b, v0.16b\n"
+      "sshr v9.4s, v9.4s, #0x1f\n"
+      "sqadd v17.4s, v17.4s, v5.4s\n"
+      "sshr v10.4s, v10.4s, #0x1f\n"
+      "sqadd v18.4s, v18.4s, v6.4s\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "and v5.16b, v24.16b, v0.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sqadd v19.4s, v19.4s, v7.4s\n"
+      "sqadd v20.4s, v20.4s, v8.4s\n"
+      "sqadd v21.4s, v21.4s, v9.4s\n"
+      "sqadd v22.4s, v22.4s, v10.4s\n"
+      "sqadd v23.4s, v23.4s, v4.4s\n"
+      "and v6.16b, v25.16b, v0.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v24.4s, v24.4s, v5.4s\n"
+      "and v7.16b, v26.16b, v0.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "and v8.16b, v27.16b, v0.16b\n"
+      "sqadd v25.4s, v25.4s, v6.4s\n"
+      "sshr v8.4s, v8.4s, #0x1f\n"
+      "sqadd v26.4s, v26.4s, v7.4s\n"
+      "sqadd v27.4s, v27.4s, v8.4s\n"
+      "83:"  // Height 3: no shift correction
+      "srshl v16.4s, v16.4s, v0.4s\n"
+      "add x19, %x[qp], %[c_offset]\n"
+      "ld1r { v4.4s }, [x19]\n"
+      "srshl v17.4s, v17.4s, v0.4s\n"
+      "add x19, %x[qp], %[minval]\n"
+      "srshl v18.4s, v18.4s, v0.4s\n"
+      "ld1r { v5.4s }, [x19]\n"
+      "add x19, %x[qp], %[maxval]\n"
+      "srshl v19.4s, v19.4s, v0.4s\n"
+      "ld1r { v6.4s }, [x19]\n"
+      "cmp x12, #0x10\n"
+      "srshl v20.4s, v20.4s, v0.4s\n"
+      "srshl v21.4s, v21.4s, v0.4s\n"
+      "srshl v22.4s, v22.4s, v0.4s\n"
+      "srshl v23.4s, v23.4s, v0.4s\n"
+      "add v16.4s, v16.4s, v4.4s\n"
+      "add v17.4s, v17.4s, v4.4s\n"
+      "add v18.4s, v18.4s, v4.4s\n"
+      "smin v16.4s, v16.4s, v6.4s\n"
+      "smin v17.4s, v17.4s, v6.4s\n"
+      "smin v18.4s, v18.4s, v6.4s\n"
+      "smax v16.4s, v16.4s, v5.4s\n"
+      "smax v17.4s, v17.4s, v5.4s\n"
+      "smax v18.4s, v18.4s, v5.4s\n"
+      "add v19.4s, v19.4s, v4.4s\n"
+      "add v20.4s, v20.4s, v4.4s\n"
+      "add v21.4s, v21.4s, v4.4s\n"
+      "smin v19.4s, v19.4s, v6.4s\n"
+      "smin v20.4s, v20.4s, v6.4s\n"
+      "smin v21.4s, v21.4s, v6.4s\n"
+      "smax v19.4s, v19.4s, v5.4s\n"
+      "smax v20.4s, v20.4s, v5.4s\n"
+      "smax v21.4s, v21.4s, v5.4s\n"
+      "add v22.4s, v22.4s, v4.4s\n"
+      "add v23.4s, v23.4s, v4.4s\n"
+      "srshl v24.4s, v24.4s, v0.4s\n"
+      "smin v22.4s, v22.4s, v6.4s\n"
+      "smin v23.4s, v23.4s, v6.4s\n"
+      "srshl v25.4s, v25.4s, v0.4s\n"
+      "smax v22.4s, v22.4s, v5.4s\n"
+      "smax v23.4s, v23.4s, v5.4s\n"
+      "add v24.4s, v24.4s, v4.4s\n"
+      "add v25.4s, v25.4s, v4.4s\n"
+      "srshl v26.4s, v26.4s, v0.4s\n"
+      "smin v24.4s, v24.4s, v6.4s\n"
+      "smin v25.4s, v25.4s, v6.4s\n"
+      "srshl v27.4s, v27.4s, v0.4s\n"
+      "smax v24.4s, v24.4s, v5.4s\n"
+      "smax v25.4s, v25.4s, v5.4s\n"
+      "add v26.4s, v26.4s, v4.4s\n"
+      "add v27.4s, v27.4s, v4.4s\n"
+      "uzp1 v16.8h, v16.8h, v17.8h\n"
+      "smin v26.4s, v26.4s, v6.4s\n"
+      "smin v27.4s, v27.4s, v6.4s\n"
+      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "smax v26.4s, v26.4s, v5.4s\n"
+      "smax v27.4s, v27.4s, v5.4s\n"
+      "uzp1 v20.8h, v20.8h, v21.8h\n"
+      "uzp1 v21.8h, v22.8h, v23.8h\n"
+      "uzp1 v24.8h, v24.8h, v25.8h\n"
+      "uzp1 v25.8h, v26.8h, v27.8h\n"
+      "uzp1 v16.16b, v16.16b, v17.16b\n"
+      "uzp1 v20.16b, v20.16b, v21.16b\n"
+      "uzp1 v24.16b, v24.16b, v25.16b\n"
+      "bge 92f\n"
+      "tbz x12, #3, 87f\n"
+      "str d16, [x9], #0x8\n"
+      "str d20, [x25], #0x8\n"
+      "str d24, [x23], #0x8\n"
+      "tbz x12, #2, 85f\n"
+      "st1 { v16.s }[2], [x9], #0x4\n"
+      "st1 { v20.s }[2], [x25], #0x4\n"
+      "st1 { v24.s }[2], [x23], #0x4\n"
+      "tbz x12, #1, 84f\n"
+      "st1 { v16.h }[6], [x9], #0x2\n"
+      "st1 { v20.h }[6], [x25], #0x2\n"
+      "st1 { v24.h }[6], [x23], #0x2\n"
+      "tbz x12, #0, 91f\n"
+      "st1 { v16.b }[14], [x9]\n"
+      "st1 { v20.b }[14], [x25]\n"
+      "st1 { v24.b }[14], [x23]\n"
+      "b 91f\n"
+      "84:"  // Height 3: Partial direct writeback: partial_1_12
+      "tbz x12, #0, 91f\n"
+      "st1 { v16.b }[12], [x9]\n"
+      "st1 { v20.b }[12], [x25]\n"
+      "st1 { v24.b }[12], [x23]\n"
+      "b 91f\n"
+      "85:"  // Height 3: Partial direct writeback: partial_2_8
+      "tbz x12, #1, 86f\n"
+      "st1 { v16.h }[4], [x9], #0x2\n"
+      "st1 { v20.h }[4], [x25], #0x2\n"
+      "st1 { v24.h }[4], [x23], #0x2\n"
+      "tbz x12, #0, 91f\n"
+      "st1 { v16.b }[10], [x9]\n"
+      "st1 { v20.b }[10], [x25]\n"
+      "st1 { v24.b }[10], [x23]\n"
+      "b 91f\n"
+      "86:"  // Height 3: Partial direct writeback: partial_1_8
+      "tbz x12, #0, 91f\n"
+      "st1 { v16.b }[8], [x9]\n"
+      "st1 { v20.b }[8], [x25]\n"
+      "st1 { v24.b }[8], [x23]\n"
+      "b 91f\n"
+      "87:"  // Height 3: Partial direct writeback: partial_4_0
+      "tbz x12, #2, 89f\n"
+      "str s16, [x9], #0x4\n"
+      "str s20, [x25], #0x4\n"
+      "str s24, [x23], #0x4\n"
+      "tbz x12, #1, 88f\n"
+      "st1 { v16.h }[2], [x9], #0x2\n"
+      "st1 { v20.h }[2], [x25], #0x2\n"
+      "st1 { v24.h }[2], [x23], #0x2\n"
+      "tbz x12, #0, 91f\n"
+      "st1 { v16.b }[6], [x9]\n"
+      "st1 { v20.b }[6], [x25]\n"
+      "st1 { v24.b }[6], [x23]\n"
+      "b 91f\n"
+      "88:"  // Height 3: Partial direct writeback: partial_1_4
+      "tbz x12, #0, 91f\n"
+      "st1 { v16.b }[4], [x9]\n"
+      "st1 { v20.b }[4], [x25]\n"
+      "st1 { v24.b }[4], [x23]\n"
+      "b 91f\n"
+      "89:"  // Height 3: Partial direct writeback: partial_2_0
+      "tbz x12, #1, 90f\n"
+      "str h16, [x9], #0x2\n"
+      "str h20, [x25], #0x2\n"
+      "str h24, [x23], #0x2\n"
+      "tbz x12, #0, 91f\n"
+      "st1 { v16.b }[2], [x9]\n"
+      "st1 { v20.b }[2], [x25]\n"
+      "st1 { v24.b }[2], [x23]\n"
+      "b 91f\n"
+      "90:"  // Height 3: Partial direct writeback: partial_1_0
+      "str b16, [x9, #0x0]\n"
+      "str b20, [x25, #0x0]\n"
+      "str b24, [x23, #0x0]\n"
+      "91:"  // Height 3: Partial direct writeback: Done
+      "b 93f\n"
+      "92:"  // Height 3: Full writeback
+      "str q16, [x9, #0x0]\n"
+      "str q20, [x25, #0x0]\n"
+      "str q24, [x23, #0x0]\n"
+      "add x9, x9, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "add x23, x23, #0x10\n"
+      "93:"  // Height 3: Writeback done
+      "subs x12, x12, #0x10\n"
+      "bgt 65b\n"
+      "b 126f\n"
+      "94:"  // Height 4
+      "movi v11.4s, #0x0\n"
+      "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x10, %x[col_bias]\n"
+      "movi v12.4s, #0x0\n"
+      "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "bic %x[flags], %x[flags], #0x80000000\n"
+      "movi v13.4s, #0x0\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "movi v14.4s, #0x0\n"
+      "movi v15.16b, #0x1\n"
+      "tbz %x[flags], #2, 95f\n"
+      "ldr x9, [%x[output_ptr], #0x0]\n"
+      "ldr x25, [%x[output_ptr], #0x8]\n"
+      "add x9, x9, x19\n"
+      "ldr x23, [%x[output_ptr], #0x10]\n"
+      "ldr x21, [%x[output_ptr], #0x18]\n"
+      "add x25, x25, x19\n"
+      "add %x[output_ptr], %x[output_ptr], #0x20\n"
+      "add x23, x23, x19\n"
+      "add x21, x21, x19\n"
+      "b 96f\n"
+      "95:"  // Height 4: setup direct output
+      "mov x9, %x[output_ptr]\n"
+      "add x25, x9, x19\n"
+      "add x23, x25, x19\n"
+      "add x21, x23, x19\n"
+      "add %x[output_ptr], x21, x19\n"
+      "96:"  // Height 4: Column loop
+      "movi v16.4s, #0x0\n"
+      "movi v17.4s, #0x0\n"
+      "movi v18.4s, #0x0\n"
+      "movi v19.4s, #0x0\n"
+      "movi v20.4s, #0x0\n"
+      "movi v21.4s, #0x0\n"
+      "movi v22.4s, #0x0\n"
+      "movi v23.4s, #0x0\n"
+      "movi v24.4s, #0x0\n"
+      "movi v25.4s, #0x0\n"
+      "movi v26.4s, #0x0\n"
+      "movi v27.4s, #0x0\n"
+      "movi v28.4s, #0x0\n"
+      "movi v29.4s, #0x0\n"
+      "movi v30.4s, #0x0\n"
+      "movi v31.4s, #0x0\n"
+      "97:"  // Height 4: setup done
+      "mov x28, #0x0\n"
+      "98:"  // Height 4: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w27, [x20, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 99f\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x24, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
+      "ldr x20, [x20, #0x18]\n"
+      "cbnz x28, 100f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "add x22, x22, x19\n"
+      "add x20, x20, x19\n"
+      "b 100f\n"
+      "99:"  // Height 4: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "add x24, x26, x19\n"
+      "add x22, x24, x19\n"
+      "add x20, x22, x19\n"
+      "100:"  // Height 4: input setup done
+      "cmp x27, #0x10\n"
+      "blt 105f\n"
+      "cmp x27, #0x20\n"
+      "blt 103f\n"
+      "101:"  // Height 4: Multiply loop: Main loop head
+      "ldr q0, [x26, #0x0]\n"
+      "ldr q1, [x24, #0x0]\n"
+      "ldr q2, [x22, #0x0]\n"
+      "ldr q3, [x20, #0x0]\n"
+      "ldr q4, [x11, #0x0]\n"
+      ".inst 0x6f80e090  // udot v16.4s, v4.16b, v0.4b[0]\n"
+      "ldr q5, [x11, #0x10]\n"
+      ".inst 0x6f81e094  // udot v20.4s, v4.16b, v1.4b[0]\n"
+      "ldr q6, [x11, #0x20]\n"
+      ".inst 0x6f82e098  // udot v24.4s, v4.16b, v2.4b[0]\n"
+      "ldr q7, [x11, #0x30]\n"
+      ".inst 0x6f83e09c  // udot v28.4s, v4.16b, v3.4b[0]\n"
+      "ldr q8, [x11, #0x40]\n"
+      "ldr q9, [x11, #0x50]\n"
+      ".inst 0x6f80e0b1  // udot v17.4s, v5.16b, v0.4b[0]\n"
+      "ldr q10, [x11, #0x60]\n"
+      ".inst 0x6f81e0b5  // udot v21.4s, v5.16b, v1.4b[0]\n"
+      "ldr q4, [x11, #0x70]\n"
+      ".inst 0x6f82e0b9  // udot v25.4s, v5.16b, v2.4b[0]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x6f83e0bd  // udot v29.4s, v5.16b, v3.4b[0]\n"
+      "ldr q5, [x11, #0x80]\n"
+      ".inst 0x6f80e0d2  // udot v18.4s, v6.16b, v0.4b[0]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x6f81e0d6  // udot v22.4s, v6.16b, v1.4b[0]\n"
+      "add x22, x22, #0x10\n"
+      ".inst 0x6f82e0da  // udot v26.4s, v6.16b, v2.4b[0]\n"
+      "add x20, x20, #0x10\n"
+      ".inst 0x6f83e0de  // udot v30.4s, v6.16b, v3.4b[0]\n"
+      "ldr q6, [x11, #0x90]\n"
+      ".inst 0x6f80e0f3  // udot v19.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0f7  // udot v23.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0fb  // udot v27.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0ff  // udot v31.4s, v7.16b, v3.4b[0]\n"
+      "ldr q7, [x11, #0xa0]\n"
+      ".inst 0x6fa0e110  // udot v16.4s, v8.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e114  // udot v20.4s, v8.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e118  // udot v24.4s, v8.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e11c  // udot v28.4s, v8.16b, v3.4b[1]\n"
+      "ldr q8, [x11, #0xb0]\n"
+      ".inst 0x6fa0e131  // udot v17.4s, v9.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e135  // udot v21.4s, v9.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e139  // udot v25.4s, v9.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e13d  // udot v29.4s, v9.16b, v3.4b[1]\n"
+      "ldr q9, [x11, #0xc0]\n"
+      ".inst 0x6fa0e152  // udot v18.4s, v10.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e156  // udot v22.4s, v10.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e15a  // udot v26.4s, v10.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e15e  // udot v30.4s, v10.16b, v3.4b[1]\n"
+      "ldr q10, [x11, #0xd0]\n"
+      ".inst 0x6fa0e093  // udot v19.4s, v4.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e097  // udot v23.4s, v4.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e09b  // udot v27.4s, v4.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e09f  // udot v31.4s, v4.16b, v3.4b[1]\n"
+      "ldr q4, [x11, #0xe0]\n"
+      ".inst 0x6f80e8b0  // udot v16.4s, v5.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8b4  // udot v20.4s, v5.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8b8  // udot v24.4s, v5.16b, v2.4b[2]\n"
+      ".inst 0x6f83e8bc  // udot v28.4s, v5.16b, v3.4b[2]\n"
+      "ldr q5, [x11, #0xf0]\n"
+      ".inst 0x6f80e8d1  // udot v17.4s, v6.16b, v0.4b[2]\n"
+      "add x11, x11, #0x100\n"
+      ".inst 0x6f81e8d5  // udot v21.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8d9  // udot v25.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x6f83e8dd  // udot v29.4s, v6.16b, v3.4b[2]\n"
+      ".inst 0x6f80e8f2  // udot v18.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8f6  // udot v22.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8fa  // udot v26.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x6f83e8fe  // udot v30.4s, v7.16b, v3.4b[2]\n"
+      ".inst 0x6f80e913  // udot v19.4s, v8.16b, v0.4b[2]\n"
+      ".inst 0x6f81e917  // udot v23.4s, v8.16b, v1.4b[2]\n"
+      ".inst 0x6f82e91b  // udot v27.4s, v8.16b, v2.4b[2]\n"
+      ".inst 0x6f83e91f  // udot v31.4s, v8.16b, v3.4b[2]\n"
+      ".inst 0x6fa0e930  // udot v16.4s, v9.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e934  // udot v20.4s, v9.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e938  // udot v24.4s, v9.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e93c  // udot v28.4s, v9.16b, v3.4b[3]\n"
+      ".inst 0x6fa0e951  // udot v17.4s, v10.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e955  // udot v21.4s, v10.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e959  // udot v25.4s, v10.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e95d  // udot v29.4s, v10.16b, v3.4b[3]\n"
+      ".inst 0x6fa0e892  // udot v18.4s, v4.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e896  // udot v22.4s, v4.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e89a  // udot v26.4s, v4.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e89e  // udot v30.4s, v4.16b, v3.4b[3]\n"
+      ".inst 0x6fa0e8b3  // udot v19.4s, v5.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8b7  // udot v23.4s, v5.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8bb  // udot v27.4s, v5.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e8bf  // udot v31.4s, v5.16b, v3.4b[3]\n"
+      "tbnz %x[flags], #31, 102f\n"
+      ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
+      ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
+      ".inst 0x6e8f944d  // udot v13.4s, v2.16b, v15.16b\n"
+      ".inst 0x6e8f946e  // udot v14.4s, v3.16b, v15.16b\n"
+      "102:"  // Height 4: Multiply loop: unique 13: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "sub x27, x27, #0x10\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "cmp x27, #0x20\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "bge 101b\n"
+      "103:"  // Height 4: Multiply loop: Single iteration only
+      "sub x27, x27, #0x10\n"
+      "ldr q0, [x26, #0x0]\n"
+      "ldr q1, [x24, #0x0]\n"
+      "ldr q2, [x22, #0x0]\n"
+      "ldr q3, [x20, #0x0]\n"
+      "ldr q6, [x11, #0x0]\n"
+      ".inst 0x6f80e0d0  // udot v16.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x11, #0x10]\n"
+      ".inst 0x6f81e0d4  // udot v20.4s, v6.16b, v1.4b[0]\n"
+      "ldr q8, [x11, #0x20]\n"
+      ".inst 0x6f82e0d8  // udot v24.4s, v6.16b, v2.4b[0]\n"
+      "ldr q9, [x11, #0x30]\n"
+      ".inst 0x6f83e0dc  // udot v28.4s, v6.16b, v3.4b[0]\n"
+      "ldr q10, [x11, #0x40]\n"
+      "ldr q4, [x11, #0x50]\n"
+      ".inst 0x6f80e0f1  // udot v17.4s, v7.16b, v0.4b[0]\n"
+      "ldr q5, [x11, #0x60]\n"
+      ".inst 0x6f81e0f5  // udot v21.4s, v7.16b, v1.4b[0]\n"
+      "ldr q6, [x11, #0x70]\n"
+      ".inst 0x6f82e0f9  // udot v25.4s, v7.16b, v2.4b[0]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x6f83e0fd  // udot v29.4s, v7.16b, v3.4b[0]\n"
+      "ldr q7, [x11, #0x80]\n"
+      ".inst 0x6f80e112  // udot v18.4s, v8.16b, v0.4b[0]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x6f81e116  // udot v22.4s, v8.16b, v1.4b[0]\n"
+      "add x22, x22, #0x10\n"
+      ".inst 0x6f82e11a  // udot v26.4s, v8.16b, v2.4b[0]\n"
+      "add x20, x20, #0x10\n"
+      ".inst 0x6f83e11e  // udot v30.4s, v8.16b, v3.4b[0]\n"
+      "ldr q8, [x11, #0x90]\n"
+      ".inst 0x6f80e133  // udot v19.4s, v9.16b, v0.4b[0]\n"
+      ".inst 0x6f81e137  // udot v23.4s, v9.16b, v1.4b[0]\n"
+      ".inst 0x6f82e13b  // udot v27.4s, v9.16b, v2.4b[0]\n"
+      ".inst 0x6f83e13f  // udot v31.4s, v9.16b, v3.4b[0]\n"
+      "ldr q9, [x11, #0xa0]\n"
+      ".inst 0x6fa0e150  // udot v16.4s, v10.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e154  // udot v20.4s, v10.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e158  // udot v24.4s, v10.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e15c  // udot v28.4s, v10.16b, v3.4b[1]\n"
+      "ldr q10, [x11, #0xb0]\n"
+      ".inst 0x6fa0e091  // udot v17.4s, v4.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e095  // udot v21.4s, v4.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e099  // udot v25.4s, v4.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e09d  // udot v29.4s, v4.16b, v3.4b[1]\n"
+      "ldr q4, [x11, #0xc0]\n"
+      ".inst 0x6fa0e0b2  // udot v18.4s, v5.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0b6  // udot v22.4s, v5.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0ba  // udot v26.4s, v5.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e0be  // udot v30.4s, v5.16b, v3.4b[1]\n"
+      "ldr q5, [x11, #0xd0]\n"
+      ".inst 0x6fa0e0d3  // udot v19.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0d7  // udot v23.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0db  // udot v27.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e0df  // udot v31.4s, v6.16b, v3.4b[1]\n"
+      "ldr q6, [x11, #0xe0]\n"
+      ".inst 0x6f80e8f0  // udot v16.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8f4  // udot v20.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8f8  // udot v24.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x6f83e8fc  // udot v28.4s, v7.16b, v3.4b[2]\n"
+      "ldr q7, [x11, #0xf0]\n"
+      ".inst 0x6f80e911  // udot v17.4s, v8.16b, v0.4b[2]\n"
+      "add x11, x11, #0x100\n"
+      ".inst 0x6f81e915  // udot v21.4s, v8.16b, v1.4b[2]\n"
+      ".inst 0x6f82e919  // udot v25.4s, v8.16b, v2.4b[2]\n"
+      ".inst 0x6f83e91d  // udot v29.4s, v8.16b, v3.4b[2]\n"
+      ".inst 0x6f80e932  // udot v18.4s, v9.16b, v0.4b[2]\n"
+      ".inst 0x6f81e936  // udot v22.4s, v9.16b, v1.4b[2]\n"
+      ".inst 0x6f82e93a  // udot v26.4s, v9.16b, v2.4b[2]\n"
+      ".inst 0x6f83e93e  // udot v30.4s, v9.16b, v3.4b[2]\n"
+      ".inst 0x6f80e953  // udot v19.4s, v10.16b, v0.4b[2]\n"
+      ".inst 0x6f81e957  // udot v23.4s, v10.16b, v1.4b[2]\n"
+      ".inst 0x6f82e95b  // udot v27.4s, v10.16b, v2.4b[2]\n"
+      ".inst 0x6f83e95f  // udot v31.4s, v10.16b, v3.4b[2]\n"
+      ".inst 0x6fa0e890  // udot v16.4s, v4.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e894  // udot v20.4s, v4.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e898  // udot v24.4s, v4.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e89c  // udot v28.4s, v4.16b, v3.4b[3]\n"
+      ".inst 0x6fa0e8b1  // udot v17.4s, v5.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8b5  // udot v21.4s, v5.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8b9  // udot v25.4s, v5.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e8bd  // udot v29.4s, v5.16b, v3.4b[3]\n"
+      ".inst 0x6fa0e8d2  // udot v18.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8d6  // udot v22.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8da  // udot v26.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e8de  // udot v30.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x6fa0e8f3  // udot v19.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8f7  // udot v23.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8fb  // udot v27.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e8ff  // udot v31.4s, v7.16b, v3.4b[3]\n"
+      "tbnz %x[flags], #31, 104f\n"
+      ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
+      ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
+      ".inst 0x6e8f944d  // udot v13.4s, v2.16b, v15.16b\n"
+      ".inst 0x6e8f946e  // udot v14.4s, v3.16b, v15.16b\n"
+      "104:"  // Height 4: Multiply loop: unique 14: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "105:"  // Height 4: Multiply loop: Main loop skip
+      "cbz x27, 112f\n"
+      "cmp x27, #0x4\n"
+      "blt 108f\n"
+      "106:"  // Height 4: Multiply loop: Odd block loop
+      "ldr s0, [x26], #0x4\n"
+      "ldr s1, [x24], #0x4\n"
+      "ldr s2, [x22], #0x4\n"
+      "ldr s3, [x20], #0x4\n"
+      "tbnz %x[flags], #31, 107f\n"
+      ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
+      ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
+      ".inst 0x6e8f944d  // udot v13.4s, v2.16b, v15.16b\n"
+      ".inst 0x6e8f946e  // udot v14.4s, v3.16b, v15.16b\n"
+      "107:"  // Height 4: Multiply loop: unique 15: skip row sum
+      "ldr q8, [x11, #0x0]\n"
+      ".inst 0x6f80e110  // udot v16.4s, v8.16b, v0.4b[0]\n"
+      "ldr q9, [x11, #0x10]\n"
+      ".inst 0x6f81e114  // udot v20.4s, v8.16b, v1.4b[0]\n"
+      "ldr q10, [x11, #0x20]\n"
+      ".inst 0x6f82e118  // udot v24.4s, v8.16b, v2.4b[0]\n"
+      "ldr q4, [x11, #0x30]\n"
+      ".inst 0x6f83e11c  // udot v28.4s, v8.16b, v3.4b[0]\n"
+      "sub x27, x27, #0x4\n"
+      "add x11, x11, #0x40\n"
+      ".inst 0x6f80e131  // udot v17.4s, v9.16b, v0.4b[0]\n"
+      "cmp x27, #0x4\n"
+      ".inst 0x6f81e135  // udot v21.4s, v9.16b, v1.4b[0]\n"
+      ".inst 0x6f82e139  // udot v25.4s, v9.16b, v2.4b[0]\n"
+      ".inst 0x6f83e13d  // udot v29.4s, v9.16b, v3.4b[0]\n"
+      ".inst 0x6f80e152  // udot v18.4s, v10.16b, v0.4b[0]\n"
+      ".inst 0x6f81e156  // udot v22.4s, v10.16b, v1.4b[0]\n"
+      ".inst 0x6f82e15a  // udot v26.4s, v10.16b, v2.4b[0]\n"
+      ".inst 0x6f83e15e  // udot v30.4s, v10.16b, v3.4b[0]\n"
+      ".inst 0x6f80e093  // udot v19.4s, v4.16b, v0.4b[0]\n"
+      ".inst 0x6f81e097  // udot v23.4s, v4.16b, v1.4b[0]\n"
+      ".inst 0x6f82e09b  // udot v27.4s, v4.16b, v2.4b[0]\n"
+      ".inst 0x6f83e09f  // udot v31.4s, v4.16b, v3.4b[0]\n"
+      "bge 106b\n"
+      "cbz x27, 112f\n"
+      "108:"  // Height 4: Multiply loop: Skip odd blocks
+      "tbz x27, #1, 109f\n"
+      "ldr h0, [x26], #0x2\n"
+      "ldr h1, [x24], #0x2\n"
+      "ldr h2, [x22], #0x2\n"
+      "ldr h3, [x20], #0x2\n"
+      "tbz x27, #0, 110f\n"
+      "ld1 { v0.b }[2], [x26]\n"
+      "ld1 { v1.b }[2], [x24]\n"
+      "ld1 { v2.b }[2], [x22]\n"
+      "ld1 { v3.b }[2], [x20]\n"
+      "b 110f\n"
+      "109:"  // Height 4: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b0, [x26, #0x0]\n"
+      "ldr b1, [x24, #0x0]\n"
+      "ldr b2, [x22, #0x0]\n"
+      "ldr b3, [x20, #0x0]\n"
+      "110:"  // Height 4: Multiply loop: Ragged operand read: Done
+      "tbnz %x[flags], #31, 111f\n"
+      ".inst 0x6e8f940b  // udot v11.4s, v0.16b, v15.16b\n"
+      ".inst 0x6e8f942c  // udot v12.4s, v1.16b, v15.16b\n"
+      ".inst 0x6e8f944d  // udot v13.4s, v2.16b, v15.16b\n"
+      ".inst 0x6e8f946e  // udot v14.4s, v3.16b, v15.16b\n"
+      "111:"  // Height 4: Multiply loop: unique 16: skip row sum
+      "ldr q5, [x11, #0x0]\n"
+      ".inst 0x6f80e0b0  // udot v16.4s, v5.16b, v0.4b[0]\n"
+      "ldr q6, [x11, #0x10]\n"
+      ".inst 0x6f81e0b4  // udot v20.4s, v5.16b, v1.4b[0]\n"
+      "ldr q7, [x11, #0x20]\n"
+      ".inst 0x6f82e0b8  // udot v24.4s, v5.16b, v2.4b[0]\n"
+      "ldr q8, [x11, #0x30]\n"
+      ".inst 0x6f83e0bc  // udot v28.4s, v5.16b, v3.4b[0]\n"
+      "add x11, x11, #0x40\n"
+      ".inst 0x6f80e0d1  // udot v17.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0d5  // udot v21.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0d9  // udot v25.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0dd  // udot v29.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x6f80e0f2  // udot v18.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0f6  // udot v22.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0fa  // udot v26.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0fe  // udot v30.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x6f80e113  // udot v19.4s, v8.16b, v0.4b[0]\n"
+      ".inst 0x6f81e117  // udot v23.4s, v8.16b, v1.4b[0]\n"
+      ".inst 0x6f82e11b  // udot v27.4s, v8.16b, v2.4b[0]\n"
+      ".inst 0x6f83e11f  // udot v31.4s, v8.16b, v3.4b[0]\n"
+      "112:"  // Height 4: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x28, x28, #0x1\n"
+      "cmp x28, x19\n"
+      "bne 98b\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "prfm pstl1keep, [x21, #0x0]\n"
+      "tbnz %x[flags], #31, 113f\n"
+      "addp v11.4s, v11.4s, v11.4s\n"
+      "add x19, %x[qp], %[b_offset]\n"
+      "ld1r { v4.4s }, [x19]\n"
+      "addp v12.4s, v12.4s, v12.4s\n"
+      "addp v13.4s, v13.4s, v13.4s\n"
+      "addp v14.4s, v14.4s, v14.4s\n"
+      "addp v11.4s, v11.4s, v11.4s\n"
+      "addp v12.4s, v12.4s, v12.4s\n"
+      "addp v13.4s, v13.4s, v13.4s\n"
+      "addp v14.4s, v14.4s, v14.4s\n"
+      "neg v4.4s, v4.4s\n"
+      "mul v11.4s, v11.4s, v4.4s\n"
+      "mul v12.4s, v12.4s, v4.4s\n"
+      "mul v13.4s, v13.4s, v4.4s\n"
+      "mul v14.4s, v14.4s, v4.4s\n"
+      "113:"  // Height 4: skip row sum fixup
+      "add v16.4s, v16.4s, v11.4s\n"
+      "orr %x[flags], %x[flags], #0x80000000\n"
+      "add v17.4s, v17.4s, v11.4s\n"
+      "ldr q0, [x10, #0x0]\n"
+      "add v18.4s, v18.4s, v11.4s\n"
+      "ldr q1, [x10, #0x10]\n"
+      "add v19.4s, v19.4s, v11.4s\n"
+      "ldr q2, [x10, #0x20]\n"
+      "add v20.4s, v20.4s, v12.4s\n"
+      "ldr q3, [x10, #0x30]\n"
+      "add v21.4s, v21.4s, v12.4s\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "add v22.4s, v22.4s, v12.4s\n"
+      "add x19, %x[qp], %[per_layer_mul]\n"
+      "ld1r { v4.4s }, [x19]\n"
+      "add v23.4s, v23.4s, v12.4s\n"
+      "add x10, x10, #0x40\n"
+      "add v24.4s, v24.4s, v13.4s\n"
+      "add v25.4s, v25.4s, v13.4s\n"
+      "add v26.4s, v26.4s, v13.4s\n"
+      "add v27.4s, v27.4s, v13.4s\n"
+      "add v28.4s, v28.4s, v14.4s\n"
+      "add v29.4s, v29.4s, v14.4s\n"
+      "add v30.4s, v30.4s, v14.4s\n"
+      "add v31.4s, v31.4s, v14.4s\n"
+      "add v16.4s, v16.4s, v0.4s\n"
+      "add v17.4s, v17.4s, v1.4s\n"
+      "add v18.4s, v18.4s, v2.4s\n"
+      "add v19.4s, v19.4s, v3.4s\n"
+      "add v20.4s, v20.4s, v0.4s\n"
+      "add v21.4s, v21.4s, v1.4s\n"
+      "add v22.4s, v22.4s, v2.4s\n"
+      "add v23.4s, v23.4s, v3.4s\n"
+      "add v24.4s, v24.4s, v0.4s\n"
+      "add v25.4s, v25.4s, v1.4s\n"
+      "add v26.4s, v26.4s, v2.4s\n"
+      "add v27.4s, v27.4s, v3.4s\n"
+      "add v28.4s, v28.4s, v0.4s\n"
+      "ld1r { v0.4s }, [x20]\n"
+      "add v29.4s, v29.4s, v1.4s\n"
+      "add v30.4s, v30.4s, v2.4s\n"
+      "add v31.4s, v31.4s, v3.4s\n"
+      "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+      "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+      "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+      "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+      "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+      "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+      "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+      "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+      "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+      "sqrdmulh v25.4s, v25.4s, v4.4s\n"
+      "sqrdmulh v26.4s, v26.4s, v4.4s\n"
+      "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+      "sqrdmulh v28.4s, v28.4s, v4.4s\n"
+      "sqrdmulh v29.4s, v29.4s, v4.4s\n"
+      "sqrdmulh v30.4s, v30.4s, v4.4s\n"
+      "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+      "tbz %x[flags], #5, 114f\n"
+      "and v4.16b, v16.16b, v0.16b\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "and v5.16b, v17.16b, v0.16b\n"
+      "and v6.16b, v18.16b, v0.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "and v7.16b, v19.16b, v0.16b\n"
+      "and v8.16b, v20.16b, v0.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "and v9.16b, v21.16b, v0.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "sqadd v16.4s, v16.4s, v4.4s\n"
+      "and v10.16b, v22.16b, v0.16b\n"
+      "sshr v8.4s, v8.4s, #0x1f\n"
+      "and v4.16b, v23.16b, v0.16b\n"
+      "sshr v9.4s, v9.4s, #0x1f\n"
+      "sqadd v17.4s, v17.4s, v5.4s\n"
+      "sshr v10.4s, v10.4s, #0x1f\n"
+      "sqadd v18.4s, v18.4s, v6.4s\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "and v5.16b, v24.16b, v0.16b\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sqadd v19.4s, v19.4s, v7.4s\n"
+      "sqadd v20.4s, v20.4s, v8.4s\n"
+      "sqadd v21.4s, v21.4s, v9.4s\n"
+      "sqadd v22.4s, v22.4s, v10.4s\n"
+      "sqadd v23.4s, v23.4s, v4.4s\n"
+      "and v6.16b, v25.16b, v0.16b\n"
+      "sshr v6.4s, v6.4s, #0x1f\n"
+      "sqadd v24.4s, v24.4s, v5.4s\n"
+      "and v7.16b, v26.16b, v0.16b\n"
+      "sshr v7.4s, v7.4s, #0x1f\n"
+      "and v8.16b, v27.16b, v0.16b\n"
+      "and v9.16b, v28.16b, v0.16b\n"
+      "sshr v8.4s, v8.4s, #0x1f\n"
+      "sqadd v25.4s, v25.4s, v6.4s\n"
+      "and v10.16b, v29.16b, v0.16b\n"
+      "sshr v9.4s, v9.4s, #0x1f\n"
+      "and v4.16b, v30.16b, v0.16b\n"
+      "sshr v10.4s, v10.4s, #0x1f\n"
+      "sqadd v26.4s, v26.4s, v7.4s\n"
+      "and v5.16b, v31.16b, v0.16b\n"
+      "sshr v4.4s, v4.4s, #0x1f\n"
+      "sqadd v27.4s, v27.4s, v8.4s\n"
+      "sshr v5.4s, v5.4s, #0x1f\n"
+      "sqadd v28.4s, v28.4s, v9.4s\n"
+      "sqadd v29.4s, v29.4s, v10.4s\n"
+      "sqadd v30.4s, v30.4s, v4.4s\n"
+      "sqadd v31.4s, v31.4s, v5.4s\n"
+      "114:"  // Height 4: no shift correction
+      "srshl v16.4s, v16.4s, v0.4s\n"
+      "add x19, %x[qp], %[c_offset]\n"
+      "ld1r { v4.4s }, [x19]\n"
+      "srshl v17.4s, v17.4s, v0.4s\n"
+      "add x19, %x[qp], %[minval]\n"
+      "srshl v18.4s, v18.4s, v0.4s\n"
+      "ld1r { v5.4s }, [x19]\n"
+      "add x19, %x[qp], %[maxval]\n"
+      "srshl v19.4s, v19.4s, v0.4s\n"
+      "ld1r { v6.4s }, [x19]\n"
+      "cmp x12, #0x10\n"
+      "srshl v20.4s, v20.4s, v0.4s\n"
+      "srshl v21.4s, v21.4s, v0.4s\n"
+      "srshl v22.4s, v22.4s, v0.4s\n"
+      "srshl v23.4s, v23.4s, v0.4s\n"
+      "add v16.4s, v16.4s, v4.4s\n"
+      "add v17.4s, v17.4s, v4.4s\n"
+      "add v18.4s, v18.4s, v4.4s\n"
+      "smin v16.4s, v16.4s, v6.4s\n"
+      "smin v17.4s, v17.4s, v6.4s\n"
+      "smin v18.4s, v18.4s, v6.4s\n"
+      "smax v16.4s, v16.4s, v5.4s\n"
+      "smax v17.4s, v17.4s, v5.4s\n"
+      "smax v18.4s, v18.4s, v5.4s\n"
+      "add v19.4s, v19.4s, v4.4s\n"
+      "add v20.4s, v20.4s, v4.4s\n"
+      "add v21.4s, v21.4s, v4.4s\n"
+      "smin v19.4s, v19.4s, v6.4s\n"
+      "smin v20.4s, v20.4s, v6.4s\n"
+      "smin v21.4s, v21.4s, v6.4s\n"
+      "smax v19.4s, v19.4s, v5.4s\n"
+      "smax v20.4s, v20.4s, v5.4s\n"
+      "smax v21.4s, v21.4s, v5.4s\n"
+      "add v22.4s, v22.4s, v4.4s\n"
+      "add v23.4s, v23.4s, v4.4s\n"
+      "srshl v24.4s, v24.4s, v0.4s\n"
+      "smin v22.4s, v22.4s, v6.4s\n"
+      "smin v23.4s, v23.4s, v6.4s\n"
+      "srshl v25.4s, v25.4s, v0.4s\n"
+      "smax v22.4s, v22.4s, v5.4s\n"
+      "smax v23.4s, v23.4s, v5.4s\n"
+      "add v24.4s, v24.4s, v4.4s\n"
+      "add v25.4s, v25.4s, v4.4s\n"
+      "srshl v26.4s, v26.4s, v0.4s\n"
+      "smin v24.4s, v24.4s, v6.4s\n"
+      "smin v25.4s, v25.4s, v6.4s\n"
+      "srshl v27.4s, v27.4s, v0.4s\n"
+      "smax v24.4s, v24.4s, v5.4s\n"
+      "smax v25.4s, v25.4s, v5.4s\n"
+      "add v26.4s, v26.4s, v4.4s\n"
+      "add v27.4s, v27.4s, v4.4s\n"
+      "srshl v28.4s, v28.4s, v0.4s\n"
+      "smin v26.4s, v26.4s, v6.4s\n"
+      "smin v27.4s, v27.4s, v6.4s\n"
+      "srshl v29.4s, v29.4s, v0.4s\n"
+      "smax v26.4s, v26.4s, v5.4s\n"
+      "smax v27.4s, v27.4s, v5.4s\n"
+      "add v28.4s, v28.4s, v4.4s\n"
+      "add v29.4s, v29.4s, v4.4s\n"
+      "srshl v30.4s, v30.4s, v0.4s\n"
+      "smin v28.4s, v28.4s, v6.4s\n"
+      "smin v29.4s, v29.4s, v6.4s\n"
+      "srshl v31.4s, v31.4s, v0.4s\n"
+      "smax v28.4s, v28.4s, v5.4s\n"
+      "smax v29.4s, v29.4s, v5.4s\n"
+      "add v30.4s, v30.4s, v4.4s\n"
+      "add v31.4s, v31.4s, v4.4s\n"
+      "uzp1 v16.8h, v16.8h, v17.8h\n"
+      "smin v30.4s, v30.4s, v6.4s\n"
+      "smin v31.4s, v31.4s, v6.4s\n"
+      "uzp1 v17.8h, v18.8h, v19.8h\n"
+      "smax v30.4s, v30.4s, v5.4s\n"
+      "smax v31.4s, v31.4s, v5.4s\n"
+      "uzp1 v20.8h, v20.8h, v21.8h\n"
+      "uzp1 v21.8h, v22.8h, v23.8h\n"
+      "uzp1 v24.8h, v24.8h, v25.8h\n"
+      "uzp1 v25.8h, v26.8h, v27.8h\n"
+      "uzp1 v28.8h, v28.8h, v29.8h\n"
+      "uzp1 v29.8h, v30.8h, v31.8h\n"
+      "uzp1 v16.16b, v16.16b, v17.16b\n"
+      "uzp1 v20.16b, v20.16b, v21.16b\n"
+      "uzp1 v24.16b, v24.16b, v25.16b\n"
+      "uzp1 v28.16b, v28.16b, v29.16b\n"
+      "bge 123f\n"
+      "tbz x12, #3, 118f\n"
+      "str d16, [x9], #0x8\n"
+      "str d20, [x25], #0x8\n"
+      "str d24, [x23], #0x8\n"
+      "str d28, [x21], #0x8\n"
+      "tbz x12, #2, 116f\n"
+      "st1 { v16.s }[2], [x9], #0x4\n"
+      "st1 { v20.s }[2], [x25], #0x4\n"
+      "st1 { v24.s }[2], [x23], #0x4\n"
+      "st1 { v28.s }[2], [x21], #0x4\n"
+      "tbz x12, #1, 115f\n"
+      "st1 { v16.h }[6], [x9], #0x2\n"
+      "st1 { v20.h }[6], [x25], #0x2\n"
+      "st1 { v24.h }[6], [x23], #0x2\n"
+      "st1 { v28.h }[6], [x21], #0x2\n"
+      "tbz x12, #0, 122f\n"
+      "st1 { v16.b }[14], [x9]\n"
+      "st1 { v20.b }[14], [x25]\n"
+      "st1 { v24.b }[14], [x23]\n"
+      "st1 { v28.b }[14], [x21]\n"
+      "b 122f\n"
+      "115:"  // Height 4: Partial direct writeback: partial_1_12
+      "tbz x12, #0, 122f\n"
+      "st1 { v16.b }[12], [x9]\n"
+      "st1 { v20.b }[12], [x25]\n"
+      "st1 { v24.b }[12], [x23]\n"
+      "st1 { v28.b }[12], [x21]\n"
+      "b 122f\n"
+      "116:"  // Height 4: Partial direct writeback: partial_2_8
+      "tbz x12, #1, 117f\n"
+      "st1 { v16.h }[4], [x9], #0x2\n"
+      "st1 { v20.h }[4], [x25], #0x2\n"
+      "st1 { v24.h }[4], [x23], #0x2\n"
+      "st1 { v28.h }[4], [x21], #0x2\n"
+      "tbz x12, #0, 122f\n"
+      "st1 { v16.b }[10], [x9]\n"
+      "st1 { v20.b }[10], [x25]\n"
+      "st1 { v24.b }[10], [x23]\n"
+      "st1 { v28.b }[10], [x21]\n"
+      "b 122f\n"
+      "117:"  // Height 4: Partial direct writeback: partial_1_8
+      "tbz x12, #0, 122f\n"
+      "st1 { v16.b }[8], [x9]\n"
+      "st1 { v20.b }[8], [x25]\n"
+      "st1 { v24.b }[8], [x23]\n"
+      "st1 { v28.b }[8], [x21]\n"
+      "b 122f\n"
+      "118:"  // Height 4: Partial direct writeback: partial_4_0
+      "tbz x12, #2, 120f\n"
+      "str s16, [x9], #0x4\n"
+      "str s20, [x25], #0x4\n"
+      "str s24, [x23], #0x4\n"
+      "str s28, [x21], #0x4\n"
+      "tbz x12, #1, 119f\n"
+      "st1 { v16.h }[2], [x9], #0x2\n"
+      "st1 { v20.h }[2], [x25], #0x2\n"
+      "st1 { v24.h }[2], [x23], #0x2\n"
+      "st1 { v28.h }[2], [x21], #0x2\n"
+      "tbz x12, #0, 122f\n"
+      "st1 { v16.b }[6], [x9]\n"
+      "st1 { v20.b }[6], [x25]\n"
+      "st1 { v24.b }[6], [x23]\n"
+      "st1 { v28.b }[6], [x21]\n"
+      "b 122f\n"
+      "119:"  // Height 4: Partial direct writeback: partial_1_4
+      "tbz x12, #0, 122f\n"
+      "st1 { v16.b }[4], [x9]\n"
+      "st1 { v20.b }[4], [x25]\n"
+      "st1 { v24.b }[4], [x23]\n"
+      "st1 { v28.b }[4], [x21]\n"
+      "b 122f\n"
+      "120:"  // Height 4: Partial direct writeback: partial_2_0
+      "tbz x12, #1, 121f\n"
+      "str h16, [x9], #0x2\n"
+      "str h20, [x25], #0x2\n"
+      "str h24, [x23], #0x2\n"
+      "str h28, [x21], #0x2\n"
+      "tbz x12, #0, 122f\n"
+      "st1 { v16.b }[2], [x9]\n"
+      "st1 { v20.b }[2], [x25]\n"
+      "st1 { v24.b }[2], [x23]\n"
+      "st1 { v28.b }[2], [x21]\n"
+      "b 122f\n"
+      "121:"  // Height 4: Partial direct writeback: partial_1_0
+      "str b16, [x9, #0x0]\n"
+      "str b20, [x25, #0x0]\n"
+      "str b24, [x23, #0x0]\n"
+      "str b28, [x21, #0x0]\n"
+      "122:"  // Height 4: Partial direct writeback: Done
+      "b 124f\n"
+      "123:"  // Height 4: Full writeback
+      "str q16, [x9, #0x0]\n"
+      "str q20, [x25, #0x0]\n"
+      "str q24, [x23, #0x0]\n"
+      "str q28, [x21, #0x0]\n"
+      "add x9, x9, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "add x23, x23, #0x10\n"
+      "add x21, x21, #0x10\n"
+      "124:"  // Height 4: Writeback done
+      "subs x12, x12, #0x10\n"
+      "bgt 96b\n"
+      "subs %x[M], %x[M], #0x4\n"
+      "beq 126f\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "tbz %x[flags], #3, 125f\n"
+      "add x20, x20, #0x4\n"
+      "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "b 1b\n"
+      "125:"  // Update direct input
+      "mov x19, #0x4\n"
+      "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+      "b 1b\n"
+      "126:"  // Exit
+
+      : [M] "+r" (M), [flags] "+r" (flags), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+      : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+    );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
deleted file mode 100644
index 735e5fd45a..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
+++ /dev/null
@@ -1,2434 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include <cstdint>
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool accumulate) {
-    const int K_stride = ((K + 3) / 4) * 4;
-    const long loops_count = ((K + 16) / 32) - 1;
-    K -= loops_count * 32;
-    const long regs_count = (K / 16) - 1;
-    K -= (regs_count + 1) * 16;
-    const long blocks_count = K / 4;
-    const long odds_count = K - (blocks_count * 4);
-
-    int rows_to_compute;
-
-    for (int y=0; y<M; y+=rows_to_compute) {
-        const uint8_t * const a_ptr0_base = A + (y * lda);
-        const unsigned long ldab = lda * sizeof(uint8_t);
-
-        uint32_t *c_ptr0 = C + (y * ldc);
-
-        rows_to_compute = M-y;
-        if (rows_to_compute > 4) {
-            if (rows_to_compute % 4) {
-                rows_to_compute = 4 - 1;
-            } else {
-                rows_to_compute = 4;
-            }
-        }
-
-        for (int x0=0; x0<N; x0+=16ul) {
-            const long width = std::min((unsigned long)N-x0, 16ul);
-            long loops = loops_count;
-            long regs = regs_count;
-            long blocks = blocks_count;
-            long odds = odds_count;
-            const uint8_t *a_ptr0 = a_ptr0_base;
-            const uint8_t *b_ptr0 = B + (K_stride * x0);
-            const bool use_result_buffer = (width < 16);
-            uint32_t result_buffer[64];
-            const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(uint32_t);
-            uint32_t *c_ptr_real = c_ptr0;
-            if (use_result_buffer && accumulate) {
-                for(int cy=0; cy<std::min(M-y, 4); cy++) {
-                    for(unsigned int cx=0; cx<width; cx++) {
-                        result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
-                    }
-                }
-            }
-            if (use_result_buffer) {
-                c_ptr0 = result_buffer;
-            }
-
-            switch(rows_to_compute) {
-                case 1:
-                    __asm __volatile (
-                        "temploadreg0 .req X0\n"
-                        "temploadreg1 .req X1\n"
-                        "temploadreg2 .req X2\n"
-                        "temploadreg3 .req X3\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "ldr d0, [%[a_ptr0], #-0x10]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "cbz %[regs], 4f\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "5:\n"
-                        "cbz %[blocks], 6f\n"
-                        "7:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "b.ne 7b\n"
-                        "6:\n"
-                        "cbz %[odds], 8f\n"
-                        "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[2], [%[a_ptr0]]\n"
-                        "9:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "8:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
-                    );
-                    break;
-                case 2:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "temploadreg0 .req X2\n"
-                        "temploadreg1 .req X3\n"
-                        "temploadreg2 .req X4\n"
-                        "temploadreg3 .req X5\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr d0, [%[a_ptr0], #-0x10]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
-                        ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr d1, [a_ptr1, #-0x10]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
-                        ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "cbz %[regs], 4f\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "5:\n"
-                        "cbz %[blocks], 6f\n"
-                        "7:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr s1, [a_ptr1]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "b.ne 7b\n"
-                        "6:\n"
-                        "cbz %[odds], 8f\n"
-                        "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
-                        "ld1 {v1.b}[0], [a_ptr1], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
-                        "ld1 {v1.b}[1], [a_ptr1], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[2], [%[a_ptr0]]\n"
-                        "ld1 {v1.b}[2], [a_ptr1]\n"
-                        "9:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "8:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
-                    );
-                    break;
-                case 3:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "temploadreg0 .req X4\n"
-                        "temploadreg1 .req X5\n"
-                        "temploadreg2 .req X6\n"
-                        "temploadreg3 .req X7\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v24.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "movi v25.4s, #0\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "movi v26.4s, #0\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "movi v27.4s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr d0, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
-                        ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr d1, [a_ptr1, #-0x10]\n"
-                        ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
-                        ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "ldr d2, [a_ptr2, #-0x10]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg2, [a_ptr2, #-0x8]\n"
-                        ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ins v2.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "cbz %[regs], 4f\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "5:\n"
-                        "cbz %[blocks], 6f\n"
-                        "7:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr s1, [a_ptr1]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr s2, [a_ptr2]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x4\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "b.ne 7b\n"
-                        "6:\n"
-                        "cbz %[odds], 8f\n"
-                        "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
-                        "ld1 {v1.b}[0], [a_ptr1], #1\n"
-                        "ld1 {v2.b}[0], [a_ptr2], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
-                        "ld1 {v1.b}[1], [a_ptr1], #1\n"
-                        "ld1 {v2.b}[1], [a_ptr2], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[2], [%[a_ptr0]]\n"
-                        "ld1 {v1.b}[2], [a_ptr1]\n"
-                        "ld1 {v2.b}[2], [a_ptr2]\n"
-                        "9:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "8:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "str q24, [c_ptr2]\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
-                    );
-                    break;
-                default:
-                case 4:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "temploadreg0 .req X6\n"
-                        "temploadreg1 .req X7\n"
-                        "temploadreg2 .req X8\n"
-                        "temploadreg3 .req X9\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v24.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v25.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "movi v26.4s, #0\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "movi v27.4s, #0\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "movi v28.4s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "movi v29.4s, #0\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "movi v30.4s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "movi v31.4s, #0\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "ldr q28, [c_ptr3]\n"
-                        "ldr q29, [c_ptr3, #0x10]\n"
-                        "ldr q30, [c_ptr3, #0x20]\n"
-                        "ldr q31, [c_ptr3, #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d7, [a_ptr3]\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
-                        ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ins v7.d[1], temploadreg3\n"
-                        ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr d0, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
-                        ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".inst 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ldr d1, [a_ptr1, #-0x10]\n"
-                        ".inst 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
-                        ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        ".inst 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr d2, [a_ptr2, #-0x10]\n"
-                        ".inst 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
-                        "ldr temploadreg2, [a_ptr2, #-0x8]\n"
-                        ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ins v2.d[1], temploadreg2\n"
-                        ".inst 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
-                        "ldr d3, [a_ptr3, #-0x10]\n"
-                        ".inst 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
-                        "ldr temploadreg3, [a_ptr3, #-0x8]\n"
-                        ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
-                        "ins v3.d[1], temploadreg3\n"
-                        ".inst 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".inst 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "prfm PSTL1KEEP, [c_ptr3]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "cbz %[regs], 4f\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr d7, [a_ptr3]\n"
-                        ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ins v7.d[1], temploadreg3\n"
-                        ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
-                        ".inst 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
-                        ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
-                        ".inst 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
-                        ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
-                        ".inst 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
-                        ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
-                        ".inst 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
-                        ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".inst 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
-                        ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".inst 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
-                        ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".inst 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
-                        ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
-                        ".inst 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
-                        ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
-                        ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
-                        ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
-                        ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
-                        ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
-                        ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
-                        ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
-                        "5:\n"
-                        "cbz %[blocks], 6f\n"
-                        "7:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr s1, [a_ptr1]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr s2, [a_ptr2]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x4\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr s3, [a_ptr3]\n"
-                        ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "add a_ptr3, a_ptr3, #0x4\n"
-                        ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
-                        ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
-                        "b.ne 7b\n"
-                        "6:\n"
-                        "cbz %[odds], 8f\n"
-                        "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
-                        "ld1 {v1.b}[0], [a_ptr1], #1\n"
-                        "ld1 {v2.b}[0], [a_ptr2], #1\n"
-                        "ld1 {v3.b}[0], [a_ptr3], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
-                        "ld1 {v1.b}[1], [a_ptr1], #1\n"
-                        "ld1 {v2.b}[1], [a_ptr2], #1\n"
-                        "ld1 {v3.b}[1], [a_ptr3], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[2], [%[a_ptr0]]\n"
-                        "ld1 {v1.b}[2], [a_ptr1]\n"
-                        "ld1 {v2.b}[2], [a_ptr2]\n"
-                        "ld1 {v3.b}[2], [a_ptr3]\n"
-                        "9:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
-                        "8:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "str q24, [c_ptr2]\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        "str q28, [c_ptr3]\n"
-                        "str q29, [c_ptr3, #0x10]\n"
-                        "str q30, [c_ptr3, #0x20]\n"
-                        "str q31, [c_ptr3, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
-                    );
-                    break;
-            }
-            if (use_result_buffer) {
-                for(int cy=0; cy<std::min(M-y, 4); cy++) {
-                    for(unsigned int cx=0; cx<width; cx++) {
-                        c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
-                    }
-                }
-            }
-        }
-    }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
deleted file mode 100644
index 2e86233a06..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
+++ /dev/null
@@ -1,1808 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include <cstdint>
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void a64_hybrid_u8u32_dot_16x4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool accumulate) {
-    const int K_stride = ((K + 3) / 4) * 4;
-    const long loops_count = ((K + 16) / 32) - 1;
-    K -= loops_count * 32;
-    const long regs_count = (K / 16) - 1;
-    K -= (regs_count + 1) * 16;
-    const long blocks_count = K / 4;
-    const long odds_count = K - (blocks_count * 4);
-
-    int rows_to_compute;
-
-    for (int y=0; y<M; y+=rows_to_compute) {
-        const uint8_t * const a_ptr0_base = A + (y * lda);
-        const unsigned long ldab = lda * sizeof(uint8_t);
-
-        uint32_t *c_ptr0 = C + (y * ldc);
-
-        rows_to_compute = M-y;
-        if (rows_to_compute > 4) {
-            if (rows_to_compute % 4) {
-                rows_to_compute = 4 - 1;
-            } else {
-                rows_to_compute = 4;
-            }
-        }
-
-        for (int x0=0; x0<N; x0+=16ul) {
-            const long width = std::min((unsigned long)N-x0, 16ul);
-            long loops = loops_count;
-            long regs = regs_count;
-            long blocks = blocks_count;
-            long odds = odds_count;
-            const uint8_t *a_ptr0 = a_ptr0_base;
-            const uint8_t *b_ptr0 = B + (K_stride * x0);
-            const bool use_result_buffer = (width < 16);
-            uint32_t result_buffer[64];
-            const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(uint32_t);
-            uint32_t *c_ptr_real = c_ptr0;
-            if (use_result_buffer && accumulate) {
-                for(int cy=0; cy<std::min(M-y, 4); cy++) {
-                    for(unsigned int cx=0; cx<width; cx++) {
-                        result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
-                    }
-                }
-            }
-            if (use_result_buffer) {
-                c_ptr0 = result_buffer;
-            }
-
-            switch(rows_to_compute) {
-                case 1:
-                    __asm __volatile (
-                        "cbnz %[accumulate], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "cbz %[regs], 4f\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "5:\n"
-                        "cbz %[blocks], 6f\n"
-                        "7:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "b.ne 7b\n"
-                        "6:\n"
-                        "cbz %[odds], 8f\n"
-                        "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[2], [%[a_ptr0]]\n"
-                        "9:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "8:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
-                    );
-                    break;
-                case 2:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "cbz %[regs], 4f\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "5:\n"
-                        "cbz %[blocks], 6f\n"
-                        "7:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr s1, [a_ptr1]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "b.ne 7b\n"
-                        "6:\n"
-                        "cbz %[odds], 8f\n"
-                        "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
-                        "ld1 {v1.b}[0], [a_ptr1], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
-                        "ld1 {v1.b}[1], [a_ptr1], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[2], [%[a_ptr0]]\n"
-                        "ld1 {v1.b}[2], [a_ptr1]\n"
-                        "9:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "8:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
-                    );
-                    break;
-                case 3:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v24.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "movi v25.4s, #0\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "movi v26.4s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "movi v27.4s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q2, [a_ptr2, #-0x10]\n"
-                        ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "cbz %[regs], 4f\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
-                        ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
-                        ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
-                        ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
-                        ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "5:\n"
-                        "cbz %[blocks], 6f\n"
-                        "7:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr s1, [a_ptr1]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr s2, [a_ptr2]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x4\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "b.ne 7b\n"
-                        "6:\n"
-                        "cbz %[odds], 8f\n"
-                        "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
-                        "ld1 {v1.b}[0], [a_ptr1], #1\n"
-                        "ld1 {v2.b}[0], [a_ptr2], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
-                        "ld1 {v1.b}[1], [a_ptr1], #1\n"
-                        "ld1 {v2.b}[1], [a_ptr2], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[2], [%[a_ptr0]]\n"
-                        "ld1 {v1.b}[2], [a_ptr1]\n"
-                        "ld1 {v2.b}[2], [a_ptr2]\n"
-                        "9:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "8:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "str q24, [c_ptr2]\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
-                    );
-                    break;
-                default:
-                case 4:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v24.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v25.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "movi v26.4s, #0\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "movi v27.4s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "movi v28.4s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "movi v29.4s, #0\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "movi v30.4s, #0\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "movi v31.4s, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "ldr q28, [c_ptr3]\n"
-                        "ldr q29, [c_ptr3, #0x10]\n"
-                        "ldr q30, [c_ptr3, #0x20]\n"
-                        "ldr q31, [c_ptr3, #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q7, [a_ptr3]\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
-                        ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr q2, [a_ptr2, #-0x10]\n"
-                        ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q3, [a_ptr3, #-0x10]\n"
-                        ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
-                        ".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
-                        ".inst 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
-                        ".inst 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
-                        ".inst 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
-                        ".inst 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
-                        ".inst 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
-                        ".inst 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
-                        ".inst 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
-                        ".inst 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
-                        ".inst 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
-                        ".inst 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
-                        ".inst 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".inst 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".inst 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".inst 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
-                        ".inst 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "prfm PSTL1KEEP, [c_ptr3]\n"
-                        "cbz %[regs], 4f\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr q7, [a_ptr3]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
-                        ".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
-                        ".inst 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
-                        ".inst 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
-                        ".inst 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
-                        ".inst 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
-                        ".inst 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
-                        ".inst 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
-                        ".inst 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
-                        ".inst 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
-                        ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
-                        ".inst 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
-                        ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
-                        ".inst 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
-                        ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
-                        ".inst 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
-                        ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".inst 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
-                        ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".inst 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
-                        ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".inst 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
-                        ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
-                        ".inst 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
-                        ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
-                        ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
-                        ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
-                        ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
-                        ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
-                        ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
-                        ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
-                        "5:\n"
-                        "cbz %[blocks], 6f\n"
-                        "7:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr s1, [a_ptr1]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr s2, [a_ptr2]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x4\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr s3, [a_ptr3]\n"
-                        ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "add a_ptr3, a_ptr3, #0x4\n"
-                        ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
-                        ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
-                        "b.ne 7b\n"
-                        "6:\n"
-                        "cbz %[odds], 8f\n"
-                        "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
-                        "ld1 {v1.b}[0], [a_ptr1], #1\n"
-                        "ld1 {v2.b}[0], [a_ptr2], #1\n"
-                        "ld1 {v3.b}[0], [a_ptr3], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
-                        "ld1 {v1.b}[1], [a_ptr1], #1\n"
-                        "ld1 {v2.b}[1], [a_ptr2], #1\n"
-                        "ld1 {v3.b}[1], [a_ptr3], #1\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "ld1 {v0.b}[2], [%[a_ptr0]]\n"
-                        "ld1 {v1.b}[2], [a_ptr1]\n"
-                        "ld1 {v2.b}[2], [a_ptr2]\n"
-                        "ld1 {v3.b}[2], [a_ptr3]\n"
-                        "9:\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
-                        ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
-                        ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
-                        ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
-                        "8:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "str q24, [c_ptr2]\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        "str q28, [c_ptr3]\n"
-                        "str q29, [c_ptr3, #0x10]\n"
-                        "str q30, [c_ptr3, #0x20]\n"
-                        "str q31, [c_ptr3, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
-                    );
-                    break;
-            }
-            if (use_result_buffer) {
-                for(int cy=0; cy<std::min(M-y, 4); cy++) {
-                    for(unsigned int cx=0; cx<width; cx++) {
-                        c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
-                    }
-                }
-            }
-        }
-    }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
new file mode 100644
index 0000000000..238c1825f3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+
+#define ARGLIST /* kernel parameter types, in order: num_strings, string_lengths, A_arg, M, N, B_ptr, output_arg, (unnamed) const uint32_t *, (unnamed) Activation, accumulate */ \
+   unsigned int, const unsigned int *, \
+   IndirectInputArg<uint8_t>, \
+   size_t, size_t, \
+   const uint8_t *, \
+   IndirectOutputArg<uint32_t>, \
+   const uint32_t *, Activation, bool
+
+namespace arm_gemm
+{
+
+// Kernel entry point (signature given by ARGLIST); defined in a64_hybrid_u8u32_dot_6x16/generic.cpp
+void a64_hybrid_u8u32_dot_6x16( ARGLIST );
+
+class cls_a64_hybrid_u8u32_dot_6x16 // Descriptor for the a64_hybrid_u8u32_dot_6x16 kernel: element types, blocking sizes, transforms and kernel pointer
+{
+public:
+    typedef uint8_t operand_type; // matches the uint8_t input types in ARGLIST
+    typedef uint32_t result_type; // matches IndirectOutputArg<uint32_t> in ARGLIST
+
+    typedef void (*kern_type)( ARGLIST ); // pointer to a function with the kernel signature
+
+    /* Kernel blocking parameters; the values match the <6, 16, 4> arguments given to StdTransformsFixed below */
+    static constexpr unsigned int out_height()
+    {
+        return 6;
+    }
+
+    static unsigned int out_width() // NOTE(review): not constexpr, unlike out_height()/k_unroll() - confirm whether intentional
+    {
+        return 16;
+    }
+
+    static constexpr unsigned int k_unroll()
+    {
+        return 4;
+    }
+
+    static constexpr bool supports_accumulate()
+    {
+        return true;
+    }
+
+    StdTransformsFixed<operand_type, result_type, 6, 16, 4> transforms = {};
+
+    // Kernel function pointer; initialised to a64_hybrid_u8u32_dot_6x16, the only implementation declared in this header
+    kern_type kernel=a64_hybrid_u8u32_dot_6x16;
+
+    cls_a64_hybrid_u8u32_dot_6x16(const CPUInfo *) // the CPUInfo argument is unnamed and unused; no per-CPU kernel selection happens here
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp
new file mode 100644
index 0000000000..3c8654147a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp
@@ -0,0 +1,3335 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_hybrid_u8u32_dot_6x16 (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+    size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint32_t> output_arg,
+    const uint32_t *, Activation, bool accumulate
+)
+{
+    struct KernelArgs {
+        unsigned int num_strings = {};
+        const unsigned int *string_lengths = {};
+        size_t N = {};
+        const uint8_t *B_ptr = {};
+        size_t output_offset = {};
+        size_t input_initial_col = {};
+        size_t input_offset = {};
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    if (accumulate) {
+        flags |= 0x1;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    __asm__ __volatile__(
+
+      "1:"  // Row loop
+      "cmp %x[M], #0x6\n"
+      "bge 176f\n"
+      "cmp %x[M], #0x4\n"
+      "bgt 141f\n"
+      "beq 106f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 71f\n"
+      "beq 36f\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 2f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "b 3f\n"
+      "2:"  // Height 1: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "3:"  // Height 1: Column loop
+      "tbz %x[flags], #0, 13f\n"
+      "cmp x15, #0x10\n"
+      "bge 12f\n"
+      "tbz x15, #3, 7f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v9.4s }, [x13], #0x10\n"
+      "tbz x15, #2, 5f\n"
+      "ld1 { v10.4s }, [x13], #0x10\n"
+      "tbz x15, #1, 4f\n"
+      "mov x19, #0x38\n"
+      "ldr d11, [x13], #0x8\n"
+      "tbz x15, #0, 11f\n"
+      "ld1 { v11.s }[2], [x13]\n"
+      "b 11f\n"
+      "4:"  // Height 1: Partial accumulate: partial_1_12
+      "mov x19, #0x30\n"
+      "tbz x15, #0, 11f\n"
+      "ldr s11, [x13, #0x0]\n"
+      "b 11f\n"
+      "5:"  // Height 1: Partial accumulate: partial_2_8
+      "tbz x15, #1, 6f\n"
+      "ldr d10, [x13], #0x8\n"
+      "mov x19, #0x28\n"
+      "tbz x15, #0, 11f\n"
+      "ld1 { v10.s }[2], [x13]\n"
+      "b 11f\n"
+      "6:"  // Height 1: Partial accumulate: partial_1_8
+      "mov x19, #0x20\n"
+      "tbz x15, #0, 11f\n"
+      "ldr s10, [x13, #0x0]\n"
+      "b 11f\n"
+      "7:"  // Height 1: Partial accumulate: partial_4_0
+      "tbz x15, #2, 9f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "tbz x15, #1, 8f\n"
+      "mov x19, #0x18\n"
+      "ldr d9, [x13], #0x8\n"
+      "tbz x15, #0, 11f\n"
+      "ld1 { v9.s }[2], [x13]\n"
+      "b 11f\n"
+      "8:"  // Height 1: Partial accumulate: partial_1_4
+      "mov x19, #0x10\n"
+      "tbz x15, #0, 11f\n"
+      "ldr s9, [x13, #0x0]\n"
+      "b 11f\n"
+      "9:"  // Height 1: Partial accumulate: partial_2_0
+      "tbz x15, #1, 10f\n"
+      "ldr d8, [x13], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x15, #0, 11f\n"
+      "ld1 { v8.s }[2], [x13]\n"
+      "b 11f\n"
+      "10:"  // Height 1: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s8, [x13, #0x0]\n"
+      "11:"  // Height 1: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "b 14f\n"
+      "12:"  // Height 1: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "b 14f\n"
+      "13:"  // Height 1: no accumulate
+      "movi v8.4s, #0x0\n"
+      "movi v9.4s, #0x0\n"
+      "movi v10.4s, #0x0\n"
+      "movi v11.4s, #0x0\n"
+      "14:"  // Height 1: setup done
+      "mov x12, #0x0\n"
+      "15:"  // Height 1: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 16f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "cbnz x12, 17f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "b 17f\n"
+      "16:"  // Height 1: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "17:"  // Height 1: input setup done
+      "cmp x11, #0x10\n"
+      "blt 20f\n"
+      "cmp x11, #0x20\n"
+      "blt 19f\n"
+      "18:"  // Height 1: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      "sub x11, x11, #0x10\n"
+      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      "cmp x11, #0x20\n"
+      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
+      "bge 18b\n"
+      "19:"  // Height 1: Multiply loop: Single iteration only
+      "sub x11, x11, #0x10\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
+      "20:"  // Height 1: Multiply loop: Main loop skip
+      "cbz x11, 25f\n"
+      "cmp x11, #0x4\n"
+      "blt 22f\n"
+      "21:"  // Height 1: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "sub x11, x11, #0x4\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      "cmp x11, #0x4\n"
+      "bge 21b\n"
+      "cbz x11, 25f\n"
+      "22:"  // Height 1: Multiply loop: Skip odd blocks
+      "tbz x11, #1, 23f\n"
+      "ldr h0, [x10], #0x2\n"
+      "tbz x11, #0, 24f\n"
+      "ld1 { v0.b }[2], [x10]\n"
+      "b 24f\n"
+      "23:"  // Height 1: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b0, [x10, #0x0]\n"
+      "24:"  // Height 1: Multiply loop: Ragged operand read: Done
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      "25:"  // Height 1: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 15b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "cmp x15, #0x10\n"
+      "bge 34f\n"
+      "tbz x15, #3, 29f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v9.4s }, [x13], #0x10\n"
+      "tbz x15, #2, 27f\n"
+      "st1 { v10.4s }, [x13], #0x10\n"
+      "tbz x15, #1, 26f\n"
+      "str d11, [x13], #0x8\n"
+      "tbz x15, #0, 33f\n"
+      "st1 { v11.s }[2], [x13]\n"
+      "b 33f\n"
+      "26:"  // Height 1: Partial direct writeback: partial_1_12
+      "tbz x15, #0, 33f\n"
+      "str s11, [x13, #0x0]\n"
+      "b 33f\n"
+      "27:"  // Height 1: Partial direct writeback: partial_2_8
+      "tbz x15, #1, 28f\n"
+      "str d10, [x13], #0x8\n"
+      "tbz x15, #0, 33f\n"
+      "st1 { v10.s }[2], [x13]\n"
+      "b 33f\n"
+      "28:"  // Height 1: Partial direct writeback: partial_1_8
+      "tbz x15, #0, 33f\n"
+      "str s10, [x13, #0x0]\n"
+      "b 33f\n"
+      "29:"  // Height 1: Partial direct writeback: partial_4_0
+      "tbz x15, #2, 31f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "tbz x15, #1, 30f\n"
+      "str d9, [x13], #0x8\n"
+      "tbz x15, #0, 33f\n"
+      "st1 { v9.s }[2], [x13]\n"
+      "b 33f\n"
+      "30:"  // Height 1: Partial direct writeback: partial_1_4
+      "tbz x15, #0, 33f\n"
+      "str s9, [x13, #0x0]\n"
+      "b 33f\n"
+      "31:"  // Height 1: Partial direct writeback: partial_2_0
+      "tbz x15, #1, 32f\n"
+      "str d8, [x13], #0x8\n"
+      "tbz x15, #0, 33f\n"
+      "st1 { v8.s }[2], [x13]\n"
+      "b 33f\n"
+      "32:"  // Height 1: Partial direct writeback: partial_1_0
+      "str s8, [x13, #0x0]\n"
+      "33:"  // Height 1: Partial direct writeback: Done
+      "b 35f\n"
+      "34:"  // Height 1: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "35:"  // Height 1: Writeback done
+      "subs x15, x15, #0x10\n"
+      "bgt 3b\n"
+      "b 212f\n"
+      "36:"  // Height 2
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 37f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "b 38f\n"
+      "37:"  // Height 2: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "38:"  // Height 2: Column loop
+      "tbz %x[flags], #0, 48f\n"
+      "cmp x15, #0x10\n"
+      "bge 47f\n"
+      "tbz x15, #3, 42f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v9.4s }, [x13], #0x10\n"
+      "ld1 { v13.4s }, [x9], #0x10\n"
+      "tbz x15, #2, 40f\n"
+      "ld1 { v10.4s }, [x13], #0x10\n"
+      "ld1 { v14.4s }, [x9], #0x10\n"
+      "tbz x15, #1, 39f\n"
+      "mov x19, #0x38\n"
+      "ldr d11, [x13], #0x8\n"
+      "ldr d15, [x9], #0x8\n"
+      "tbz x15, #0, 46f\n"
+      "ld1 { v11.s }[2], [x13]\n"
+      "ld1 { v15.s }[2], [x9]\n"
+      "b 46f\n"
+      "39:"  // Height 2: Partial accumulate: partial_1_12
+      "mov x19, #0x30\n"
+      "tbz x15, #0, 46f\n"
+      "ldr s11, [x13, #0x0]\n"
+      "ldr s15, [x9, #0x0]\n"
+      "b 46f\n"
+      "40:"  // Height 2: Partial accumulate: partial_2_8
+      "tbz x15, #1, 41f\n"
+      "ldr d10, [x13], #0x8\n"
+      "ldr d14, [x9], #0x8\n"
+      "mov x19, #0x28\n"
+      "tbz x15, #0, 46f\n"
+      "ld1 { v10.s }[2], [x13]\n"
+      "ld1 { v14.s }[2], [x9]\n"
+      "b 46f\n"
+      "41:"  // Height 2: Partial accumulate: partial_1_8
+      "mov x19, #0x20\n"
+      "tbz x15, #0, 46f\n"
+      "ldr s10, [x13, #0x0]\n"
+      "ldr s14, [x9, #0x0]\n"
+      "b 46f\n"
+      "42:"  // Height 2: Partial accumulate: partial_4_0
+      "tbz x15, #2, 44f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "tbz x15, #1, 43f\n"
+      "mov x19, #0x18\n"
+      "ldr d9, [x13], #0x8\n"
+      "ldr d13, [x9], #0x8\n"
+      "tbz x15, #0, 46f\n"
+      "ld1 { v9.s }[2], [x13]\n"
+      "ld1 { v13.s }[2], [x9]\n"
+      "b 46f\n"
+      "43:"  // Height 2: Partial accumulate: partial_1_4
+      "mov x19, #0x10\n"
+      "tbz x15, #0, 46f\n"
+      "ldr s9, [x13, #0x0]\n"
+      "ldr s13, [x9, #0x0]\n"
+      "b 46f\n"
+      "44:"  // Height 2: Partial accumulate: partial_2_0
+      "tbz x15, #1, 45f\n"
+      "ldr d8, [x13], #0x8\n"
+      "ldr d12, [x9], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x15, #0, 46f\n"
+      "ld1 { v8.s }[2], [x13]\n"
+      "ld1 { v12.s }[2], [x9]\n"
+      "b 46f\n"
+      "45:"  // Height 2: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s8, [x13, #0x0]\n"
+      "ldr s12, [x9, #0x0]\n"
+      "46:"  // Height 2: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "sub x9, x9, x19\n"
+      "b 49f\n"
+      "47:"  // Height 2: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "ldr q12, [x9, #0x0]\n"
+      "ldr q13, [x9, #0x10]\n"
+      "ldr q14, [x9, #0x20]\n"
+      "ldr q15, [x9, #0x30]\n"
+      "b 49f\n"
+      "48:"  // Height 2: no accumulate
+      "movi v8.4s, #0x0\n"
+      "movi v9.4s, #0x0\n"
+      "movi v10.4s, #0x0\n"
+      "movi v11.4s, #0x0\n"
+      "movi v12.4s, #0x0\n"
+      "movi v13.4s, #0x0\n"
+      "movi v14.4s, #0x0\n"
+      "movi v15.4s, #0x0\n"
+      "49:"  // Height 2: setup done
+      "mov x12, #0x0\n"
+      "50:"  // Height 2: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 51f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "cbnz x12, 52f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "b 52f\n"
+      "51:"  // Height 2: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "52:"  // Height 2: input setup done
+      "cmp x11, #0x10\n"
+      "blt 55f\n"
+      "cmp x11, #0x20\n"
+      "blt 54f\n"
+      "53:"  // Height 2: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "sub x11, x11, #0x10\n"
+      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      "cmp x11, #0x20\n"
+      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0cc  // udot v12.4s, v6.16b, v1.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ed  // udot v13.4s, v7.16b, v1.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ce  // udot v14.4s, v6.16b, v1.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ef  // udot v15.4s, v7.16b, v1.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8cc  // udot v12.4s, v6.16b, v1.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ed  // udot v13.4s, v7.16b, v1.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ce  // udot v14.4s, v6.16b, v1.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ef  // udot v15.4s, v7.16b, v1.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8cc  // udot v12.4s, v6.16b, v1.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8ed  // udot v13.4s, v7.16b, v1.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8ce  // udot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8ef  // udot v15.4s, v7.16b, v1.4b[3]\n"
+      "bge 53b\n"
+      "54:"  // Height 2: Multiply loop: Single iteration only
+      "sub x11, x11, #0x10\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0cc  // udot v12.4s, v6.16b, v1.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ed  // udot v13.4s, v7.16b, v1.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ce  // udot v14.4s, v6.16b, v1.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ef  // udot v15.4s, v7.16b, v1.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8cc  // udot v12.4s, v6.16b, v1.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ed  // udot v13.4s, v7.16b, v1.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ce  // udot v14.4s, v6.16b, v1.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ef  // udot v15.4s, v7.16b, v1.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8cc  // udot v12.4s, v6.16b, v1.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8ed  // udot v13.4s, v7.16b, v1.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8ce  // udot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8ef  // udot v15.4s, v7.16b, v1.4b[3]\n"
+      "55:"  // Height 2: Multiply loop: Main loop skip
+      "cbz x11, 60f\n"
+      "cmp x11, #0x4\n"
+      "blt 57f\n"
+      "56:"  // Height 2: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      "sub x11, x11, #0x4\n"
+      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
+      "cmp x11, #0x4\n"
+      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
+      "bge 56b\n"
+      "cbz x11, 60f\n"
+      "57:"  // Height 2: Multiply loop: Skip odd blocks
+      "tbz x11, #1, 58f\n"
+      "ldr h0, [x10], #0x2\n"
+      "ldr h1, [x28], #0x2\n"
+      "tbz x11, #0, 59f\n"
+      "ld1 { v0.b }[2], [x10]\n"
+      "ld1 { v1.b }[2], [x28]\n"
+      "b 59f\n"
+      "58:"  // Height 2: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b0, [x10, #0x0]\n"
+      "ldr b1, [x28, #0x0]\n"
+      "59:"  // Height 2: Multiply loop: Ragged operand read: Done
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
+      "60:"  // Height 2: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 50b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "cmp x15, #0x10\n"
+      "bge 69f\n"
+      "tbz x15, #3, 64f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v9.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v13.4s }, [x9], #0x10\n"
+      "tbz x15, #2, 62f\n"
+      "st1 { v10.4s }, [x13], #0x10\n"
+      "st1 { v14.4s }, [x9], #0x10\n"
+      "tbz x15, #1, 61f\n"
+      "str d11, [x13], #0x8\n"
+      "str d15, [x9], #0x8\n"
+      "tbz x15, #0, 68f\n"
+      "st1 { v11.s }[2], [x13]\n"
+      "st1 { v15.s }[2], [x9]\n"
+      "b 68f\n"
+      "61:"  // Height 2: Partial direct writeback: partial_1_12
+      "tbz x15, #0, 68f\n"
+      "str s11, [x13, #0x0]\n"
+      "str s15, [x9, #0x0]\n"
+      "b 68f\n"
+      "62:"  // Height 2: Partial direct writeback: partial_2_8
+      "tbz x15, #1, 63f\n"
+      "str d10, [x13], #0x8\n"
+      "str d14, [x9], #0x8\n"
+      "tbz x15, #0, 68f\n"
+      "st1 { v10.s }[2], [x13]\n"
+      "st1 { v14.s }[2], [x9]\n"
+      "b 68f\n"
+      "63:"  // Height 2: Partial direct writeback: partial_1_8
+      "tbz x15, #0, 68f\n"
+      "str s10, [x13, #0x0]\n"
+      "str s14, [x9, #0x0]\n"
+      "b 68f\n"
+      "64:"  // Height 2: Partial direct writeback: partial_4_0
+      "tbz x15, #2, 66f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "tbz x15, #1, 65f\n"
+      "str d9, [x13], #0x8\n"
+      "str d13, [x9], #0x8\n"
+      "tbz x15, #0, 68f\n"
+      "st1 { v9.s }[2], [x13]\n"
+      "st1 { v13.s }[2], [x9]\n"
+      "b 68f\n"
+      "65:"  // Height 2: Partial direct writeback: partial_1_4
+      "tbz x15, #0, 68f\n"
+      "str s9, [x13, #0x0]\n"
+      "str s13, [x9, #0x0]\n"
+      "b 68f\n"
+      "66:"  // Height 2: Partial direct writeback: partial_2_0
+      "tbz x15, #1, 67f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "tbz x15, #0, 68f\n"
+      "st1 { v8.s }[2], [x13]\n"
+      "st1 { v12.s }[2], [x9]\n"
+      "b 68f\n"
+      "67:"  // Height 2: Partial direct writeback: partial_1_0
+      "str s8, [x13, #0x0]\n"
+      "str s12, [x9, #0x0]\n"
+      "68:"  // Height 2: Partial direct writeback: Done
+      "b 70f\n"
+      "69:"  // Height 2: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "str q12, [x9, #0x0]\n"
+      "str q13, [x9, #0x10]\n"
+      "str q14, [x9, #0x20]\n"
+      "str q15, [x9, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "add x9, x9, #0x40\n"
+      "70:"  // Height 2: Writeback done
+      "subs x15, x15, #0x10\n"
+      "bgt 38b\n"
+      "b 212f\n"
+      "71:"  // Height 3
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 72f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "add x27, x27, x19, LSL #2\n"
+      "b 73f\n"
+      "72:"  // Height 3: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "73:"  // Height 3: Column loop
+      "tbz %x[flags], #0, 83f\n"
+      "cmp x15, #0x10\n"
+      "bge 82f\n"
+      "tbz x15, #3, 77f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "ld1 { v9.4s }, [x13], #0x10\n"
+      "ld1 { v13.4s }, [x9], #0x10\n"
+      "ld1 { v17.4s }, [x27], #0x10\n"
+      "tbz x15, #2, 75f\n"
+      "ld1 { v10.4s }, [x13], #0x10\n"
+      "ld1 { v14.4s }, [x9], #0x10\n"
+      "ld1 { v18.4s }, [x27], #0x10\n"
+      "tbz x15, #1, 74f\n"
+      "mov x19, #0x38\n"
+      "ldr d11, [x13], #0x8\n"
+      "ldr d15, [x9], #0x8\n"
+      "ldr d19, [x27], #0x8\n"
+      "tbz x15, #0, 81f\n"
+      "ld1 { v11.s }[2], [x13]\n"
+      "ld1 { v15.s }[2], [x9]\n"
+      "ld1 { v19.s }[2], [x27]\n"
+      "b 81f\n"
+      "74:"  // Height 3: Partial accumulate: partial_1_12
+      "mov x19, #0x30\n"
+      "tbz x15, #0, 81f\n"
+      "ldr s11, [x13, #0x0]\n"
+      "ldr s15, [x9, #0x0]\n"
+      "ldr s19, [x27, #0x0]\n"
+      "b 81f\n"
+      "75:"  // Height 3: Partial accumulate: partial_2_8
+      "tbz x15, #1, 76f\n"
+      "ldr d10, [x13], #0x8\n"
+      "ldr d14, [x9], #0x8\n"
+      "ldr d18, [x27], #0x8\n"
+      "mov x19, #0x28\n"
+      "tbz x15, #0, 81f\n"
+      "ld1 { v10.s }[2], [x13]\n"
+      "ld1 { v14.s }[2], [x9]\n"
+      "ld1 { v18.s }[2], [x27]\n"
+      "b 81f\n"
+      "76:"  // Height 3: Partial accumulate: partial_1_8
+      "mov x19, #0x20\n"
+      "tbz x15, #0, 81f\n"
+      "ldr s10, [x13, #0x0]\n"
+      "ldr s14, [x9, #0x0]\n"
+      "ldr s18, [x27, #0x0]\n"
+      "b 81f\n"
+      "77:"  // Height 3: Partial accumulate: partial_4_0
+      "tbz x15, #2, 79f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "tbz x15, #1, 78f\n"
+      "mov x19, #0x18\n"
+      "ldr d9, [x13], #0x8\n"
+      "ldr d13, [x9], #0x8\n"
+      "ldr d17, [x27], #0x8\n"
+      "tbz x15, #0, 81f\n"
+      "ld1 { v9.s }[2], [x13]\n"
+      "ld1 { v13.s }[2], [x9]\n"
+      "ld1 { v17.s }[2], [x27]\n"
+      "b 81f\n"
+      "78:"  // Height 3: Partial accumulate: partial_1_4
+      "mov x19, #0x10\n"
+      "tbz x15, #0, 81f\n"
+      "ldr s9, [x13, #0x0]\n"
+      "ldr s13, [x9, #0x0]\n"
+      "ldr s17, [x27, #0x0]\n"
+      "b 81f\n"
+      "79:"  // Height 3: Partial accumulate: partial_2_0
+      "tbz x15, #1, 80f\n"
+      "ldr d8, [x13], #0x8\n"
+      "ldr d12, [x9], #0x8\n"
+      "ldr d16, [x27], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x15, #0, 81f\n"
+      "ld1 { v8.s }[2], [x13]\n"
+      "ld1 { v12.s }[2], [x9]\n"
+      "ld1 { v16.s }[2], [x27]\n"
+      "b 81f\n"
+      "80:"  // Height 3: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s8, [x13, #0x0]\n"
+      "ldr s12, [x9, #0x0]\n"
+      "ldr s16, [x27, #0x0]\n"
+      "81:"  // Height 3: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "sub x9, x9, x19\n"
+      "sub x27, x27, x19\n"
+      "b 84f\n"
+      "82:"  // Height 3: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "ldr q12, [x9, #0x0]\n"
+      "ldr q13, [x9, #0x10]\n"
+      "ldr q14, [x9, #0x20]\n"
+      "ldr q15, [x9, #0x30]\n"
+      "ldr q16, [x27, #0x0]\n"
+      "ldr q17, [x27, #0x10]\n"
+      "ldr q18, [x27, #0x20]\n"
+      "ldr q19, [x27, #0x30]\n"
+      "b 84f\n"
+      "83:"  // Height 3: no accumulate
+      "movi v8.4s, #0x0\n"
+      "movi v9.4s, #0x0\n"
+      "movi v10.4s, #0x0\n"
+      "movi v11.4s, #0x0\n"
+      "movi v12.4s, #0x0\n"
+      "movi v13.4s, #0x0\n"
+      "movi v14.4s, #0x0\n"
+      "movi v15.4s, #0x0\n"
+      "movi v16.4s, #0x0\n"
+      "movi v17.4s, #0x0\n"
+      "movi v18.4s, #0x0\n"
+      "movi v19.4s, #0x0\n"
+      "84:"  // Height 3: setup done
+      "mov x12, #0x0\n"
+      "85:"  // Height 3: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 86f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "cbnz x12, 87f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "add x26, x26, x19\n"
+      "b 87f\n"
+      "86:"  // Height 3: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "add x26, x28, x19\n"
+      "87:"  // Height 3: input setup done
+      "cmp x11, #0x10\n"
+      "blt 90f\n"
+      "cmp x11, #0x20\n"
+      "blt 89f\n"
+      "88:"  // Height 3: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
+      "add x28, x28, #0x10\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "sub x11, x11, #0x10\n"
+      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
+      "cmp x11, #0x20\n"
+      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0cc  // udot v12.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0d0  // udot v16.4s, v6.16b, v2.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ed  // udot v13.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0f1  // udot v17.4s, v7.16b, v2.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ce  // udot v14.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0d2  // udot v18.4s, v6.16b, v2.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ef  // udot v15.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0f3  // udot v19.4s, v7.16b, v2.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8cc  // udot v12.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8d0  // udot v16.4s, v6.16b, v2.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ed  // udot v13.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8f1  // udot v17.4s, v7.16b, v2.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ce  // udot v14.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8d2  // udot v18.4s, v6.16b, v2.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ef  // udot v15.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8f3  // udot v19.4s, v7.16b, v2.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8cc  // udot v12.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8d0  // udot v16.4s, v6.16b, v2.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8ed  // udot v13.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8f1  // udot v17.4s, v7.16b, v2.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8ce  // udot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8d2  // udot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8ef  // udot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8f3  // udot v19.4s, v7.16b, v2.4b[3]\n"
+      "bge 88b\n"
+      "89:"  // Height 3: Multiply loop: Single iteration only
+      "sub x11, x11, #0x10\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
+      "add x28, x28, #0x10\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0cc  // udot v12.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0d0  // udot v16.4s, v6.16b, v2.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ed  // udot v13.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0f1  // udot v17.4s, v7.16b, v2.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ce  // udot v14.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0d2  // udot v18.4s, v6.16b, v2.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ef  // udot v15.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0f3  // udot v19.4s, v7.16b, v2.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8cc  // udot v12.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8d0  // udot v16.4s, v6.16b, v2.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ed  // udot v13.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8f1  // udot v17.4s, v7.16b, v2.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ce  // udot v14.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8d2  // udot v18.4s, v6.16b, v2.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ef  // udot v15.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8f3  // udot v19.4s, v7.16b, v2.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8cc  // udot v12.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8d0  // udot v16.4s, v6.16b, v2.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8ed  // udot v13.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8f1  // udot v17.4s, v7.16b, v2.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8ce  // udot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8d2  // udot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8ef  // udot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8f3  // udot v19.4s, v7.16b, v2.4b[3]\n"
+      "90:"  // Height 3: Multiply loop: Main loop skip
+      "cbz x11, 95f\n"
+      "cmp x11, #0x4\n"
+      "blt 92f\n"
+      "91:"  // Height 3: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
+      "sub x11, x11, #0x4\n"
+      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      "cmp x11, #0x4\n"
+      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
+      "bge 91b\n"
+      "cbz x11, 95f\n"
+      "92:"  // Height 3: Multiply loop: Skip odd blocks
+      "tbz x11, #1, 93f\n"
+      "ldr h0, [x10], #0x2\n"
+      "ldr h1, [x28], #0x2\n"
+      "ldr h2, [x26], #0x2\n"
+      "tbz x11, #0, 94f\n"
+      "ld1 { v0.b }[2], [x10]\n"
+      "ld1 { v1.b }[2], [x28]\n"
+      "ld1 { v2.b }[2], [x26]\n"
+      "b 94f\n"
+      "93:"  // Height 3: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b0, [x10, #0x0]\n"
+      "ldr b1, [x28, #0x0]\n"
+      "ldr b2, [x26, #0x0]\n"
+      "94:"  // Height 3: Multiply loop: Ragged operand read: Done
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
+      "95:"  // Height 3: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 85b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "cmp x15, #0x10\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "bge 104f\n"
+      "tbz x15, #3, 99f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v9.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v13.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "st1 { v17.4s }, [x27], #0x10\n"
+      "tbz x15, #2, 97f\n"
+      "st1 { v10.4s }, [x13], #0x10\n"
+      "st1 { v14.4s }, [x9], #0x10\n"
+      "st1 { v18.4s }, [x27], #0x10\n"
+      "tbz x15, #1, 96f\n"
+      "str d11, [x13], #0x8\n"
+      "str d15, [x9], #0x8\n"
+      "str d19, [x27], #0x8\n"
+      "tbz x15, #0, 103f\n"
+      "st1 { v11.s }[2], [x13]\n"
+      "st1 { v15.s }[2], [x9]\n"
+      "st1 { v19.s }[2], [x27]\n"
+      "b 103f\n"
+      "96:"  // Height 3: Partial direct writeback: partial_1_12
+      "tbz x15, #0, 103f\n"
+      "str s11, [x13, #0x0]\n"
+      "str s15, [x9, #0x0]\n"
+      "str s19, [x27, #0x0]\n"
+      "b 103f\n"
+      "97:"  // Height 3: Partial direct writeback: partial_2_8
+      "tbz x15, #1, 98f\n"
+      "str d10, [x13], #0x8\n"
+      "str d14, [x9], #0x8\n"
+      "str d18, [x27], #0x8\n"
+      "tbz x15, #0, 103f\n"
+      "st1 { v10.s }[2], [x13]\n"
+      "st1 { v14.s }[2], [x9]\n"
+      "st1 { v18.s }[2], [x27]\n"
+      "b 103f\n"
+      "98:"  // Height 3: Partial direct writeback: partial_1_8
+      "tbz x15, #0, 103f\n"
+      "str s10, [x13, #0x0]\n"
+      "str s14, [x9, #0x0]\n"
+      "str s18, [x27, #0x0]\n"
+      "b 103f\n"
+      "99:"  // Height 3: Partial direct writeback: partial_4_0
+      "tbz x15, #2, 101f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "tbz x15, #1, 100f\n"
+      "str d9, [x13], #0x8\n"
+      "str d13, [x9], #0x8\n"
+      "str d17, [x27], #0x8\n"
+      "tbz x15, #0, 103f\n"
+      "st1 { v9.s }[2], [x13]\n"
+      "st1 { v13.s }[2], [x9]\n"
+      "st1 { v17.s }[2], [x27]\n"
+      "b 103f\n"
+      "100:"  // Height 3: Partial direct writeback: partial_1_4
+      "tbz x15, #0, 103f\n"
+      "str s9, [x13, #0x0]\n"
+      "str s13, [x9, #0x0]\n"
+      "str s17, [x27, #0x0]\n"
+      "b 103f\n"
+      "101:"  // Height 3: Partial direct writeback: partial_2_0
+      "tbz x15, #1, 102f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "str d16, [x27], #0x8\n"
+      "tbz x15, #0, 103f\n"
+      "st1 { v8.s }[2], [x13]\n"
+      "st1 { v12.s }[2], [x9]\n"
+      "st1 { v16.s }[2], [x27]\n"
+      "b 103f\n"
+      "102:"  // Height 3: Partial direct writeback: partial_1_0
+      "str s8, [x13, #0x0]\n"
+      "str s12, [x9, #0x0]\n"
+      "str s16, [x27, #0x0]\n"
+      "103:"  // Height 3: Partial direct writeback: Done
+      "b 105f\n"
+      "104:"  // Height 3: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "str q12, [x9, #0x0]\n"
+      "str q13, [x9, #0x10]\n"
+      "str q14, [x9, #0x20]\n"
+      "str q15, [x9, #0x30]\n"
+      "str q16, [x27, #0x0]\n"
+      "str q17, [x27, #0x10]\n"
+      "str q18, [x27, #0x20]\n"
+      "str q19, [x27, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "add x9, x9, #0x40\n"
+      "add x27, x27, #0x40\n"
+      "105:"  // Height 3: Writeback done
+      "subs x15, x15, #0x10\n"
+      "bgt 73b\n"
+      "b 212f\n"
+      "106:"  // Height 4
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 107f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "add x27, x27, x19, LSL #2\n"
+      "add x25, x25, x19, LSL #2\n"
+      "b 108f\n"
+      "107:"  // Height 4: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "108:"  // Height 4: Column loop
+      "tbz %x[flags], #0, 118f\n"
+      "cmp x15, #0x10\n"
+      "bge 117f\n"
+      "tbz x15, #3, 112f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "ld1 { v20.4s }, [x25], #0x10\n"
+      "ld1 { v9.4s }, [x13], #0x10\n"
+      "ld1 { v13.4s }, [x9], #0x10\n"
+      "ld1 { v17.4s }, [x27], #0x10\n"
+      "ld1 { v21.4s }, [x25], #0x10\n"
+      "tbz x15, #2, 110f\n"
+      "ld1 { v10.4s }, [x13], #0x10\n"
+      "ld1 { v14.4s }, [x9], #0x10\n"
+      "ld1 { v18.4s }, [x27], #0x10\n"
+      "ld1 { v22.4s }, [x25], #0x10\n"
+      "tbz x15, #1, 109f\n"
+      "mov x19, #0x38\n"
+      "ldr d11, [x13], #0x8\n"
+      "ldr d15, [x9], #0x8\n"
+      "ldr d19, [x27], #0x8\n"
+      "ldr d23, [x25], #0x8\n"
+      "tbz x15, #0, 116f\n"
+      "ld1 { v11.s }[2], [x13]\n"
+      "ld1 { v15.s }[2], [x9]\n"
+      "ld1 { v19.s }[2], [x27]\n"
+      "ld1 { v23.s }[2], [x25]\n"
+      "b 116f\n"
+      "109:"  // Height 4: Partial accumulate: partial_1_12
+      "mov x19, #0x30\n"
+      "tbz x15, #0, 116f\n"
+      "ldr s11, [x13, #0x0]\n"
+      "ldr s15, [x9, #0x0]\n"
+      "ldr s19, [x27, #0x0]\n"
+      "ldr s23, [x25, #0x0]\n"
+      "b 116f\n"
+      "110:"  // Height 4: Partial accumulate: partial_2_8
+      "tbz x15, #1, 111f\n"
+      "ldr d10, [x13], #0x8\n"
+      "ldr d14, [x9], #0x8\n"
+      "ldr d18, [x27], #0x8\n"
+      "ldr d22, [x25], #0x8\n"
+      "mov x19, #0x28\n"
+      "tbz x15, #0, 116f\n"
+      "ld1 { v10.s }[2], [x13]\n"
+      "ld1 { v14.s }[2], [x9]\n"
+      "ld1 { v18.s }[2], [x27]\n"
+      "ld1 { v22.s }[2], [x25]\n"
+      "b 116f\n"
+      "111:"  // Height 4: Partial accumulate: partial_1_8
+      "mov x19, #0x20\n"
+      "tbz x15, #0, 116f\n"
+      "ldr s10, [x13, #0x0]\n"
+      "ldr s14, [x9, #0x0]\n"
+      "ldr s18, [x27, #0x0]\n"
+      "ldr s22, [x25, #0x0]\n"
+      "b 116f\n"
+      "112:"  // Height 4: Partial accumulate: partial_4_0
+      "tbz x15, #2, 114f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "ld1 { v20.4s }, [x25], #0x10\n"
+      "tbz x15, #1, 113f\n"
+      "mov x19, #0x18\n"
+      "ldr d9, [x13], #0x8\n"
+      "ldr d13, [x9], #0x8\n"
+      "ldr d17, [x27], #0x8\n"
+      "ldr d21, [x25], #0x8\n"
+      "tbz x15, #0, 116f\n"
+      "ld1 { v9.s }[2], [x13]\n"
+      "ld1 { v13.s }[2], [x9]\n"
+      "ld1 { v17.s }[2], [x27]\n"
+      "ld1 { v21.s }[2], [x25]\n"
+      "b 116f\n"
+      "113:"  // Height 4: Partial accumulate: partial_1_4
+      "mov x19, #0x10\n"
+      "tbz x15, #0, 116f\n"
+      "ldr s9, [x13, #0x0]\n"
+      "ldr s13, [x9, #0x0]\n"
+      "ldr s17, [x27, #0x0]\n"
+      "ldr s21, [x25, #0x0]\n"
+      "b 116f\n"
+      "114:"  // Height 4: Partial accumulate: partial_2_0
+      "tbz x15, #1, 115f\n"
+      "ldr d8, [x13], #0x8\n"
+      "ldr d12, [x9], #0x8\n"
+      "ldr d16, [x27], #0x8\n"
+      "ldr d20, [x25], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x15, #0, 116f\n"
+      "ld1 { v8.s }[2], [x13]\n"
+      "ld1 { v12.s }[2], [x9]\n"
+      "ld1 { v16.s }[2], [x27]\n"
+      "ld1 { v20.s }[2], [x25]\n"
+      "b 116f\n"
+      "115:"  // Height 4: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s8, [x13, #0x0]\n"
+      "ldr s12, [x9, #0x0]\n"
+      "ldr s16, [x27, #0x0]\n"
+      "ldr s20, [x25, #0x0]\n"
+      "116:"  // Height 4: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "sub x9, x9, x19\n"
+      "sub x27, x27, x19\n"
+      "sub x25, x25, x19\n"
+      "b 119f\n"
+      "117:"  // Height 4: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "ldr q12, [x9, #0x0]\n"
+      "ldr q13, [x9, #0x10]\n"
+      "ldr q14, [x9, #0x20]\n"
+      "ldr q15, [x9, #0x30]\n"
+      "ldr q16, [x27, #0x0]\n"
+      "ldr q17, [x27, #0x10]\n"
+      "ldr q18, [x27, #0x20]\n"
+      "ldr q19, [x27, #0x30]\n"
+      "ldr q20, [x25, #0x0]\n"
+      "ldr q21, [x25, #0x10]\n"
+      "ldr q22, [x25, #0x20]\n"
+      "ldr q23, [x25, #0x30]\n"
+      "b 119f\n"
+      "118:"  // Height 4: no accumulate
+      "movi v8.4s, #0x0\n"
+      "movi v9.4s, #0x0\n"
+      "movi v10.4s, #0x0\n"
+      "movi v11.4s, #0x0\n"
+      "movi v12.4s, #0x0\n"
+      "movi v13.4s, #0x0\n"
+      "movi v14.4s, #0x0\n"
+      "movi v15.4s, #0x0\n"
+      "movi v16.4s, #0x0\n"
+      "movi v17.4s, #0x0\n"
+      "movi v18.4s, #0x0\n"
+      "movi v19.4s, #0x0\n"
+      "movi v20.4s, #0x0\n"
+      "movi v21.4s, #0x0\n"
+      "movi v22.4s, #0x0\n"
+      "movi v23.4s, #0x0\n"
+      "119:"  // Height 4: setup done
+      "mov x12, #0x0\n"
+      "120:"  // Height 4: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 121f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "cbnz x12, 122f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "b 122f\n"
+      "121:"  // Height 4: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "add x26, x28, x19\n"
+      "add x24, x26, x19\n"
+      "122:"  // Height 4: input setup done
+      "cmp x11, #0x10\n"
+      "blt 125f\n"
+      "cmp x11, #0x20\n"
+      "blt 124f\n"
+      "123:"  // Height 4: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
+      "add x26, x26, #0x10\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "sub x11, x11, #0x10\n"
+      ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "cmp x11, #0x20\n"
+      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0cc  // udot v12.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0d0  // udot v16.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e0d4  // udot v20.4s, v6.16b, v3.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ed  // udot v13.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0f1  // udot v17.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e0f5  // udot v21.4s, v7.16b, v3.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ce  // udot v14.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0d2  // udot v18.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e0d6  // udot v22.4s, v6.16b, v3.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ef  // udot v15.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0f3  // udot v19.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e0f7  // udot v23.4s, v7.16b, v3.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8cc  // udot v12.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8d0  // udot v16.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x6f83e8d4  // udot v20.4s, v6.16b, v3.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ed  // udot v13.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8f1  // udot v17.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x6f83e8f5  // udot v21.4s, v7.16b, v3.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ce  // udot v14.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8d2  // udot v18.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x6f83e8d6  // udot v22.4s, v6.16b, v3.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ef  // udot v15.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8f3  // udot v19.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x6f83e8f7  // udot v23.4s, v7.16b, v3.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8cc  // udot v12.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8d0  // udot v16.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e8d4  // udot v20.4s, v6.16b, v3.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8ed  // udot v13.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8f1  // udot v17.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e8f5  // udot v21.4s, v7.16b, v3.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8ce  // udot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8d2  // udot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e8d6  // udot v22.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8ef  // udot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8f3  // udot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e8f7  // udot v23.4s, v7.16b, v3.4b[3]\n"
+      "bge 123b\n"
+      "124:"  // Height 4: Multiply loop: Single iteration only
+      "sub x11, x11, #0x10\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
+      "add x26, x26, #0x10\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0cc  // udot v12.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0d0  // udot v16.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e0d4  // udot v20.4s, v6.16b, v3.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ed  // udot v13.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0f1  // udot v17.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e0f5  // udot v21.4s, v7.16b, v3.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ce  // udot v14.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0d2  // udot v18.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e0d6  // udot v22.4s, v6.16b, v3.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ef  // udot v15.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0f3  // udot v19.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e0f7  // udot v23.4s, v7.16b, v3.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8cc  // udot v12.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8d0  // udot v16.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x6f83e8d4  // udot v20.4s, v6.16b, v3.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ed  // udot v13.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8f1  // udot v17.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x6f83e8f5  // udot v21.4s, v7.16b, v3.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ce  // udot v14.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8d2  // udot v18.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x6f83e8d6  // udot v22.4s, v6.16b, v3.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ef  // udot v15.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8f3  // udot v19.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x6f83e8f7  // udot v23.4s, v7.16b, v3.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8cc  // udot v12.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8d0  // udot v16.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e8d4  // udot v20.4s, v6.16b, v3.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8ed  // udot v13.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8f1  // udot v17.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e8f5  // udot v21.4s, v7.16b, v3.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8ce  // udot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8d2  // udot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e8d6  // udot v22.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8ef  // udot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8f3  // udot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e8f7  // udot v23.4s, v7.16b, v3.4b[3]\n"
+      "125:"  // Height 4: Multiply loop: Main loop skip
+      "cbz x11, 130f\n"
+      "cmp x11, #0x4\n"
+      "blt 127f\n"
+      "126:"  // Height 4: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
+      "ldr s3, [x24], #0x4\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
+      "sub x11, x11, #0x4\n"
+      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
+      "cmp x11, #0x4\n"
+      ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
+      "bge 126b\n"
+      "cbz x11, 130f\n"
+      "127:"  // Height 4: Multiply loop: Skip odd blocks
+      "tbz x11, #1, 128f\n"
+      "ldr h0, [x10], #0x2\n"
+      "ldr h1, [x28], #0x2\n"
+      "ldr h2, [x26], #0x2\n"
+      "ldr h3, [x24], #0x2\n"
+      "tbz x11, #0, 129f\n"
+      "ld1 { v0.b }[2], [x10]\n"
+      "ld1 { v1.b }[2], [x28]\n"
+      "ld1 { v2.b }[2], [x26]\n"
+      "ld1 { v3.b }[2], [x24]\n"
+      "b 129f\n"
+      "128:"  // Height 4: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b0, [x10, #0x0]\n"
+      "ldr b1, [x28, #0x0]\n"
+      "ldr b2, [x26, #0x0]\n"
+      "ldr b3, [x24, #0x0]\n"
+      "129:"  // Height 4: Multiply loop: Ragged operand read: Done
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
+      "130:"  // Height 4: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 120b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "cmp x15, #0x10\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "bge 139f\n"
+      "tbz x15, #3, 134f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v9.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v13.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "st1 { v17.4s }, [x27], #0x10\n"
+      "st1 { v20.4s }, [x25], #0x10\n"
+      "st1 { v21.4s }, [x25], #0x10\n"
+      "tbz x15, #2, 132f\n"
+      "st1 { v10.4s }, [x13], #0x10\n"
+      "st1 { v14.4s }, [x9], #0x10\n"
+      "st1 { v18.4s }, [x27], #0x10\n"
+      "st1 { v22.4s }, [x25], #0x10\n"
+      "tbz x15, #1, 131f\n"
+      "str d11, [x13], #0x8\n"
+      "str d15, [x9], #0x8\n"
+      "str d19, [x27], #0x8\n"
+      "str d23, [x25], #0x8\n"
+      "tbz x15, #0, 138f\n"
+      "st1 { v11.s }[2], [x13]\n"
+      "st1 { v15.s }[2], [x9]\n"
+      "st1 { v19.s }[2], [x27]\n"
+      "st1 { v23.s }[2], [x25]\n"
+      "b 138f\n"
+      "131:"  // Height 4: Partial direct writeback: partial_1_12
+      "tbz x15, #0, 138f\n"
+      "str s11, [x13, #0x0]\n"
+      "str s15, [x9, #0x0]\n"
+      "str s19, [x27, #0x0]\n"
+      "str s23, [x25, #0x0]\n"
+      "b 138f\n"
+      "132:"  // Height 4: Partial direct writeback: partial_2_8
+      "tbz x15, #1, 133f\n"
+      "str d10, [x13], #0x8\n"
+      "str d14, [x9], #0x8\n"
+      "str d18, [x27], #0x8\n"
+      "str d22, [x25], #0x8\n"
+      "tbz x15, #0, 138f\n"
+      "st1 { v10.s }[2], [x13]\n"
+      "st1 { v14.s }[2], [x9]\n"
+      "st1 { v18.s }[2], [x27]\n"
+      "st1 { v22.s }[2], [x25]\n"
+      "b 138f\n"
+      "133:"  // Height 4: Partial direct writeback: partial_1_8
+      "tbz x15, #0, 138f\n"
+      "str s10, [x13, #0x0]\n"
+      "str s14, [x9, #0x0]\n"
+      "str s18, [x27, #0x0]\n"
+      "str s22, [x25, #0x0]\n"
+      "b 138f\n"
+      "134:"  // Height 4: Partial direct writeback: partial_4_0
+      "tbz x15, #2, 136f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "st1 { v20.4s }, [x25], #0x10\n"
+      "tbz x15, #1, 135f\n"
+      "str d9, [x13], #0x8\n"
+      "str d13, [x9], #0x8\n"
+      "str d17, [x27], #0x8\n"
+      "str d21, [x25], #0x8\n"
+      "tbz x15, #0, 138f\n"
+      "st1 { v9.s }[2], [x13]\n"
+      "st1 { v13.s }[2], [x9]\n"
+      "st1 { v17.s }[2], [x27]\n"
+      "st1 { v21.s }[2], [x25]\n"
+      "b 138f\n"
+      "135:"  // Height 4: Partial direct writeback: partial_1_4
+      "tbz x15, #0, 138f\n"
+      "str s9, [x13, #0x0]\n"
+      "str s13, [x9, #0x0]\n"
+      "str s17, [x27, #0x0]\n"
+      "str s21, [x25, #0x0]\n"
+      "b 138f\n"
+      "136:"  // Height 4: Partial direct writeback: partial_2_0
+      "tbz x15, #1, 137f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "str d16, [x27], #0x8\n"
+      "str d20, [x25], #0x8\n"
+      "tbz x15, #0, 138f\n"
+      "st1 { v8.s }[2], [x13]\n"
+      "st1 { v12.s }[2], [x9]\n"
+      "st1 { v16.s }[2], [x27]\n"
+      "st1 { v20.s }[2], [x25]\n"
+      "b 138f\n"
+      "137:"  // Height 4: Partial direct writeback: partial_1_0
+      "str s8, [x13, #0x0]\n"
+      "str s12, [x9, #0x0]\n"
+      "str s16, [x27, #0x0]\n"
+      "str s20, [x25, #0x0]\n"
+      "138:"  // Height 4: Partial direct writeback: Done
+      "b 140f\n"
+      "139:"  // Height 4: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "str q12, [x9, #0x0]\n"
+      "str q13, [x9, #0x10]\n"
+      "str q14, [x9, #0x20]\n"
+      "str q15, [x9, #0x30]\n"
+      "str q16, [x27, #0x0]\n"
+      "str q17, [x27, #0x10]\n"
+      "str q18, [x27, #0x20]\n"
+      "str q19, [x27, #0x30]\n"
+      "str q20, [x25, #0x0]\n"
+      "str q21, [x25, #0x10]\n"
+      "str q22, [x25, #0x20]\n"
+      "str q23, [x25, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "add x9, x9, #0x40\n"
+      "add x27, x27, #0x40\n"
+      "add x25, x25, #0x40\n"
+      "140:"  // Height 4: Writeback done
+      "subs x15, x15, #0x10\n"
+      "bgt 108b\n"
+      "b 212f\n"
+      "141:"  // Height 5
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 142f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "ldr x23, [%x[output_ptr], #0x20]\n"
+      "add x27, x27, x19, LSL #2\n"
+      "add x25, x25, x19, LSL #2\n"
+      "add x23, x23, x19, LSL #2\n"
+      "b 143f\n"
+      "142:"  // Height 5: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "add x23, x25, x19, LSL #2\n"
+      "143:"  // Height 5: Column loop
+      "tbz %x[flags], #0, 153f\n"
+      "cmp x15, #0x10\n"
+      "bge 152f\n"
+      "tbz x15, #3, 147f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "ld1 { v20.4s }, [x25], #0x10\n"
+      "ld1 { v24.4s }, [x23], #0x10\n"
+      "ld1 { v9.4s }, [x13], #0x10\n"
+      "ld1 { v13.4s }, [x9], #0x10\n"
+      "ld1 { v17.4s }, [x27], #0x10\n"
+      "ld1 { v21.4s }, [x25], #0x10\n"
+      "ld1 { v25.4s }, [x23], #0x10\n"
+      "tbz x15, #2, 145f\n"
+      "ld1 { v10.4s }, [x13], #0x10\n"
+      "ld1 { v14.4s }, [x9], #0x10\n"
+      "ld1 { v18.4s }, [x27], #0x10\n"
+      "ld1 { v22.4s }, [x25], #0x10\n"
+      "ld1 { v26.4s }, [x23], #0x10\n"
+      "tbz x15, #1, 144f\n"
+      "mov x19, #0x38\n"
+      "ldr d11, [x13], #0x8\n"
+      "ldr d15, [x9], #0x8\n"
+      "ldr d19, [x27], #0x8\n"
+      "ldr d23, [x25], #0x8\n"
+      "ldr d27, [x23], #0x8\n"
+      "tbz x15, #0, 151f\n"
+      "ld1 { v11.s }[2], [x13]\n"
+      "ld1 { v15.s }[2], [x9]\n"
+      "ld1 { v19.s }[2], [x27]\n"
+      "ld1 { v23.s }[2], [x25]\n"
+      "ld1 { v27.s }[2], [x23]\n"
+      "b 151f\n"
+      "144:"  // Height 5: Partial accumulate: partial_1_12
+      "mov x19, #0x30\n"
+      "tbz x15, #0, 151f\n"
+      "ldr s11, [x13, #0x0]\n"
+      "ldr s15, [x9, #0x0]\n"
+      "ldr s19, [x27, #0x0]\n"
+      "ldr s23, [x25, #0x0]\n"
+      "ldr s27, [x23, #0x0]\n"
+      "b 151f\n"
+      "145:"  // Height 5: Partial accumulate: partial_2_8
+      "tbz x15, #1, 146f\n"
+      "ldr d10, [x13], #0x8\n"
+      "ldr d14, [x9], #0x8\n"
+      "ldr d18, [x27], #0x8\n"
+      "ldr d22, [x25], #0x8\n"
+      "ldr d26, [x23], #0x8\n"
+      "mov x19, #0x28\n"
+      "tbz x15, #0, 151f\n"
+      "ld1 { v10.s }[2], [x13]\n"
+      "ld1 { v14.s }[2], [x9]\n"
+      "ld1 { v18.s }[2], [x27]\n"
+      "ld1 { v22.s }[2], [x25]\n"
+      "ld1 { v26.s }[2], [x23]\n"
+      "b 151f\n"
+      "146:"  // Height 5: Partial accumulate: partial_1_8
+      "mov x19, #0x20\n"
+      "tbz x15, #0, 151f\n"
+      "ldr s10, [x13, #0x0]\n"
+      "ldr s14, [x9, #0x0]\n"
+      "ldr s18, [x27, #0x0]\n"
+      "ldr s22, [x25, #0x0]\n"
+      "ldr s26, [x23, #0x0]\n"
+      "b 151f\n"
+      "147:"  // Height 5: Partial accumulate: partial_4_0
+      "tbz x15, #2, 149f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "ld1 { v20.4s }, [x25], #0x10\n"
+      "ld1 { v24.4s }, [x23], #0x10\n"
+      "tbz x15, #1, 148f\n"
+      "mov x19, #0x18\n"
+      "ldr d9, [x13], #0x8\n"
+      "ldr d13, [x9], #0x8\n"
+      "ldr d17, [x27], #0x8\n"
+      "ldr d21, [x25], #0x8\n"
+      "ldr d25, [x23], #0x8\n"
+      "tbz x15, #0, 151f\n"
+      "ld1 { v9.s }[2], [x13]\n"
+      "ld1 { v13.s }[2], [x9]\n"
+      "ld1 { v17.s }[2], [x27]\n"
+      "ld1 { v21.s }[2], [x25]\n"
+      "ld1 { v25.s }[2], [x23]\n"
+      "b 151f\n"
+      "148:"  // Height 5: Partial accumulate: partial_1_4
+      "mov x19, #0x10\n"
+      "tbz x15, #0, 151f\n"
+      "ldr s9, [x13, #0x0]\n"
+      "ldr s13, [x9, #0x0]\n"
+      "ldr s17, [x27, #0x0]\n"
+      "ldr s21, [x25, #0x0]\n"
+      "ldr s25, [x23, #0x0]\n"
+      "b 151f\n"
+      "149:"  // Height 5: Partial accumulate: partial_2_0
+      "tbz x15, #1, 150f\n"
+      "ldr d8, [x13], #0x8\n"
+      "ldr d12, [x9], #0x8\n"
+      "ldr d16, [x27], #0x8\n"
+      "ldr d20, [x25], #0x8\n"
+      "ldr d24, [x23], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x15, #0, 151f\n"
+      "ld1 { v8.s }[2], [x13]\n"
+      "ld1 { v12.s }[2], [x9]\n"
+      "ld1 { v16.s }[2], [x27]\n"
+      "ld1 { v20.s }[2], [x25]\n"
+      "ld1 { v24.s }[2], [x23]\n"
+      "b 151f\n"
+      "150:"  // Height 5: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s8, [x13, #0x0]\n"
+      "ldr s12, [x9, #0x0]\n"
+      "ldr s16, [x27, #0x0]\n"
+      "ldr s20, [x25, #0x0]\n"
+      "ldr s24, [x23, #0x0]\n"
+      "151:"  // Height 5: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "sub x9, x9, x19\n"
+      "sub x27, x27, x19\n"
+      "sub x25, x25, x19\n"
+      "sub x23, x23, x19\n"
+      "b 154f\n"
+      "152:"  // Height 5: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "ldr q12, [x9, #0x0]\n"
+      "ldr q13, [x9, #0x10]\n"
+      "ldr q14, [x9, #0x20]\n"
+      "ldr q15, [x9, #0x30]\n"
+      "ldr q16, [x27, #0x0]\n"
+      "ldr q17, [x27, #0x10]\n"
+      "ldr q18, [x27, #0x20]\n"
+      "ldr q19, [x27, #0x30]\n"
+      "ldr q20, [x25, #0x0]\n"
+      "ldr q21, [x25, #0x10]\n"
+      "ldr q22, [x25, #0x20]\n"
+      "ldr q23, [x25, #0x30]\n"
+      "ldr q24, [x23, #0x0]\n"
+      "ldr q25, [x23, #0x10]\n"
+      "ldr q26, [x23, #0x20]\n"
+      "ldr q27, [x23, #0x30]\n"
+      "b 154f\n"
+      "153:"  // Height 5: no accumulate
+      "movi v8.4s, #0x0\n"
+      "movi v9.4s, #0x0\n"
+      "movi v10.4s, #0x0\n"
+      "movi v11.4s, #0x0\n"
+      "movi v12.4s, #0x0\n"
+      "movi v13.4s, #0x0\n"
+      "movi v14.4s, #0x0\n"
+      "movi v15.4s, #0x0\n"
+      "movi v16.4s, #0x0\n"
+      "movi v17.4s, #0x0\n"
+      "movi v18.4s, #0x0\n"
+      "movi v19.4s, #0x0\n"
+      "movi v20.4s, #0x0\n"
+      "movi v21.4s, #0x0\n"
+      "movi v22.4s, #0x0\n"
+      "movi v23.4s, #0x0\n"
+      "movi v24.4s, #0x0\n"
+      "movi v25.4s, #0x0\n"
+      "movi v26.4s, #0x0\n"
+      "movi v27.4s, #0x0\n"
+      "154:"  // Height 5: setup done
+      "mov x12, #0x0\n"
+      "155:"  // Height 5: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 156f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "cbnz x12, 157f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "add x22, x22, x19\n"
+      "b 157f\n"
+      "156:"  // Height 5: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "add x26, x28, x19\n"
+      "add x24, x26, x19\n"
+      "add x22, x24, x19\n"
+      "157:"  // Height 5: input setup done
+      "cmp x11, #0x10\n"
+      "blt 160f\n"
+      "cmp x11, #0x20\n"
+      "blt 159f\n"
+      "158:"  // Height 5: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q4, [x22, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x6f84e0d8  // udot v24.4s, v6.16b, v4.4b[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
+      "add x24, x24, #0x10\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
+      "add x22, x22, #0x10\n"
+      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "sub x11, x11, #0x10\n"
+      ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
+      "cmp x11, #0x20\n"
+      ".inst 0x6f84e0f9  // udot v25.4s, v7.16b, v4.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0da  // udot v26.4s, v6.16b, v4.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0fb  // udot v27.4s, v7.16b, v4.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0cc  // udot v12.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0d0  // udot v16.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e0d4  // udot v20.4s, v6.16b, v3.4b[1]\n"
+      ".inst 0x6fa4e0d8  // udot v24.4s, v6.16b, v4.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ed  // udot v13.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0f1  // udot v17.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e0f5  // udot v21.4s, v7.16b, v3.4b[1]\n"
+      ".inst 0x6fa4e0f9  // udot v25.4s, v7.16b, v4.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ce  // udot v14.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0d2  // udot v18.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e0d6  // udot v22.4s, v6.16b, v3.4b[1]\n"
+      ".inst 0x6fa4e0da  // udot v26.4s, v6.16b, v4.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ef  // udot v15.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0f3  // udot v19.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e0f7  // udot v23.4s, v7.16b, v3.4b[1]\n"
+      ".inst 0x6fa4e0fb  // udot v27.4s, v7.16b, v4.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8cc  // udot v12.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8d0  // udot v16.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x6f83e8d4  // udot v20.4s, v6.16b, v3.4b[2]\n"
+      ".inst 0x6f84e8d8  // udot v24.4s, v6.16b, v4.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ed  // udot v13.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8f1  // udot v17.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x6f83e8f5  // udot v21.4s, v7.16b, v3.4b[2]\n"
+      ".inst 0x6f84e8f9  // udot v25.4s, v7.16b, v4.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ce  // udot v14.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8d2  // udot v18.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x6f83e8d6  // udot v22.4s, v6.16b, v3.4b[2]\n"
+      ".inst 0x6f84e8da  // udot v26.4s, v6.16b, v4.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ef  // udot v15.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8f3  // udot v19.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x6f83e8f7  // udot v23.4s, v7.16b, v3.4b[2]\n"
+      ".inst 0x6f84e8fb  // udot v27.4s, v7.16b, v4.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8cc  // udot v12.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8d0  // udot v16.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e8d4  // udot v20.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x6fa4e8d8  // udot v24.4s, v6.16b, v4.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8ed  // udot v13.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8f1  // udot v17.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e8f5  // udot v21.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x6fa4e8f9  // udot v25.4s, v7.16b, v4.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x6fa1e8ce  // udot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8d2  // udot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e8d6  // udot v22.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x6fa4e8da  // udot v26.4s, v6.16b, v4.4b[3]\n"
+      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8ef  // udot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8f3  // udot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e8f7  // udot v23.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x6fa4e8fb  // udot v27.4s, v7.16b, v4.4b[3]\n"
+      "bge 158b\n"
+      "159:"  // Height 5: Multiply loop: Single iteration only
+      "sub x11, x11, #0x10\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q4, [x22, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x6f84e0d8  // udot v24.4s, v6.16b, v4.4b[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
+      "add x24, x24, #0x10\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
+      "add x22, x22, #0x10\n"
+      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0f9  // udot v25.4s, v7.16b, v4.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0da  // udot v26.4s, v6.16b, v4.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0fb  // udot v27.4s, v7.16b, v4.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0cc  // udot v12.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0d0  // udot v16.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e0d4  // udot v20.4s, v6.16b, v3.4b[1]\n"
+      ".inst 0x6fa4e0d8  // udot v24.4s, v6.16b, v4.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ed  // udot v13.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0f1  // udot v17.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e0f5  // udot v21.4s, v7.16b, v3.4b[1]\n"
+      ".inst 0x6fa4e0f9  // udot v25.4s, v7.16b, v4.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ce  // udot v14.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0d2  // udot v18.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e0d6  // udot v22.4s, v6.16b, v3.4b[1]\n"
+      ".inst 0x6fa4e0da  // udot v26.4s, v6.16b, v4.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ef  // udot v15.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0f3  // udot v19.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e0f7  // udot v23.4s, v7.16b, v3.4b[1]\n"
+      ".inst 0x6fa4e0fb  // udot v27.4s, v7.16b, v4.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8cc  // udot v12.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8d0  // udot v16.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x6f83e8d4  // udot v20.4s, v6.16b, v3.4b[2]\n"
+      ".inst 0x6f84e8d8  // udot v24.4s, v6.16b, v4.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ed  // udot v13.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8f1  // udot v17.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x6f83e8f5  // udot v21.4s, v7.16b, v3.4b[2]\n"
+      ".inst 0x6f84e8f9  // udot v25.4s, v7.16b, v4.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ce  // udot v14.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8d2  // udot v18.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x6f83e8d6  // udot v22.4s, v6.16b, v3.4b[2]\n"
+      ".inst 0x6f84e8da  // udot v26.4s, v6.16b, v4.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ef  // udot v15.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8f3  // udot v19.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x6f83e8f7  // udot v23.4s, v7.16b, v3.4b[2]\n"
+      ".inst 0x6f84e8fb  // udot v27.4s, v7.16b, v4.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8cc  // udot v12.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8d0  // udot v16.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e8d4  // udot v20.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x6fa4e8d8  // udot v24.4s, v6.16b, v4.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8ed  // udot v13.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8f1  // udot v17.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e8f5  // udot v21.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x6fa4e8f9  // udot v25.4s, v7.16b, v4.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x6fa1e8ce  // udot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8d2  // udot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e8d6  // udot v22.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x6fa4e8da  // udot v26.4s, v6.16b, v4.4b[3]\n"
+      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8ef  // udot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8f3  // udot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e8f7  // udot v23.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x6fa4e8fb  // udot v27.4s, v7.16b, v4.4b[3]\n"
+      "160:"  // Height 5: Multiply loop: Main loop skip
+      "cbz x11, 165f\n"
+      "cmp x11, #0x4\n"
+      "blt 162f\n"
+      "161:"  // Height 5: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
+      "ldr s3, [x24], #0x4\n"
+      "ldr s4, [x22], #0x4\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
+      "sub x11, x11, #0x4\n"
+      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
+      "cmp x11, #0x4\n"
+      ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0d8  // udot v24.4s, v6.16b, v4.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0f9  // udot v25.4s, v7.16b, v4.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0da  // udot v26.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0fb  // udot v27.4s, v7.16b, v4.4b[0]\n"
+      "bge 161b\n"
+      "cbz x11, 165f\n"
+      "162:"  // Height 5: Multiply loop: Skip odd blocks
+      "tbz x11, #1, 163f\n"
+      "ldr h0, [x10], #0x2\n"
+      "ldr h1, [x28], #0x2\n"
+      "ldr h2, [x26], #0x2\n"
+      "ldr h3, [x24], #0x2\n"
+      "ldr h4, [x22], #0x2\n"
+      "tbz x11, #0, 164f\n"
+      "ld1 { v0.b }[2], [x10]\n"
+      "ld1 { v1.b }[2], [x28]\n"
+      "ld1 { v2.b }[2], [x26]\n"
+      "ld1 { v3.b }[2], [x24]\n"
+      "ld1 { v4.b }[2], [x22]\n"
+      "b 164f\n"
+      "163:"  // Height 5: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b0, [x10, #0x0]\n"
+      "ldr b1, [x28, #0x0]\n"
+      "ldr b2, [x26, #0x0]\n"
+      "ldr b3, [x24, #0x0]\n"
+      "ldr b4, [x22, #0x0]\n"
+      "164:"  // Height 5: Multiply loop: Ragged operand read: Done
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0d8  // udot v24.4s, v6.16b, v4.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0f9  // udot v25.4s, v7.16b, v4.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0da  // udot v26.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0fb  // udot v27.4s, v7.16b, v4.4b[0]\n"
+      "165:"  // Height 5: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 155b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "cmp x15, #0x10\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "bge 174f\n"
+      "tbz x15, #3, 169f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v9.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v13.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "st1 { v17.4s }, [x27], #0x10\n"
+      "st1 { v20.4s }, [x25], #0x10\n"
+      "st1 { v21.4s }, [x25], #0x10\n"
+      "st1 { v24.4s }, [x23], #0x10\n"
+      "st1 { v25.4s }, [x23], #0x10\n"
+      "tbz x15, #2, 167f\n"
+      "st1 { v10.4s }, [x13], #0x10\n"
+      "st1 { v14.4s }, [x9], #0x10\n"
+      "st1 { v18.4s }, [x27], #0x10\n"
+      "st1 { v22.4s }, [x25], #0x10\n"
+      "st1 { v26.4s }, [x23], #0x10\n"
+      "tbz x15, #1, 166f\n"
+      "str d11, [x13], #0x8\n"
+      "str d15, [x9], #0x8\n"
+      "str d19, [x27], #0x8\n"
+      "str d23, [x25], #0x8\n"
+      "str d27, [x23], #0x8\n"
+      "tbz x15, #0, 173f\n"
+      "st1 { v11.s }[2], [x13]\n"
+      "st1 { v15.s }[2], [x9]\n"
+      "st1 { v19.s }[2], [x27]\n"
+      "st1 { v23.s }[2], [x25]\n"
+      "st1 { v27.s }[2], [x23]\n"
+      "b 173f\n"
+      "166:"  // Height 5: Partial direct writeback: partial_1_12
+      "tbz x15, #0, 173f\n"
+      "str s11, [x13, #0x0]\n"
+      "str s15, [x9, #0x0]\n"
+      "str s19, [x27, #0x0]\n"
+      "str s23, [x25, #0x0]\n"
+      "str s27, [x23, #0x0]\n"
+      "b 173f\n"
+      "167:"  // Height 5: Partial direct writeback: partial_2_8
+      "tbz x15, #1, 168f\n"
+      "str d10, [x13], #0x8\n"
+      "str d14, [x9], #0x8\n"
+      "str d18, [x27], #0x8\n"
+      "str d22, [x25], #0x8\n"
+      "str d26, [x23], #0x8\n"
+      "tbz x15, #0, 173f\n"
+      "st1 { v10.s }[2], [x13]\n"
+      "st1 { v14.s }[2], [x9]\n"
+      "st1 { v18.s }[2], [x27]\n"
+      "st1 { v22.s }[2], [x25]\n"
+      "st1 { v26.s }[2], [x23]\n"
+      "b 173f\n"
+      "168:"  // Height 5: Partial direct writeback: partial_1_8
+      "tbz x15, #0, 173f\n"
+      "str s10, [x13, #0x0]\n"
+      "str s14, [x9, #0x0]\n"
+      "str s18, [x27, #0x0]\n"
+      "str s22, [x25, #0x0]\n"
+      "str s26, [x23, #0x0]\n"
+      "b 173f\n"
+      "169:"  // Height 5: Partial direct writeback: partial_4_0
+      "tbz x15, #2, 171f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "st1 { v20.4s }, [x25], #0x10\n"
+      "st1 { v24.4s }, [x23], #0x10\n"
+      "tbz x15, #1, 170f\n"
+      "str d9, [x13], #0x8\n"
+      "str d13, [x9], #0x8\n"
+      "str d17, [x27], #0x8\n"
+      "str d21, [x25], #0x8\n"
+      "str d25, [x23], #0x8\n"
+      "tbz x15, #0, 173f\n"
+      "st1 { v9.s }[2], [x13]\n"
+      "st1 { v13.s }[2], [x9]\n"
+      "st1 { v17.s }[2], [x27]\n"
+      "st1 { v21.s }[2], [x25]\n"
+      "st1 { v25.s }[2], [x23]\n"
+      "b 173f\n"
+      "170:"  // Height 5: Partial direct writeback: partial_1_4
+      "tbz x15, #0, 173f\n"
+      "str s9, [x13, #0x0]\n"
+      "str s13, [x9, #0x0]\n"
+      "str s17, [x27, #0x0]\n"
+      "str s21, [x25, #0x0]\n"
+      "str s25, [x23, #0x0]\n"
+      "b 173f\n"
+      "171:"  // Height 5: Partial direct writeback: partial_2_0
+      "tbz x15, #1, 172f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "str d16, [x27], #0x8\n"
+      "str d20, [x25], #0x8\n"
+      "str d24, [x23], #0x8\n"
+      "tbz x15, #0, 173f\n"
+      "st1 { v8.s }[2], [x13]\n"
+      "st1 { v12.s }[2], [x9]\n"
+      "st1 { v16.s }[2], [x27]\n"
+      "st1 { v20.s }[2], [x25]\n"
+      "st1 { v24.s }[2], [x23]\n"
+      "b 173f\n"
+      "172:"  // Height 5: Partial direct writeback: partial_1_0
+      "str s8, [x13, #0x0]\n"
+      "str s12, [x9, #0x0]\n"
+      "str s16, [x27, #0x0]\n"
+      "str s20, [x25, #0x0]\n"
+      "str s24, [x23, #0x0]\n"
+      "173:"  // Height 5: Partial direct writeback: Done
+      "b 175f\n"
+      "174:"  // Height 5: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "str q12, [x9, #0x0]\n"
+      "str q13, [x9, #0x10]\n"
+      "str q14, [x9, #0x20]\n"
+      "str q15, [x9, #0x30]\n"
+      "str q16, [x27, #0x0]\n"
+      "str q17, [x27, #0x10]\n"
+      "str q18, [x27, #0x20]\n"
+      "str q19, [x27, #0x30]\n"
+      "str q20, [x25, #0x0]\n"
+      "str q21, [x25, #0x10]\n"
+      "str q22, [x25, #0x20]\n"
+      "str q23, [x25, #0x30]\n"
+      "str q24, [x23, #0x0]\n"
+      "str q25, [x23, #0x10]\n"
+      "str q26, [x23, #0x20]\n"
+      "str q27, [x23, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "add x9, x9, #0x40\n"
+      "add x27, x27, #0x40\n"
+      "add x25, x25, #0x40\n"
+      "add x23, x23, #0x40\n"
+      "175:"  // Height 5: Writeback done
+      "subs x15, x15, #0x10\n"
+      "bgt 143b\n"
+      "b 212f\n"
+      "176:"  // Height 6
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 177f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "ldr x23, [%x[output_ptr], #0x20]\n"
+      "add x27, x27, x19, LSL #2\n"
+      "ldr x21, [%x[output_ptr], #0x28]\n"
+      "add %x[output_ptr], %x[output_ptr], #0x30\n"
+      "add x25, x25, x19, LSL #2\n"
+      "add x23, x23, x19, LSL #2\n"
+      "add x21, x21, x19, LSL #2\n"
+      "b 178f\n"
+      "177:"  // Height 6: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "add x23, x25, x19, LSL #2\n"
+      "add x21, x23, x19, LSL #2\n"
+      "add %x[output_ptr], x21, x19, LSL #2\n"
+      "178:"  // Height 6: Column loop
+      "tbz %x[flags], #0, 188f\n"
+      "cmp x15, #0x10\n"
+      "bge 187f\n"
+      "tbz x15, #3, 182f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "ld1 { v20.4s }, [x25], #0x10\n"
+      "ld1 { v24.4s }, [x23], #0x10\n"
+      "ld1 { v28.4s }, [x21], #0x10\n"
+      "ld1 { v9.4s }, [x13], #0x10\n"
+      "ld1 { v13.4s }, [x9], #0x10\n"
+      "ld1 { v17.4s }, [x27], #0x10\n"
+      "ld1 { v21.4s }, [x25], #0x10\n"
+      "ld1 { v25.4s }, [x23], #0x10\n"
+      "ld1 { v29.4s }, [x21], #0x10\n"
+      "tbz x15, #2, 180f\n"
+      "ld1 { v10.4s }, [x13], #0x10\n"
+      "ld1 { v14.4s }, [x9], #0x10\n"
+      "ld1 { v18.4s }, [x27], #0x10\n"
+      "ld1 { v22.4s }, [x25], #0x10\n"
+      "ld1 { v26.4s }, [x23], #0x10\n"
+      "ld1 { v30.4s }, [x21], #0x10\n"
+      "tbz x15, #1, 179f\n"
+      "mov x19, #0x38\n"
+      "ldr d11, [x13], #0x8\n"
+      "ldr d15, [x9], #0x8\n"
+      "ldr d19, [x27], #0x8\n"
+      "ldr d23, [x25], #0x8\n"
+      "ldr d27, [x23], #0x8\n"
+      "ldr d31, [x21], #0x8\n"
+      "tbz x15, #0, 186f\n"
+      "ld1 { v11.s }[2], [x13]\n"
+      "ld1 { v15.s }[2], [x9]\n"
+      "ld1 { v19.s }[2], [x27]\n"
+      "ld1 { v23.s }[2], [x25]\n"
+      "ld1 { v27.s }[2], [x23]\n"
+      "ld1 { v31.s }[2], [x21]\n"
+      "b 186f\n"
+      "179:"  // Height 6: Partial accumulate: partial_1_12
+      "mov x19, #0x30\n"
+      "tbz x15, #0, 186f\n"
+      "ldr s11, [x13, #0x0]\n"
+      "ldr s15, [x9, #0x0]\n"
+      "ldr s19, [x27, #0x0]\n"
+      "ldr s23, [x25, #0x0]\n"
+      "ldr s27, [x23, #0x0]\n"
+      "ldr s31, [x21, #0x0]\n"
+      "b 186f\n"
+      "180:"  // Height 6: Partial accumulate: partial_2_8
+      "tbz x15, #1, 181f\n"
+      "ldr d10, [x13], #0x8\n"
+      "ldr d14, [x9], #0x8\n"
+      "ldr d18, [x27], #0x8\n"
+      "ldr d22, [x25], #0x8\n"
+      "ldr d26, [x23], #0x8\n"
+      "ldr d30, [x21], #0x8\n"
+      "mov x19, #0x28\n"
+      "tbz x15, #0, 186f\n"
+      "ld1 { v10.s }[2], [x13]\n"
+      "ld1 { v14.s }[2], [x9]\n"
+      "ld1 { v18.s }[2], [x27]\n"
+      "ld1 { v22.s }[2], [x25]\n"
+      "ld1 { v26.s }[2], [x23]\n"
+      "ld1 { v30.s }[2], [x21]\n"
+      "b 186f\n"
+      "181:"  // Height 6: Partial accumulate: partial_1_8
+      "mov x19, #0x20\n"
+      "tbz x15, #0, 186f\n"
+      "ldr s10, [x13, #0x0]\n"
+      "ldr s14, [x9, #0x0]\n"
+      "ldr s18, [x27, #0x0]\n"
+      "ldr s22, [x25, #0x0]\n"
+      "ldr s26, [x23, #0x0]\n"
+      "ldr s30, [x21, #0x0]\n"
+      "b 186f\n"
+      "182:"  // Height 6: Partial accumulate: partial_4_0
+      "tbz x15, #2, 184f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v12.4s }, [x9], #0x10\n"
+      "ld1 { v16.4s }, [x27], #0x10\n"
+      "ld1 { v20.4s }, [x25], #0x10\n"
+      "ld1 { v24.4s }, [x23], #0x10\n"
+      "ld1 { v28.4s }, [x21], #0x10\n"
+      "tbz x15, #1, 183f\n"
+      "mov x19, #0x18\n"
+      "ldr d9, [x13], #0x8\n"
+      "ldr d13, [x9], #0x8\n"
+      "ldr d17, [x27], #0x8\n"
+      "ldr d21, [x25], #0x8\n"
+      "ldr d25, [x23], #0x8\n"
+      "ldr d29, [x21], #0x8\n"
+      "tbz x15, #0, 186f\n"
+      "ld1 { v9.s }[2], [x13]\n"
+      "ld1 { v13.s }[2], [x9]\n"
+      "ld1 { v17.s }[2], [x27]\n"
+      "ld1 { v21.s }[2], [x25]\n"
+      "ld1 { v25.s }[2], [x23]\n"
+      "ld1 { v29.s }[2], [x21]\n"
+      "b 186f\n"
+      "183:"  // Height 6: Partial accumulate: partial_1_4
+      "mov x19, #0x10\n"
+      "tbz x15, #0, 186f\n"
+      "ldr s9, [x13, #0x0]\n"
+      "ldr s13, [x9, #0x0]\n"
+      "ldr s17, [x27, #0x0]\n"
+      "ldr s21, [x25, #0x0]\n"
+      "ldr s25, [x23, #0x0]\n"
+      "ldr s29, [x21, #0x0]\n"
+      "b 186f\n"
+      "184:"  // Height 6: Partial accumulate: partial_2_0
+      "tbz x15, #1, 185f\n"
+      "ldr d8, [x13], #0x8\n"
+      "ldr d12, [x9], #0x8\n"
+      "ldr d16, [x27], #0x8\n"
+      "ldr d20, [x25], #0x8\n"
+      "ldr d24, [x23], #0x8\n"
+      "ldr d28, [x21], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x15, #0, 186f\n"
+      "ld1 { v8.s }[2], [x13]\n"
+      "ld1 { v12.s }[2], [x9]\n"
+      "ld1 { v16.s }[2], [x27]\n"
+      "ld1 { v20.s }[2], [x25]\n"
+      "ld1 { v24.s }[2], [x23]\n"
+      "ld1 { v28.s }[2], [x21]\n"
+      "b 186f\n"
+      "185:"  // Height 6: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s8, [x13, #0x0]\n"
+      "ldr s12, [x9, #0x0]\n"
+      "ldr s16, [x27, #0x0]\n"
+      "ldr s20, [x25, #0x0]\n"
+      "ldr s24, [x23, #0x0]\n"
+      "ldr s28, [x21, #0x0]\n"
+      "186:"  // Height 6: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "sub x9, x9, x19\n"
+      "sub x27, x27, x19\n"
+      "sub x25, x25, x19\n"
+      "sub x23, x23, x19\n"
+      "sub x21, x21, x19\n"
+      "b 189f\n"
+      "187:"  // Height 6: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
+      "ldr q10, [x13, #0x20]\n"
+      "ldr q11, [x13, #0x30]\n"
+      "ldr q12, [x9, #0x0]\n"
+      "ldr q13, [x9, #0x10]\n"
+      "ldr q14, [x9, #0x20]\n"
+      "ldr q15, [x9, #0x30]\n"
+      "ldr q16, [x27, #0x0]\n"
+      "ldr q17, [x27, #0x10]\n"
+      "ldr q18, [x27, #0x20]\n"
+      "ldr q19, [x27, #0x30]\n"
+      "ldr q20, [x25, #0x0]\n"
+      "ldr q21, [x25, #0x10]\n"
+      "ldr q22, [x25, #0x20]\n"
+      "ldr q23, [x25, #0x30]\n"
+      "ldr q24, [x23, #0x0]\n"
+      "ldr q25, [x23, #0x10]\n"
+      "ldr q26, [x23, #0x20]\n"
+      "ldr q27, [x23, #0x30]\n"
+      "ldr q28, [x21, #0x0]\n"
+      "ldr q29, [x21, #0x10]\n"
+      "ldr q30, [x21, #0x20]\n"
+      "ldr q31, [x21, #0x30]\n"
+      "b 189f\n"
+      "188:"  // Height 6: no accumulate
+      "movi v8.4s, #0x0\n"
+      "movi v9.4s, #0x0\n"
+      "movi v10.4s, #0x0\n"
+      "movi v11.4s, #0x0\n"
+      "movi v12.4s, #0x0\n"
+      "movi v13.4s, #0x0\n"
+      "movi v14.4s, #0x0\n"
+      "movi v15.4s, #0x0\n"
+      "movi v16.4s, #0x0\n"
+      "movi v17.4s, #0x0\n"
+      "movi v18.4s, #0x0\n"
+      "movi v19.4s, #0x0\n"
+      "movi v20.4s, #0x0\n"
+      "movi v21.4s, #0x0\n"
+      "movi v22.4s, #0x0\n"
+      "movi v23.4s, #0x0\n"
+      "movi v24.4s, #0x0\n"
+      "movi v25.4s, #0x0\n"
+      "movi v26.4s, #0x0\n"
+      "movi v27.4s, #0x0\n"
+      "movi v28.4s, #0x0\n"
+      "movi v29.4s, #0x0\n"
+      "movi v30.4s, #0x0\n"
+      "movi v31.4s, #0x0\n"
+      "189:"  // Height 6: setup done
+      "mov x12, #0x0\n"
+      "190:"  // Height 6: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 191f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x20, [x20, #0x28]\n"
+      "cbnz x12, 192f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "add x22, x22, x19\n"
+      "add x20, x20, x19\n"
+      "b 192f\n"
+      "191:"  // Height 6: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "add x26, x28, x19\n"
+      "add x24, x26, x19\n"
+      "add x22, x24, x19\n"
+      "add x20, x22, x19\n"
+      "192:"  // Height 6: input setup done
+      "cmp x11, #0x10\n"
+      "blt 195f\n"
+      "cmp x11, #0x20\n"
+      "blt 194f\n"
+      "193:"  // Height 6: Multiply loop: Main loop head
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q4, [x22, #0x0]\n"
+      "ldr q5, [x20, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x6f84e0d8  // udot v24.4s, v6.16b, v4.4b[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x6f85e0dc  // udot v28.4s, v6.16b, v5.4b[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
+      "add x22, x22, #0x10\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
+      "add x20, x20, #0x10\n"
+      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "sub x11, x11, #0x10\n"
+      ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
+      "cmp x11, #0x20\n"
+      ".inst 0x6f84e0f9  // udot v25.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x6f85e0fd  // udot v29.4s, v7.16b, v5.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0da  // udot v26.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x6f85e0de  // udot v30.4s, v6.16b, v5.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0fb  // udot v27.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x6f85e0ff  // udot v31.4s, v7.16b, v5.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0cc  // udot v12.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0d0  // udot v16.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e0d4  // udot v20.4s, v6.16b, v3.4b[1]\n"
+      ".inst 0x6fa4e0d8  // udot v24.4s, v6.16b, v4.4b[1]\n"
+      ".inst 0x6fa5e0dc  // udot v28.4s, v6.16b, v5.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ed  // udot v13.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0f1  // udot v17.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e0f5  // udot v21.4s, v7.16b, v3.4b[1]\n"
+      ".inst 0x6fa4e0f9  // udot v25.4s, v7.16b, v4.4b[1]\n"
+      ".inst 0x6fa5e0fd  // udot v29.4s, v7.16b, v5.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ce  // udot v14.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0d2  // udot v18.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e0d6  // udot v22.4s, v6.16b, v3.4b[1]\n"
+      ".inst 0x6fa4e0da  // udot v26.4s, v6.16b, v4.4b[1]\n"
+      ".inst 0x6fa5e0de  // udot v30.4s, v6.16b, v5.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ef  // udot v15.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0f3  // udot v19.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e0f7  // udot v23.4s, v7.16b, v3.4b[1]\n"
+      ".inst 0x6fa4e0fb  // udot v27.4s, v7.16b, v4.4b[1]\n"
+      ".inst 0x6fa5e0ff  // udot v31.4s, v7.16b, v5.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8cc  // udot v12.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8d0  // udot v16.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x6f83e8d4  // udot v20.4s, v6.16b, v3.4b[2]\n"
+      ".inst 0x6f84e8d8  // udot v24.4s, v6.16b, v4.4b[2]\n"
+      ".inst 0x6f85e8dc  // udot v28.4s, v6.16b, v5.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ed  // udot v13.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8f1  // udot v17.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x6f83e8f5  // udot v21.4s, v7.16b, v3.4b[2]\n"
+      ".inst 0x6f84e8f9  // udot v25.4s, v7.16b, v4.4b[2]\n"
+      ".inst 0x6f85e8fd  // udot v29.4s, v7.16b, v5.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ce  // udot v14.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8d2  // udot v18.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x6f83e8d6  // udot v22.4s, v6.16b, v3.4b[2]\n"
+      ".inst 0x6f84e8da  // udot v26.4s, v6.16b, v4.4b[2]\n"
+      ".inst 0x6f85e8de  // udot v30.4s, v6.16b, v5.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ef  // udot v15.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8f3  // udot v19.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x6f83e8f7  // udot v23.4s, v7.16b, v3.4b[2]\n"
+      ".inst 0x6f84e8fb  // udot v27.4s, v7.16b, v4.4b[2]\n"
+      ".inst 0x6f85e8ff  // udot v31.4s, v7.16b, v5.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8cc  // udot v12.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8d0  // udot v16.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e8d4  // udot v20.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x6fa4e8d8  // udot v24.4s, v6.16b, v4.4b[3]\n"
+      ".inst 0x6fa5e8dc  // udot v28.4s, v6.16b, v5.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8ed  // udot v13.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8f1  // udot v17.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e8f5  // udot v21.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x6fa4e8f9  // udot v25.4s, v7.16b, v4.4b[3]\n"
+      ".inst 0x6fa5e8fd  // udot v29.4s, v7.16b, v5.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x6fa1e8ce  // udot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8d2  // udot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e8d6  // udot v22.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x6fa4e8da  // udot v26.4s, v6.16b, v4.4b[3]\n"
+      ".inst 0x6fa5e8de  // udot v30.4s, v6.16b, v5.4b[3]\n"
+      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8ef  // udot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8f3  // udot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e8f7  // udot v23.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x6fa4e8fb  // udot v27.4s, v7.16b, v4.4b[3]\n"
+      ".inst 0x6fa5e8ff  // udot v31.4s, v7.16b, v5.4b[3]\n"
+      "bge 193b\n"
+      "194:"  // Height 6: Multiply loop: Single iteration only
+      "sub x11, x11, #0x10\n"
+      "ldr q0, [x10, #0x0]\n"
+      "ldr q1, [x28, #0x0]\n"
+      "ldr q2, [x26, #0x0]\n"
+      "ldr q3, [x24, #0x0]\n"
+      "ldr q4, [x22, #0x0]\n"
+      "ldr q5, [x20, #0x0]\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
+      "add x10, x10, #0x10\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x6f84e0d8  // udot v24.4s, v6.16b, v4.4b[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x6f85e0dc  // udot v28.4s, v6.16b, v5.4b[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
+      "add x22, x22, #0x10\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
+      "add x20, x20, #0x10\n"
+      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0f9  // udot v25.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x6f85e0fd  // udot v29.4s, v7.16b, v5.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0da  // udot v26.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x6f85e0de  // udot v30.4s, v6.16b, v5.4b[0]\n"
+      "ldr q6, [x14, #0x40]\n"
+      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0fb  // udot v27.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x6f85e0ff  // udot v31.4s, v7.16b, v5.4b[0]\n"
+      "ldr q7, [x14, #0x50]\n"
+      ".inst 0x6fa0e0c8  // udot v8.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0cc  // udot v12.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0d0  // udot v16.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e0d4  // udot v20.4s, v6.16b, v3.4b[1]\n"
+      ".inst 0x6fa4e0d8  // udot v24.4s, v6.16b, v4.4b[1]\n"
+      ".inst 0x6fa5e0dc  // udot v28.4s, v6.16b, v5.4b[1]\n"
+      "ldr q6, [x14, #0x60]\n"
+      ".inst 0x6fa0e0e9  // udot v9.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ed  // udot v13.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0f1  // udot v17.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e0f5  // udot v21.4s, v7.16b, v3.4b[1]\n"
+      ".inst 0x6fa4e0f9  // udot v25.4s, v7.16b, v4.4b[1]\n"
+      ".inst 0x6fa5e0fd  // udot v29.4s, v7.16b, v5.4b[1]\n"
+      "ldr q7, [x14, #0x70]\n"
+      ".inst 0x6fa0e0ca  // udot v10.4s, v6.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ce  // udot v14.4s, v6.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0d2  // udot v18.4s, v6.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e0d6  // udot v22.4s, v6.16b, v3.4b[1]\n"
+      ".inst 0x6fa4e0da  // udot v26.4s, v6.16b, v4.4b[1]\n"
+      ".inst 0x6fa5e0de  // udot v30.4s, v6.16b, v5.4b[1]\n"
+      "ldr q6, [x14, #0x80]\n"
+      ".inst 0x6fa0e0eb  // udot v11.4s, v7.16b, v0.4b[1]\n"
+      ".inst 0x6fa1e0ef  // udot v15.4s, v7.16b, v1.4b[1]\n"
+      ".inst 0x6fa2e0f3  // udot v19.4s, v7.16b, v2.4b[1]\n"
+      ".inst 0x6fa3e0f7  // udot v23.4s, v7.16b, v3.4b[1]\n"
+      ".inst 0x6fa4e0fb  // udot v27.4s, v7.16b, v4.4b[1]\n"
+      ".inst 0x6fa5e0ff  // udot v31.4s, v7.16b, v5.4b[1]\n"
+      "ldr q7, [x14, #0x90]\n"
+      ".inst 0x6f80e8c8  // udot v8.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8cc  // udot v12.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8d0  // udot v16.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x6f83e8d4  // udot v20.4s, v6.16b, v3.4b[2]\n"
+      ".inst 0x6f84e8d8  // udot v24.4s, v6.16b, v4.4b[2]\n"
+      ".inst 0x6f85e8dc  // udot v28.4s, v6.16b, v5.4b[2]\n"
+      "ldr q6, [x14, #0xa0]\n"
+      ".inst 0x6f80e8e9  // udot v9.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ed  // udot v13.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8f1  // udot v17.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x6f83e8f5  // udot v21.4s, v7.16b, v3.4b[2]\n"
+      ".inst 0x6f84e8f9  // udot v25.4s, v7.16b, v4.4b[2]\n"
+      ".inst 0x6f85e8fd  // udot v29.4s, v7.16b, v5.4b[2]\n"
+      "ldr q7, [x14, #0xb0]\n"
+      ".inst 0x6f80e8ca  // udot v10.4s, v6.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ce  // udot v14.4s, v6.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8d2  // udot v18.4s, v6.16b, v2.4b[2]\n"
+      ".inst 0x6f83e8d6  // udot v22.4s, v6.16b, v3.4b[2]\n"
+      ".inst 0x6f84e8da  // udot v26.4s, v6.16b, v4.4b[2]\n"
+      ".inst 0x6f85e8de  // udot v30.4s, v6.16b, v5.4b[2]\n"
+      "ldr q6, [x14, #0xc0]\n"
+      ".inst 0x6f80e8eb  // udot v11.4s, v7.16b, v0.4b[2]\n"
+      ".inst 0x6f81e8ef  // udot v15.4s, v7.16b, v1.4b[2]\n"
+      ".inst 0x6f82e8f3  // udot v19.4s, v7.16b, v2.4b[2]\n"
+      ".inst 0x6f83e8f7  // udot v23.4s, v7.16b, v3.4b[2]\n"
+      ".inst 0x6f84e8fb  // udot v27.4s, v7.16b, v4.4b[2]\n"
+      ".inst 0x6f85e8ff  // udot v31.4s, v7.16b, v5.4b[2]\n"
+      "ldr q7, [x14, #0xd0]\n"
+      ".inst 0x6fa0e8c8  // udot v8.4s, v6.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8cc  // udot v12.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8d0  // udot v16.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e8d4  // udot v20.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x6fa4e8d8  // udot v24.4s, v6.16b, v4.4b[3]\n"
+      ".inst 0x6fa5e8dc  // udot v28.4s, v6.16b, v5.4b[3]\n"
+      "ldr q6, [x14, #0xe0]\n"
+      ".inst 0x6fa0e8e9  // udot v9.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8ed  // udot v13.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8f1  // udot v17.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e8f5  // udot v21.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x6fa4e8f9  // udot v25.4s, v7.16b, v4.4b[3]\n"
+      ".inst 0x6fa5e8fd  // udot v29.4s, v7.16b, v5.4b[3]\n"
+      "ldr q7, [x14, #0xf0]\n"
+      ".inst 0x6fa0e8ca  // udot v10.4s, v6.16b, v0.4b[3]\n"
+      "add x14, x14, #0x100\n"
+      ".inst 0x6fa1e8ce  // udot v14.4s, v6.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8d2  // udot v18.4s, v6.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e8d6  // udot v22.4s, v6.16b, v3.4b[3]\n"
+      ".inst 0x6fa4e8da  // udot v26.4s, v6.16b, v4.4b[3]\n"
+      ".inst 0x6fa5e8de  // udot v30.4s, v6.16b, v5.4b[3]\n"
+      ".inst 0x6fa0e8eb  // udot v11.4s, v7.16b, v0.4b[3]\n"
+      ".inst 0x6fa1e8ef  // udot v15.4s, v7.16b, v1.4b[3]\n"
+      ".inst 0x6fa2e8f3  // udot v19.4s, v7.16b, v2.4b[3]\n"
+      ".inst 0x6fa3e8f7  // udot v23.4s, v7.16b, v3.4b[3]\n"
+      ".inst 0x6fa4e8fb  // udot v27.4s, v7.16b, v4.4b[3]\n"
+      ".inst 0x6fa5e8ff  // udot v31.4s, v7.16b, v5.4b[3]\n"
+      "195:"  // Height 6: Multiply loop: Main loop skip
+      "cbz x11, 200f\n"
+      "cmp x11, #0x4\n"
+      "blt 197f\n"
+      "196:"  // Height 6: Multiply loop: Odd block loop
+      "ldr s0, [x10], #0x4\n"
+      "ldr s1, [x28], #0x4\n"
+      "ldr s2, [x26], #0x4\n"
+      "ldr s3, [x24], #0x4\n"
+      "ldr s4, [x22], #0x4\n"
+      "ldr s5, [x20], #0x4\n"
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
+      "sub x11, x11, #0x4\n"
+      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
+      "cmp x11, #0x4\n"
+      ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0d8  // udot v24.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x6f85e0dc  // udot v28.4s, v6.16b, v5.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0f9  // udot v25.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x6f85e0fd  // udot v29.4s, v7.16b, v5.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0da  // udot v26.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x6f85e0de  // udot v30.4s, v6.16b, v5.4b[0]\n"
+      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0fb  // udot v27.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x6f85e0ff  // udot v31.4s, v7.16b, v5.4b[0]\n"
+      "bge 196b\n"
+      "cbz x11, 200f\n"
+      "197:"  // Height 6: Multiply loop: Skip odd blocks
+      "tbz x11, #1, 198f\n"
+      "ldr h0, [x10], #0x2\n"
+      "ldr h1, [x28], #0x2\n"
+      "ldr h2, [x26], #0x2\n"
+      "ldr h3, [x24], #0x2\n"
+      "ldr h4, [x22], #0x2\n"
+      "ldr h5, [x20], #0x2\n"
+      "tbz x11, #0, 199f\n"
+      "ld1 { v0.b }[2], [x10]\n"
+      "ld1 { v1.b }[2], [x28]\n"
+      "ld1 { v2.b }[2], [x26]\n"
+      "ld1 { v3.b }[2], [x24]\n"
+      "ld1 { v4.b }[2], [x22]\n"
+      "ld1 { v5.b }[2], [x20]\n"
+      "b 199f\n"
+      "198:"  // Height 6: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b0, [x10, #0x0]\n"
+      "ldr b1, [x28, #0x0]\n"
+      "ldr b2, [x26, #0x0]\n"
+      "ldr b3, [x24, #0x0]\n"
+      "ldr b4, [x22, #0x0]\n"
+      "ldr b5, [x20, #0x0]\n"
+      "199:"  // Height 6: Multiply loop: Ragged operand read: Done
+      "ldr q6, [x14, #0x0]\n"
+      ".inst 0x6f80e0c8  // udot v8.4s, v6.16b, v0.4b[0]\n"
+      "ldr q7, [x14, #0x10]\n"
+      ".inst 0x6f81e0cc  // udot v12.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0d0  // udot v16.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0d4  // udot v20.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0d8  // udot v24.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x6f85e0dc  // udot v28.4s, v6.16b, v5.4b[0]\n"
+      "ldr q6, [x14, #0x20]\n"
+      ".inst 0x6f80e0e9  // udot v9.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ed  // udot v13.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f1  // udot v17.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0f5  // udot v21.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0f9  // udot v25.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x6f85e0fd  // udot v29.4s, v7.16b, v5.4b[0]\n"
+      "ldr q7, [x14, #0x30]\n"
+      ".inst 0x6f80e0ca  // udot v10.4s, v6.16b, v0.4b[0]\n"
+      "add x14, x14, #0x40\n"
+      ".inst 0x6f81e0ce  // udot v14.4s, v6.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0d2  // udot v18.4s, v6.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0d6  // udot v22.4s, v6.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0da  // udot v26.4s, v6.16b, v4.4b[0]\n"
+      ".inst 0x6f85e0de  // udot v30.4s, v6.16b, v5.4b[0]\n"
+      ".inst 0x6f80e0eb  // udot v11.4s, v7.16b, v0.4b[0]\n"
+      ".inst 0x6f81e0ef  // udot v15.4s, v7.16b, v1.4b[0]\n"
+      ".inst 0x6f82e0f3  // udot v19.4s, v7.16b, v2.4b[0]\n"
+      ".inst 0x6f83e0f7  // udot v23.4s, v7.16b, v3.4b[0]\n"
+      ".inst 0x6f84e0fb  // udot v27.4s, v7.16b, v4.4b[0]\n"
+      ".inst 0x6f85e0ff  // udot v31.4s, v7.16b, v5.4b[0]\n"
+      "200:"  // Height 6: Multiply loop: No odd multiplies
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "add x12, x12, #0x1\n"
+      "cmp x12, x19\n"
+      "bne 190b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "cmp x15, #0x10\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "prfm pstl1keep, [x21, #0x0]\n"
+      "bge 209f\n"
+      "tbz x15, #3, 204f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v9.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v13.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "st1 { v17.4s }, [x27], #0x10\n"
+      "st1 { v20.4s }, [x25], #0x10\n"
+      "st1 { v21.4s }, [x25], #0x10\n"
+      "st1 { v24.4s }, [x23], #0x10\n"
+      "st1 { v25.4s }, [x23], #0x10\n"
+      "st1 { v28.4s }, [x21], #0x10\n"
+      "st1 { v29.4s }, [x21], #0x10\n"
+      "tbz x15, #2, 202f\n"
+      "st1 { v10.4s }, [x13], #0x10\n"
+      "st1 { v14.4s }, [x9], #0x10\n"
+      "st1 { v18.4s }, [x27], #0x10\n"
+      "st1 { v22.4s }, [x25], #0x10\n"
+      "st1 { v26.4s }, [x23], #0x10\n"
+      "st1 { v30.4s }, [x21], #0x10\n"
+      "tbz x15, #1, 201f\n"
+      "str d11, [x13], #0x8\n"
+      "str d15, [x9], #0x8\n"
+      "str d19, [x27], #0x8\n"
+      "str d23, [x25], #0x8\n"
+      "str d27, [x23], #0x8\n"
+      "str d31, [x21], #0x8\n"
+      "tbz x15, #0, 208f\n"
+      "st1 { v11.s }[2], [x13]\n"
+      "st1 { v15.s }[2], [x9]\n"
+      "st1 { v19.s }[2], [x27]\n"
+      "st1 { v23.s }[2], [x25]\n"
+      "st1 { v27.s }[2], [x23]\n"
+      "st1 { v31.s }[2], [x21]\n"
+      "b 208f\n"
+      "201:"  // Height 6: Partial direct writeback: partial_1_12
+      "tbz x15, #0, 208f\n"
+      "str s11, [x13, #0x0]\n"
+      "str s15, [x9, #0x0]\n"
+      "str s19, [x27, #0x0]\n"
+      "str s23, [x25, #0x0]\n"
+      "str s27, [x23, #0x0]\n"
+      "str s31, [x21, #0x0]\n"
+      "b 208f\n"
+      "202:"  // Height 6: Partial direct writeback: partial_2_8
+      "tbz x15, #1, 203f\n"
+      "str d10, [x13], #0x8\n"
+      "str d14, [x9], #0x8\n"
+      "str d18, [x27], #0x8\n"
+      "str d22, [x25], #0x8\n"
+      "str d26, [x23], #0x8\n"
+      "str d30, [x21], #0x8\n"
+      "tbz x15, #0, 208f\n"
+      "st1 { v10.s }[2], [x13]\n"
+      "st1 { v14.s }[2], [x9]\n"
+      "st1 { v18.s }[2], [x27]\n"
+      "st1 { v22.s }[2], [x25]\n"
+      "st1 { v26.s }[2], [x23]\n"
+      "st1 { v30.s }[2], [x21]\n"
+      "b 208f\n"
+      "203:"  // Height 6: Partial direct writeback: partial_1_8
+      "tbz x15, #0, 208f\n"
+      "str s10, [x13, #0x0]\n"
+      "str s14, [x9, #0x0]\n"
+      "str s18, [x27, #0x0]\n"
+      "str s22, [x25, #0x0]\n"
+      "str s26, [x23, #0x0]\n"
+      "str s30, [x21, #0x0]\n"
+      "b 208f\n"
+      "204:"  // Height 6: Partial direct writeback: partial_4_0
+      "tbz x15, #2, 206f\n"
+      "st1 { v8.4s }, [x13], #0x10\n"
+      "st1 { v12.4s }, [x9], #0x10\n"
+      "st1 { v16.4s }, [x27], #0x10\n"
+      "st1 { v20.4s }, [x25], #0x10\n"
+      "st1 { v24.4s }, [x23], #0x10\n"
+      "st1 { v28.4s }, [x21], #0x10\n"
+      "tbz x15, #1, 205f\n"
+      "str d9, [x13], #0x8\n"
+      "str d13, [x9], #0x8\n"
+      "str d17, [x27], #0x8\n"
+      "str d21, [x25], #0x8\n"
+      "str d25, [x23], #0x8\n"
+      "str d29, [x21], #0x8\n"
+      "tbz x15, #0, 208f\n"
+      "st1 { v9.s }[2], [x13]\n"
+      "st1 { v13.s }[2], [x9]\n"
+      "st1 { v17.s }[2], [x27]\n"
+      "st1 { v21.s }[2], [x25]\n"
+      "st1 { v25.s }[2], [x23]\n"
+      "st1 { v29.s }[2], [x21]\n"
+      "b 208f\n"
+      "205:"  // Height 6: Partial direct writeback: partial_1_4
+      "tbz x15, #0, 208f\n"
+      "str s9, [x13, #0x0]\n"
+      "str s13, [x9, #0x0]\n"
+      "str s17, [x27, #0x0]\n"
+      "str s21, [x25, #0x0]\n"
+      "str s25, [x23, #0x0]\n"
+      "str s29, [x21, #0x0]\n"
+      "b 208f\n"
+      "206:"  // Height 6: Partial direct writeback: partial_2_0
+      "tbz x15, #1, 207f\n"
+      "str d8, [x13], #0x8\n"
+      "str d12, [x9], #0x8\n"
+      "str d16, [x27], #0x8\n"
+      "str d20, [x25], #0x8\n"
+      "str d24, [x23], #0x8\n"
+      "str d28, [x21], #0x8\n"
+      "tbz x15, #0, 208f\n"
+      "st1 { v8.s }[2], [x13]\n"
+      "st1 { v12.s }[2], [x9]\n"
+      "st1 { v16.s }[2], [x27]\n"
+      "st1 { v20.s }[2], [x25]\n"
+      "st1 { v24.s }[2], [x23]\n"
+      "st1 { v28.s }[2], [x21]\n"
+      "b 208f\n"
+      "207:"  // Height 6: Partial direct writeback: partial_1_0
+      "str s8, [x13, #0x0]\n"
+      "str s12, [x9, #0x0]\n"
+      "str s16, [x27, #0x0]\n"
+      "str s20, [x25, #0x0]\n"
+      "str s24, [x23, #0x0]\n"
+      "str s28, [x21, #0x0]\n"
+      "208:"  // Height 6: Partial direct writeback: Done
+      "b 210f\n"
+      "209:"  // Height 6: Full writeback
+      "str q8, [x13, #0x0]\n"
+      "str q9, [x13, #0x10]\n"
+      "str q10, [x13, #0x20]\n"
+      "str q11, [x13, #0x30]\n"
+      "str q12, [x9, #0x0]\n"
+      "str q13, [x9, #0x10]\n"
+      "str q14, [x9, #0x20]\n"
+      "str q15, [x9, #0x30]\n"
+      "str q16, [x27, #0x0]\n"
+      "str q17, [x27, #0x10]\n"
+      "str q18, [x27, #0x20]\n"
+      "str q19, [x27, #0x30]\n"
+      "str q20, [x25, #0x0]\n"
+      "str q21, [x25, #0x10]\n"
+      "str q22, [x25, #0x20]\n"
+      "str q23, [x25, #0x30]\n"
+      "str q24, [x23, #0x0]\n"
+      "str q25, [x23, #0x10]\n"
+      "str q26, [x23, #0x20]\n"
+      "str q27, [x23, #0x30]\n"
+      "str q28, [x21, #0x0]\n"
+      "str q29, [x21, #0x10]\n"
+      "str q30, [x21, #0x20]\n"
+      "str q31, [x21, #0x30]\n"
+      "add x13, x13, #0x40\n"
+      "add x9, x9, #0x40\n"
+      "add x27, x27, #0x40\n"
+      "add x25, x25, #0x40\n"
+      "add x23, x23, #0x40\n"
+      "add x21, x21, #0x40\n"
+      "210:"  // Height 6: Writeback done
+      "subs x15, x15, #0x10\n"
+      "bgt 178b\n"
+      "subs %x[M], %x[M], #0x6\n"
+      "beq 212f\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "tbz %x[flags], #3, 211f\n"
+      "add x20, x20, #0x6\n"
+      "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "b 1b\n"
+      "211:"  // Update direct input
+      "mov x19, #0x6\n"
+      "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+      "b 1b\n"
+      "212:"  // Exit
+
+      : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+      : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+    );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp
deleted file mode 100644
index 58a51432fd..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp
+++ /dev/null
@@ -1,328 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include "../../bfloat.hpp"
-#include "../../asmlib.hpp"
-
-namespace arm_gemm {
-
-void a64_interleaved_bf16fp32_dot_12x8_x1(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
-    const bfloat16 *a_ptr = Apanel;
-    float *c_ptr = Cpanel;
-
-    K /= 2;
-    const long loops_count = (K / 2) - 1;
-    const long tails_count = K % 2;
-
-    for (int yb=0; yb<ablocks; yb++) {
-        const bfloat16 *a_ptr0 = a_ptr;
-        const bfloat16 *b_ptr = Bpanel;
-
-        for (int xb=0; xb<bblocks; xb++) {
-            a_ptr = a_ptr0;
-            long loops = loops_count;
-            long tails = tails_count;
-
-            __asm __volatile (
-                "movi v8.4s, #0\n"
-                "ldr q0, [%[a_ptr]]\n"
-                "movi v9.4s, #0\n"
-                "ldr q2, [%[b_ptr]]\n"
-                "movi v10.4s, #0\n"
-                "ldr q1, [%[a_ptr], #0x10]\n"
-                "movi v11.4s, #0\n"
-                "ldr q3, [%[b_ptr], #0x10]\n"
-                "movi v12.4s, #0\n"
-                "ldr q4, [%[b_ptr], #0x20]\n"
-                "movi v13.4s, #0\n"
-                "add %[a_ptr], %[a_ptr], #0x20\n"
-                "movi v14.4s, #0\n"
-                "add %[b_ptr], %[b_ptr], #0x30\n"
-                "movi v15.4s, #0\n"
-                "movi v16.4s, #0\n"
-                "movi v17.4s, #0\n"
-                "movi v18.4s, #0\n"
-                "movi v19.4s, #0\n"
-                "movi v20.4s, #0\n"
-                "movi v21.4s, #0\n"
-                "movi v22.4s, #0\n"
-                "movi v23.4s, #0\n"
-                "movi v24.4s, #0\n"
-                "movi v25.4s, #0\n"
-                "movi v26.4s, #0\n"
-                "movi v27.4s, #0\n"
-                "movi v28.4s, #0\n"
-                "movi v29.4s, #0\n"
-                "movi v30.4s, #0\n"
-                "movi v31.4s, #0\n"
-                "cbz %[loops], 1f\n"
-                "2:\n"
-                ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
-                "subs %[loops], %[loops], #0x1\n"
-                ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
-                ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
-                ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
-                ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
-                ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
-                ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
-                ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
-                "ldr q2, [%[b_ptr]]\n"
-                ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
-                ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
-                ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
-                ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
-                ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
-                ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
-                ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
-                ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
-                "ldr q3, [%[b_ptr], #0x10]\n"
-                ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
-                ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
-                ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
-                ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
-                "ldr q0, [%[a_ptr]]\n"
-                ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
-                ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
-                ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
-                ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
-                "ldr q4, [%[b_ptr], #0x20]\n"
-                ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
-                "ldr q1, [%[a_ptr], #0x10]\n"
-                ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
-                "add %[a_ptr], %[a_ptr], #0x40\n"
-                ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
-                "add %[b_ptr], %[b_ptr], #0x60\n"
-                ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
-                ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
-                ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
-                ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
-                ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
-                "ldr q2, [%[b_ptr], #-0x30]\n"
-                ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
-                ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
-                ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
-                ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
-                ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
-                ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
-                ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
-                ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
-                "ldr q3, [%[b_ptr], #-0x20]\n"
-                ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
-                ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
-                ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
-                ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
-                "ldr q0, [%[a_ptr], #-0x20]\n"
-                ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
-                ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
-                ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
-                ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
-                "ldr q4, [%[b_ptr], #-0x10]\n"
-                "ldr q1, [%[a_ptr], #-0x10]\n"
-                "b.ne 2b\n"
-                "1:\n"
-                "cbz %[tails], 3f\n"
-                ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
-                ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
-                ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
-                ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
-                ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
-                ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
-                ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
-                ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
-                "ldr q2, [%[b_ptr]]\n"
-                ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
-                ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
-                ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
-                ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
-                ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
-                ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
-                ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
-                ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
-                "ldr q3, [%[b_ptr], #0x10]\n"
-                ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
-                ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
-                ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
-                ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
-                "ldr q0, [%[a_ptr]]\n"
-                ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
-                ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
-                ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
-                ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
-                "ldr q4, [%[b_ptr], #0x20]\n"
-                ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
-                "ldr q1, [%[a_ptr], #0x10]\n"
-                ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
-                "add %[a_ptr], %[a_ptr], #0x40\n"
-                ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
-                "add %[b_ptr], %[b_ptr], #0x60\n"
-                ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
-                ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
-                ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
-                ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
-                ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
-                "ldr q2, [%[b_ptr], #-0x30]\n"
-                ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
-                ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
-                ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
-                ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
-                ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
-                ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
-                ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
-                ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
-                "ldr q3, [%[b_ptr], #-0x20]\n"
-                ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
-                ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
-                ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
-                ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
-                "ldr q0, [%[a_ptr], #-0x20]\n"
-                ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
-                ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
-                ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
-                ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
-                "ldr q4, [%[b_ptr], #-0x10]\n"
-                ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
-                "ldr q1, [%[a_ptr], #-0x10]\n"
-                ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
-                ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
-                ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
-                "str q8, [%[c_ptr]]\n"
-                ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
-                ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
-                ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
-                ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
-                ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
-                ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
-                ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
-                ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
-                "str q12, [%[c_ptr], #0x10]\n"
-                ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
-                ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
-                ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
-                ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
-                ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
-                ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
-                ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
-                ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
-                "str q16, [%[c_ptr], #0x20]\n"
-                ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
-                ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
-                ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
-                "str q9, [%[c_ptr], #0x30]\n"
-                ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
-                "b 4f\n"
-                "3:\n"
-                ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
-                "add %[a_ptr], %[a_ptr], #0x20\n"
-                ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
-                "add %[b_ptr], %[b_ptr], #0x30\n"
-                ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
-                ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
-                ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
-                ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
-                ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
-                ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
-                "ldr q2, [%[b_ptr], #-0x30]\n"
-                ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
-                ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
-                ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
-                ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
-                ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
-                ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
-                ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
-                ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
-                "ldr q3, [%[b_ptr], #-0x20]\n"
-                ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
-                ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
-                ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
-                ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
-                "ldr q0, [%[a_ptr], #-0x20]\n"
-                ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
-                ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
-                ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
-                ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
-                "ldr q4, [%[b_ptr], #-0x10]\n"
-                ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
-                "ldr q1, [%[a_ptr], #-0x10]\n"
-                ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
-                ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
-                ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
-                "str q8, [%[c_ptr]]\n"
-                ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
-                ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
-                ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
-                ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
-                ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
-                ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
-                ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
-                ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
-                "str q12, [%[c_ptr], #0x10]\n"
-                ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
-                ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
-                ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
-                ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
-                ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
-                ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
-                ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
-                ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
-                "str q16, [%[c_ptr], #0x20]\n"
-                ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
-                ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
-                ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
-                "str q9, [%[c_ptr], #0x30]\n"
-                ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
-                "4:\n"
-                "str q13, [%[c_ptr], #0x40]\n"
-                "str q17, [%[c_ptr], #0x50]\n"
-                "str q10, [%[c_ptr], #0x60]\n"
-                "str q14, [%[c_ptr], #0x70]\n"
-                "str q18, [%[c_ptr], #0x80]\n"
-                "str q11, [%[c_ptr], #0x90]\n"
-                "str q15, [%[c_ptr], #0xa0]\n"
-                "str q19, [%[c_ptr], #0xb0]\n"
-                "str q20, [%[c_ptr], #0xc0]\n"
-                "str q24, [%[c_ptr], #0xd0]\n"
-                "str q28, [%[c_ptr], #0xe0]\n"
-                "str q21, [%[c_ptr], #0xf0]\n"
-                "str q25, [%[c_ptr], #0x100]\n"
-                "str q29, [%[c_ptr], #0x110]\n"
-                "str q22, [%[c_ptr], #0x120]\n"
-                "str q26, [%[c_ptr], #0x130]\n"
-                "str q30, [%[c_ptr], #0x140]\n"
-                "str q23, [%[c_ptr], #0x150]\n"
-                "str q27, [%[c_ptr], #0x160]\n"
-                "str q31, [%[c_ptr], #0x170]\n"
-                "add %[c_ptr], %[c_ptr], #0x180\n"
-            : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
-              [loops] "+r" (loops), [tails] "+r" (tails)
-            :
-            : "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
-            );
-        }
-    }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp
index 95fed86c2f..2fea5ad2e7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp
@@ -31,10 +31,9 @@
 namespace arm_gemm {
 
 // Actual kernel implementations
-void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *, const bfloat16 *, float *, int, int, int);
-void a64_interleaved_bf16fp32_dot_12x8_x1(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+void a64_interleaved_bf16fp32_dot_8x12(const bfloat16 *, const bfloat16 *, float *, int, int, int);
 
-class interleaved_bf16fp32_dot_12x8 {
+class cls_a64_interleaved_bf16fp32_dot_8x12 {
 public:
     typedef bfloat16 operand_type;
     typedef float result_type;
@@ -60,13 +59,11 @@ public:
     // Use the standard fixed size transforms.
     StdTransformsFixed<operand_type, result_type, 8, 12, 2> transforms = {};
 
-    kern_type kernel=a64_interleaved_bf16fp32_dot_12x8;
+    kern_type kernel=a64_interleaved_bf16fp32_dot_8x12;
 
-    interleaved_bf16fp32_dot_12x8(const CPUInfo *ci)
+    cls_a64_interleaved_bf16fp32_dot_8x12(const CPUInfo *)
     {
-        if (ci->get_cpu_model() == CPUModel::X1) {
-            kernel = a64_interleaved_bf16fp32_dot_12x8_x1;
-        }
+
     }
 };
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp
index 7ffae524dc..92149a5579 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp
@@ -28,7 +28,7 @@
 
 namespace arm_gemm {
 
-void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void a64_interleaved_bf16fp32_dot_8x12(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
     const bfloat16 *a_ptr = Apanel;
     float *c_ptr = Cpanel;
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
index 7fac59947e..b2c2407b28 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
@@ -31,9 +31,9 @@
 namespace arm_gemm {
 
 // Actual kernel implementations
-void a64_interleaved_bf16fp32_mmla_12x8(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+void a64_interleaved_bf16fp32_mmla_8x12(const bfloat16 *, const bfloat16 *, float *, int, int, int);
 
-class interleaved_bf16fp32_mmla_12x8 {
+class cls_a64_interleaved_bf16fp32_mmla_8x12 {
 public:
     typedef bfloat16 operand_type;
     typedef float result_type;
@@ -59,9 +59,9 @@ public:
     // Use the standard fixed size transforms.
     StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms = {};
 
-    kern_type kernel=a64_interleaved_bf16fp32_mmla_12x8;
+    kern_type kernel=a64_interleaved_bf16fp32_mmla_8x12;
 
-    interleaved_bf16fp32_mmla_12x8(const CPUInfo *)
+    cls_a64_interleaved_bf16fp32_mmla_8x12(const CPUInfo *)
     {
 
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp
index 7f0eff29af..c476fcf171 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp
@@ -28,7 +28,7 @@
 
 namespace arm_gemm {
 
-void a64_interleaved_bf16fp32_mmla_12x8(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void a64_interleaved_bf16fp32_mmla_8x12(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
     const bfloat16 *a_ptr = Apanel;
     float *c_ptr = Cpanel;
 
@@ -87,13 +87,23 @@ void a64_interleaved_bf16fp32_mmla_12x8(const bfloat16 *Apanel, const bfloat16 *
                 "movi v27.4s, #0\n"
                 "prfm PLDL1KEEP, [%[b_ptr], #0x1c0]\n"
                 "movi v28.4s, #0\n"
-                "prfm PLDL1KEEP, [%[b_ptr], #0x200]\n"
+                "prfm PLDL1KEEP, [%[a_ptr], #0x200]\n"
                 "movi v29.4s, #0\n"
-                "prfm PLDL1KEEP, [%[b_ptr], #0x240]\n"
+                "prfm PLDL1KEEP, [%[b_ptr], #0x200]\n"
                 "movi v30.4s, #0\n"
-                "prfm PLDL1KEEP, [%[b_ptr], #0x280]\n"
+                "prfm PLDL1KEEP, [%[a_ptr], #0x240]\n"
                 "movi v31.4s, #0\n"
+                "prfm PLDL1KEEP, [%[b_ptr], #0x240]\n"
+                "prfm PLDL1KEEP, [%[a_ptr], #0x280]\n"
+                "prfm PLDL1KEEP, [%[b_ptr], #0x280]\n"
+                "prfm PLDL1KEEP, [%[a_ptr], #0x2c0]\n"
                 "prfm PLDL1KEEP, [%[b_ptr], #0x2c0]\n"
+                "prfm PLDL1KEEP, [%[b_ptr], #0x300]\n"
+                "prfm PLDL1KEEP, [%[b_ptr], #0x340]\n"
+                "prfm PLDL1KEEP, [%[b_ptr], #0x380]\n"
+                "prfm PLDL1KEEP, [%[b_ptr], #0x3c0]\n"
+                "prfm PLDL1KEEP, [%[b_ptr], #0x400]\n"
+                "prfm PLDL1KEEP, [%[b_ptr], #0x440]\n"
                 "add %[a_ptr], %[a_ptr], #0x40\n"
                 "add %[b_ptr], %[b_ptr], #0x40\n"
                 "cbz %[loops], 1f\n"
@@ -105,19 +115,19 @@ void a64_interleaved_bf16fp32_mmla_12x8(const bfloat16 *Apanel, const bfloat16 *
                 ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
                 "subs %[loops], %[loops], #0x1\n"
                 ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n"
-                "prfm PLDL1KEEP, [%[a_ptr], #0x1c0]\n"
+                "prfm PLDL1KEEP, [%[a_ptr], #0x2c0]\n"
                 ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
                 "ldr q4, [%[b_ptr]]\n"
                 ".inst 0x6e45ec2f // bfmmla v15.4s, v1.8h, v5.8h\n"
-                "prfm PLDL1KEEP, [%[b_ptr], #0x2c0]\n"
+                "prfm PLDL1KEEP, [%[b_ptr], #0x440]\n"
                 ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n"
-                "prfm PLDL1KEEP, [%[a_ptr], #0x200]\n"
+                "prfm PLDL1KEEP, [%[a_ptr], #0x300]\n"
                 ".inst 0x6e45ec7b // bfmmla v27.4s, v3.8h, v5.8h\n"
                 "ldr q5, [%[b_ptr], #0x10]\n"
                 ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
-                "prfm PLDL1KEEP, [%[b_ptr], #0x300]\n"
+                "prfm PLDL1KEEP, [%[b_ptr], #0x480]\n"
                 ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n"
-                "prfm PLDL1KEEP, [%[b_ptr], #0x340]\n"
+                "prfm PLDL1KEEP, [%[b_ptr], #0x4c0]\n"
                 ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
                 ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n"
                 "ldr q6, [%[b_ptr], #0x20]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
index 7bfb2291a9..b17b76f170 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
@@ -31,9 +31,9 @@
 namespace arm_gemm {
 
 // Actual kernel implementations
-void a64_interleaved_s8s32_mmla_12x8(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void a64_interleaved_s8s32_mmla_8x12(const int8_t *, const int8_t *, int32_t *, int, int, int);
 
-class interleaved_s8s32_mmla_12x8 {
+class cls_a64_interleaved_s8s32_mmla_8x12 {
 public:
     typedef int8_t operand_type;
     typedef int32_t result_type;
@@ -58,10 +58,11 @@ public:
 
     // Use the standard fixed size transforms.
     StdTransformsFixed<operand_type, result_type, 8, 12, 8> transforms = {};
+    StdTransformsFixed<operand_type, result_type, 8, 12, 8, true> transforms_quantized = {};
 
-    kern_type kernel=a64_interleaved_s8s32_mmla_12x8;
+    kern_type kernel=a64_interleaved_s8s32_mmla_8x12;
 
-    interleaved_s8s32_mmla_12x8(const CPUInfo *)
+    cls_a64_interleaved_s8s32_mmla_8x12(const CPUInfo *)
     {
 
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp
index 7953510aa7..2093e75b8e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp
@@ -28,7 +28,7 @@
 
 namespace arm_gemm {
 
-void a64_interleaved_s8s32_mmla_12x8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
+void a64_interleaved_s8s32_mmla_8x12(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
     const int8_t *a_ptr = Apanel;
     int32_t *c_ptr = Cpanel;
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
index d493517cf1..99dd0be0d9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
@@ -31,9 +31,9 @@
 namespace arm_gemm {
 
 // Actual kernel implementations
-void a64_interleaved_u8u32_mmla_12x8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void a64_interleaved_u8u32_mmla_8x12(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
 
-class interleaved_u8u32_mmla_12x8 {
+class cls_a64_interleaved_u8u32_mmla_8x12 {
 public:
     typedef uint8_t operand_type;
     typedef uint32_t result_type;
@@ -58,10 +58,11 @@ public:
 
     // Use the standard fixed size transforms.
     StdTransformsFixed<operand_type, result_type, 8, 12, 8> transforms = {};
+    StdTransformsFixed<operand_type, result_type, 8, 12, 8, true> transforms_quantized = {};
 
-    kern_type kernel=a64_interleaved_u8u32_mmla_12x8;
+    kern_type kernel=a64_interleaved_u8u32_mmla_8x12;
 
-    interleaved_u8u32_mmla_12x8(const CPUInfo *)
+    cls_a64_interleaved_u8u32_mmla_8x12(const CPUInfo *)
     {
 
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp
index dcd15f0345..568e5d1098 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp
@@ -28,7 +28,7 @@
 
 namespace arm_gemm {
 
-void a64_interleaved_u8u32_mmla_12x8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+void a64_interleaved_u8u32_mmla_8x12(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
     const uint8_t *a_ptr = Apanel;
     uint32_t *c_ptr = Cpanel;
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp
index 981ce34b49..d77e1b0ac2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp
@@ -30,13 +30,13 @@
 namespace arm_gemm {
 
 // Actual kernel implementations
-void a64_sgemm_asimd_12x8(const float *, const float *, float *, int, int, int);
-void a64_sgemm_asimd_12x8_a53(const float *, const float *, float *, int, int, int);
-void a64_sgemm_asimd_12x8_a55(const float *, const float *, float *, int, int, int);
-void a64_sgemm_asimd_12x8_a55r1(const float *, const float *, float *, int, int, int);
-void a64_sgemm_asimd_12x8_x1(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_8x12(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_8x12_a53(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_8x12_a55(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_8x12_a55r1(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_8x12_x1(const float *, const float *, float *, int, int, int);
 
-// 12x8 SGEMM "strategy" class.
+// 8x12 SGEMM "strategy" class.
 //
 // This describes the characteristics of a family of kernels, in terms of
 // the required interleave properties and the output block size.
@@ -44,7 +44,7 @@ void a64_sgemm_asimd_12x8_x1(const float *, const float *, float *, int, int, in
 // All kernels in the family must share these characteristics.  The actual
 // kernel to be used can be chosen at runtime, based on the CPU_type
 // structure.
-class sgemm_12x8 {
+class cls_a64_sgemm_8x12 {
 public:
     typedef float operand_type;
     typedef float result_type;
@@ -83,25 +83,25 @@ public:
         }
     }
 
-    kern_type kernel=a64_sgemm_asimd_12x8;
+    kern_type kernel=a64_sgemm_asimd_8x12;
 
-    sgemm_12x8(const CPUInfo *ci) {
+    cls_a64_sgemm_8x12(const CPUInfo *ci) {
         // Select specific kernel if available
         switch(ci->get_cpu_model()) {
             case CPUModel::A53:
-                kernel = a64_sgemm_asimd_12x8_a53;
+                kernel = a64_sgemm_asimd_8x12_a53;
                 break;
 
             case CPUModel::A55r0:
-                kernel = a64_sgemm_asimd_12x8_a55;
+                kernel = a64_sgemm_asimd_8x12_a55;
                 break;
 
             case CPUModel::A55r1:
-                kernel = a64_sgemm_asimd_12x8_a55r1;
+                kernel = a64_sgemm_asimd_8x12_a55r1;
                 break;
 
             case CPUModel::X1:
-                kernel = a64_sgemm_asimd_12x8_x1;
+                kernel = a64_sgemm_asimd_8x12_x1;
                 break;
 
             default:
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp
index 5532485efb..f4b6e7b70f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp
@@ -29,7 +29,7 @@
 
 namespace arm_gemm {
 
-void a64_sgemm_asimd_12x8_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void a64_sgemm_asimd_8x12_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
     const float *a_ptr = Apanel;
     float *c_ptr = Cpanel;
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp
index e9f071f7f4..5f86da8ef3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp
@@ -29,7 +29,7 @@
 
 namespace arm_gemm {
 
-void a64_sgemm_asimd_12x8_a55(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void a64_sgemm_asimd_8x12_a55(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
     const float *a_ptr = Apanel;
     float *c_ptr = Cpanel;
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55r1.cpp
index 8a6fbacfad..7709ad1be6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55r1.cpp
@@ -29,7 +29,7 @@
 
 namespace arm_gemm {
 
-void a64_sgemm_asimd_12x8_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, const int ablocks, const int bblocks, const int K) {
+void a64_sgemm_asimd_8x12_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, const int ablocks, const int bblocks, const int K) {
     const float *a_ptr = Apanel;
     float *c_ptr = Cpanel;
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/generic.cpp
index 48dc46785e..dc72095a9b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/generic.cpp
@@ -39,7 +39,7 @@
 
 namespace arm_gemm {
 
-void a64_sgemm_asimd_12x8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void a64_sgemm_asimd_8x12(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
     const float *a_ptr = Apanel;
     float *c_ptr = Cpanel;
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/x1.cpp
index 63fdf4df9f..89f8ac2d6c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/x1.cpp
@@ -39,7 +39,7 @@
 
 namespace arm_gemm {
 
-void a64_sgemm_asimd_12x8_x1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void a64_sgemm_asimd_8x12_x1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
     const float *a_ptr = Apanel;
     float *c_ptr = Cpanel;
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4.hpp
index 6f31efe6cb..5f7252f019 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4.hpp
@@ -25,13 +25,15 @@
 
 #ifdef __aarch64__
 
+
+
 namespace arm_gemm
 {
 
 // Actual kernel implementations
-void a64_smallK_hybrid_fp32_mla_4x6(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+void a64_smallK_hybrid_fp32_mla_6x4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
 
-class smallK_hybrid_fp32_mla_4x6
+class cls_a64_smallK_hybrid_fp32_mla_6x4
 {
 public:
     typedef float operand_type;
@@ -73,9 +75,9 @@ public:
     StdTransformsFixed<operand_type, result_type, 6, 4, 1> transforms = {};
 
     // Default to the generic kernel
-    kern_type kernel=a64_smallK_hybrid_fp32_mla_4x6;
+    kern_type kernel=a64_smallK_hybrid_fp32_mla_6x4;
 
-    smallK_hybrid_fp32_mla_4x6(const CPUInfo *)
+    cls_a64_smallK_hybrid_fp32_mla_6x4(const CPUInfo *)
     {
 
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp
index e2fec6af16..52548b462c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,7 +33,7 @@
 
 namespace arm_gemm {
 
-void a64_smallK_hybrid_fp32_mla_4x6(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) {
+void a64_smallK_hybrid_fp32_mla_6x4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) {
     const long loops_count = iceildiv(N, (int)4) - 1;
     const long ldab = lda * sizeof(float);
     const long ldcb = ldc * sizeof(float);
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4.hpp
index e9a094855a..a8e0c24eae 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4.hpp
@@ -25,13 +25,15 @@
 
 #ifdef __aarch64__
 
+
+
 namespace arm_gemm
 {
 
 // Actual kernel implementations
-void a64_smallK_hybrid_fp32_mla_4x8(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+void a64_smallK_hybrid_fp32_mla_8x4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
 
-class smallK_hybrid_fp32_mla_4x8
+class cls_a64_smallK_hybrid_fp32_mla_8x4
 {
 public:
     typedef float operand_type;
@@ -73,9 +75,9 @@ public:
     StdTransformsFixed<operand_type, result_type, 8, 4, 1> transforms = {};
 
     // Default to the generic kernel
-    kern_type kernel=a64_smallK_hybrid_fp32_mla_4x8;
+    kern_type kernel=a64_smallK_hybrid_fp32_mla_8x4;
 
-    smallK_hybrid_fp32_mla_4x8(const CPUInfo *)
+    cls_a64_smallK_hybrid_fp32_mla_8x4(const CPUInfo *)
     {
 
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp
index 11888bce74..deaef27ee9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,7 +33,7 @@
 
 namespace arm_gemm {
 
-void a64_smallK_hybrid_fp32_mla_4x8(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) {
+void a64_smallK_hybrid_fp32_mla_8x4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) {
     const long loops_count = iceildiv(N, (int)4) - 1;
     const long ldab = lda * sizeof(float);
     const long ldcb = ldc * sizeof(float);
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp
index fc087b73db..abf0eda008 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp
@@ -31,10 +31,10 @@ namespace arm_gemm
 {
 
 // Actual kernel implementations
-void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
-void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+void a64_smallK_hybrid_s8s32_dot_6x4(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+void a64_smallK_hybrid_s8s32_dot_6x4_a55(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
 
-class smallK_hybrid_s8s32_dot_4x6
+class cls_a64_smallK_hybrid_s8s32_dot_6x4
 {
 public:
     typedef int8_t operand_type;
@@ -76,12 +76,12 @@ public:
     StdTransformsFixed<operand_type, result_type, 6, 4, 4> transforms = {};
 
     // Default to the generic kernel
-    kern_type kernel=a64_smallK_hybrid_s8s32_dot_4x6;
+    kern_type kernel=a64_smallK_hybrid_s8s32_dot_6x4;
 
-    smallK_hybrid_s8s32_dot_4x6(const CPUInfo *ci)
+    cls_a64_smallK_hybrid_s8s32_dot_6x4(const CPUInfo *ci)
     {
         if (ci->get_cpu_model() == CPUModel::A55r1) {
-            kernel = a64_smallK_hybrid_s8s32_dot_4x6_a55;
+            kernel = a64_smallK_hybrid_s8s32_dot_6x4_a55;
         }
     }
 };
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/a55.cpp
index 2d6d2f064c..a9926602fc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/a55.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,7 +33,7 @@
 
 namespace arm_gemm {
 
-void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
+void a64_smallK_hybrid_s8s32_dot_6x4_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
     const long loops_count = iceildiv(N, (int)4) - 1;
     const long ldab = lda * sizeof(int8_t);
     const long ldcb = ldc * sizeof(int32_t);
@@ -97,6 +97,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q3, [a_ptr1], #0x10\n"
                     "ldr q6, [a_ptr2], #0x10\n"
@@ -107,18 +108,29 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     "ldr q4, [a_ptr1], #0x10\n"
                     "ldr q7, [a_ptr2], #0x10\n"
                     "ldr q10, [a_ptr3], #0x10\n"
-                    "ldr q13, [a_ptr4], #0x10\n"
-                    "ldr q16, [a_ptr5], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr s2, [%[a_ptr0]]\n"
+                    "ldr q13, [a_ptr4], #0x10\n"
                     "ldr s5, [a_ptr1]\n"
+                    "ldr q16, [a_ptr5], #0x10\n"
                     "ldr s8, [a_ptr2]\n"
                     "ldr s11, [a_ptr3]\n"
                     "ldr s14, [a_ptr4]\n"
                     "ldr s17, [a_ptr5]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
                     "subs %[odds], %[odds], #0x1\n"
+                    "ldr q3, [a_ptr1], #0x10\n"
+                    "ldr q6, [a_ptr2], #0x10\n"
+                    "ldr q9, [a_ptr3], #0x10\n"
+                    "ldr q12, [a_ptr4], #0x10\n"
+                    "ldr q15, [a_ptr5], #0x10\n"
+                    "ldr q1, [%[a_ptr0]], #0x10\n"
+                    "ldr q4, [a_ptr1], #0x10\n"
+                    "ldr q7, [a_ptr2], #0x10\n"
+                    "ldr q10, [a_ptr3], #0x10\n"
+                    "ldr q13, [a_ptr4], #0x10\n"
+                    "ldr q16, [a_ptr5], #0x10\n"
                     "b.ne 4f\n"
                     "ldr b2, [%[a_ptr0]]\n"
                     "ldr b5, [a_ptr1]\n"
@@ -145,40 +157,42 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     "ld1 {v14.b}[2], [a_ptr4]\n"
                     "ld1 {v17.b}[2], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q18, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "ldr q20, [%[b_ptr0], #0x20]\n"
-                    "movi v29.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "ldr q21, [%[b_ptr0], #0x30]\n"
-                    "movi v30.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     "ldr q22, [%[b_ptr0], #0x40]\n"
-                    "movi v31.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     "ldr q23, [%[b_ptr0], #0x50]\n"
-                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     "ldr q24, [%[b_ptr0], #0x60]\n"
-                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     "ldr q25, [%[b_ptr0], #0x70]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                     ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                     ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
                     ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
                     ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
@@ -222,173 +236,219 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
                     ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
                     ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d18, [%[b_ptr0]]\n"
+                    "b.eq 7f\n"
+                    "8:\n"
+                    "str q26, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
+                    "movi v26.4s, #0\n"
+                    "ldr d18, [%[b_ptr0]]\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "str q27, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v27.4s, #0\n"
                     "ldr d19, [%[b_ptr0], #0x10]\n"
                     "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "str q28, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "movi v28.4s, #0\n"
                     "ldr d20, [%[b_ptr0], #0x20]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "str q29, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "movi v29.4s, #0\n"
                     "ldr d21, [%[b_ptr0], #0x30]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "str q30, [c_ptr4]\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "movi v30.4s, #0\n"
                     "ldr d22, [%[b_ptr0], #0x40]\n"
                     "ins v18.d[1], temploadreg2\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "str q31, [c_ptr5]\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "movi v31.4s, #0\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
+                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
                     "ldr d23, [%[b_ptr0], #0x50]\n"
+                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     "ins v19.d[1], temploadreg3\n"
+                    ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
                     "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
+                    ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
                     "ldr d24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
                     "ins v20.d[1], temploadreg0\n"
+                    ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
+                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     "ldr d25, [%[b_ptr0], #0x70]\n"
+                    ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
                     "ins v21.d[1], temploadreg1\n"
+                    ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
                     "ins v22.d[1], temploadreg2\n"
+                    ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
                     "ins v23.d[1], temploadreg3\n"
+                    ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
                     "ins v24.d[1], temploadreg0\n"
+                    ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
                     "ins v25.d[1], temploadreg1\n"
-                    "b.eq 7f\n"
-                    "8:\n"
-                    "str q26, [%[c_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "movi v26.4s, #0\n"
+                    ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+                    "ldr d18, [%[b_ptr0]]\n"
+                    ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+                    ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
+                    ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
+                    "ins v18.d[1], temploadreg2\n"
+                    ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
+                    ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
+                    ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
+                    ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
+                    ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
+                    ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
+                    ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
+                    ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
+                    ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
+                    ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
+                    ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
+                    ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
+                    ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
+                    ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
+                    ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
+                    ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
+                    ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+                    ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
+                    ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
+                    ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
+                    ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
+                    ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
+                    ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
+                    ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
+                    ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
+                    "b.ne 8b\n"
+                    "7:\n"
+                    "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
+                    "ldr q21, [%[b_ptr0], #0x30]\n"
+                    "ldr q22, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+                    "ldr q24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
-                    "ldr d18, [%[b_ptr0]]\n"
-                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
-                    ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
-                    "ins v18.d[1], temploadreg2\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                     ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
                     ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
-                    "ldr d19, [%[b_ptr0], #0x10]\n"
                     ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
                     ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
-                    "ins v19.d[1], temploadreg3\n"
                     ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
                     ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
-                    "ldr d20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
-                    "ins v20.d[1], temploadreg0\n"
                     ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
                     ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
                     ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
-                    "ldr d21, [%[b_ptr0], #0x30]\n"
                     ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
                     ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
                     ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
-                    "ins v21.d[1], temploadreg1\n"
                     ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
                     ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
                     ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
-                    "ldr d22, [%[b_ptr0], #0x40]\n"
                     ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
                     ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
                     ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
                     ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
                     ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
                     ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
-                    "ldr d23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
                     ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
                     ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
-                    "ins v23.d[1], temploadreg3\n"
                     ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
                     ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
                     ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
-                    "ldr d24, [%[b_ptr0], #0x60]\n"
                     ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
                     ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
                     ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
-                    "ins v24.d[1], temploadreg0\n"
                     ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
                     ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
                     ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
-                    "ldr d25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
                     ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
                     ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
-                    "ins v25.d[1], temploadreg1\n"
                     ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
                     ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
                     ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
-                    "ldr d18, [%[b_ptr0]]\n"
-                    "ins v18.d[1], temploadreg2\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                    "ins v22.d[1], temploadreg2\n"
-                    "b.ne 8b\n"
-                    "7:\n"
-                    "str q26, [%[c_ptr0]]\n"
-                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "b 9f\n"
+                    "6:\n"
                     "movi v26.4s, #0\n"
-                    "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
-                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
-                    "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
-                    "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
-                    "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
-                    "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
+                    ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+                    ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
                     ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
-                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
                     "ldr q18, [%[b_ptr0]]\n"
-                    ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
+                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     "add %[b_ptr0], %[b_ptr0], #0x10\n"
+                    ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
                     ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
                     ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
                     ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
@@ -435,19 +495,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
                     ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
                     ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
-                    "6:\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -514,6 +569,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q3, [a_ptr1], #0x10\n"
                     "ldr q6, [a_ptr2], #0x10\n"
@@ -524,24 +580,35 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     "ldr q4, [a_ptr1], #0x10\n"
                     "ldr q7, [a_ptr2], #0x10\n"
                     "ldr q10, [a_ptr3], #0x10\n"
-                    "ldr q13, [a_ptr4], #0x10\n"
-                    "ldr q16, [a_ptr5], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr d2, [%[a_ptr0]]\n"
+                    "ldr q13, [a_ptr4], #0x10\n"
                     "ldr d5, [a_ptr1]\n"
+                    "ldr q16, [a_ptr5], #0x10\n"
                     "ldr d8, [a_ptr2]\n"
                     "ldr d11, [a_ptr3]\n"
                     "ldr d14, [a_ptr4]\n"
                     "ldr d17, [a_ptr5]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "ldr q3, [a_ptr1], #0x10\n"
+                    "ldr q6, [a_ptr2], #0x10\n"
+                    "ldr q9, [a_ptr3], #0x10\n"
+                    "ldr q12, [a_ptr4], #0x10\n"
+                    "ldr q15, [a_ptr5], #0x10\n"
+                    "ldr q1, [%[a_ptr0]], #0x10\n"
+                    "ldr q4, [a_ptr1], #0x10\n"
+                    "ldr q7, [a_ptr2], #0x10\n"
+                    "ldr q10, [a_ptr3], #0x10\n"
                     "ldr s2, [%[a_ptr0]], #0x4\n"
+                    "ldr q13, [a_ptr4], #0x10\n"
                     "ldr s5, [a_ptr1], #0x4\n"
+                    "ldr q16, [a_ptr5], #0x10\n"
                     "ldr s8, [a_ptr2], #0x4\n"
                     "ldr s11, [a_ptr3], #0x4\n"
                     "ldr s14, [a_ptr4], #0x4\n"
                     "ldr s17, [a_ptr5], #0x4\n"
-                    "subs %[odds], %[odds], #0x1\n"
                     "b.ne 4f\n"
                     "ld1 {v2.b}[4], [%[a_ptr0]]\n"
                     "ld1 {v5.b}[4], [a_ptr1]\n"
@@ -568,38 +635,40 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     "ld1 {v14.b}[6], [a_ptr4]\n"
                     "ld1 {v17.b}[6], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q18, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "ldr q20, [%[b_ptr0], #0x20]\n"
-                    "movi v29.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "ldr q21, [%[b_ptr0], #0x30]\n"
-                    "movi v30.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     "ldr q22, [%[b_ptr0], #0x40]\n"
-                    "movi v31.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     "ldr q23, [%[b_ptr0], #0x50]\n"
-                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     "ldr q24, [%[b_ptr0], #0x60]\n"
-                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     "ldr q25, [%[b_ptr0], #0x70]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
                     ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
@@ -652,180 +721,233 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
                     ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
                     ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d18, [%[b_ptr0]]\n"
+                    "b.eq 7f\n"
+                    "8:\n"
+                    "str q26, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
+                    "movi v26.4s, #0\n"
+                    "ldr d18, [%[b_ptr0]]\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "str q27, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v27.4s, #0\n"
                     "ldr d19, [%[b_ptr0], #0x10]\n"
                     "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "str q28, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "movi v28.4s, #0\n"
                     "ldr d20, [%[b_ptr0], #0x20]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "str q29, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "movi v29.4s, #0\n"
                     "ldr d21, [%[b_ptr0], #0x30]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "str q30, [c_ptr4]\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "movi v30.4s, #0\n"
                     "ldr d22, [%[b_ptr0], #0x40]\n"
                     "ins v18.d[1], temploadreg2\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "str q31, [c_ptr5]\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "movi v31.4s, #0\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
+                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
                     "ldr d23, [%[b_ptr0], #0x50]\n"
+                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     "ins v19.d[1], temploadreg3\n"
+                    ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
                     "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
+                    ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
                     "ldr d24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
                     "ins v20.d[1], temploadreg0\n"
+                    ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
+                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     "ldr d25, [%[b_ptr0], #0x70]\n"
+                    ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
                     "ins v21.d[1], temploadreg1\n"
+                    ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
                     "ins v22.d[1], temploadreg2\n"
+                    ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
                     "ins v23.d[1], temploadreg3\n"
+                    ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
                     "ins v24.d[1], temploadreg0\n"
+                    ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
                     "ins v25.d[1], temploadreg1\n"
-                    "b.eq 7f\n"
-                    "8:\n"
-                    "str q26, [%[c_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "movi v26.4s, #0\n"
+                    ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+                    "ldr d18, [%[b_ptr0]]\n"
+                    ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+                    ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
+                    "ldr d19, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
                     "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+                    ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
+                    "ins v18.d[1], temploadreg2\n"
+                    ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
+                    "ins v19.d[1], temploadreg3\n"
+                    ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
+                    ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
+                    ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
+                    ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
+                    ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
+                    ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
+                    ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
+                    ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
+                    ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
+                    ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
+                    ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
+                    ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
+                    ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
+                    ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+                    ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
+                    ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
+                    ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
+                    ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
+                    ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
+                    ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
+                    ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
+                    ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
+                    ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
+                    ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
+                    ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
+                    ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
+                    ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
+                    ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
+                    "b.ne 8b\n"
+                    "7:\n"
+                    "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
+                    "ldr q21, [%[b_ptr0], #0x30]\n"
+                    "ldr q22, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+                    "ldr q24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
-                    "ldr d18, [%[b_ptr0]]\n"
-                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
-                    ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
-                    "ins v18.d[1], temploadreg2\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
-                    "ldr d19, [%[b_ptr0], #0x10]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
                     "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
-                    "ins v19.d[1], temploadreg3\n"
                     ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
                     ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
                     ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
                     ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
-                    "ldr d20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
                     ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
                     ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
-                    "ins v20.d[1], temploadreg0\n"
                     ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
                     ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
                     ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
-                    "ldr d21, [%[b_ptr0], #0x30]\n"
                     ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
                     ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
                     ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
-                    "ins v21.d[1], temploadreg1\n"
                     ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
                     ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
                     ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
-                    "ldr d22, [%[b_ptr0], #0x40]\n"
                     ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
                     ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
                     ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
                     ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
                     ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
                     ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
-                    "ldr d23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
                     ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
                     ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
                     ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
                     ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
                     ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
-                    "ldr d24, [%[b_ptr0], #0x60]\n"
                     ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
                     ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
                     ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
-                    "ins v24.d[1], temploadreg0\n"
                     ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
                     ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
                     ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
-                    "ldr d25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
                     ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
                     ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
-                    "ins v25.d[1], temploadreg1\n"
                     ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
                     ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
                     ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
-                    "ldr d18, [%[b_ptr0]]\n"
                     ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
                     ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
                     ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
-                    "ins v18.d[1], temploadreg2\n"
                     ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
                     ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
                     ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
-                    "ldr d19, [%[b_ptr0], #0x10]\n"
-                    "ins v22.d[1], temploadreg2\n"
-                    "ins v19.d[1], temploadreg3\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                    "ins v23.d[1], temploadreg3\n"
-                    "b.ne 8b\n"
-                    "7:\n"
-                    "str q26, [%[c_ptr0]]\n"
-                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "b 9f\n"
+                    "6:\n"
                     "movi v26.4s, #0\n"
-                    "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
-                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
-                    "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
-                    "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
-                    "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
-                    "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
+                    ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+                    ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
                     ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
-                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
                     "ldr q18, [%[b_ptr0]]\n"
+                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
                     ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
                     ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
@@ -881,19 +1003,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
                     ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
                     ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
-                    "6:\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -1014,38 +1131,40 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     "ld1 {v14.b}[10], [a_ptr4]\n"
                     "ld1 {v17.b}[10], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q18, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "ldr q20, [%[b_ptr0], #0x20]\n"
-                    "movi v29.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "ldr q21, [%[b_ptr0], #0x30]\n"
-                    "movi v30.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     "ldr q22, [%[b_ptr0], #0x40]\n"
-                    "movi v31.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     "ldr q23, [%[b_ptr0], #0x50]\n"
-                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     "ldr q24, [%[b_ptr0], #0x60]\n"
-                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     "ldr q25, [%[b_ptr0], #0x70]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
                     ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
@@ -1105,189 +1224,249 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
                     ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
                     ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d18, [%[b_ptr0]]\n"
+                    "b.eq 7f\n"
+                    "8:\n"
+                    "str q26, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
+                    "movi v26.4s, #0\n"
+                    "ldr d18, [%[b_ptr0]]\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "str q27, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v27.4s, #0\n"
                     "ldr d19, [%[b_ptr0], #0x10]\n"
                     "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "str q28, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "movi v28.4s, #0\n"
                     "ldr d20, [%[b_ptr0], #0x20]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "str q29, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "movi v29.4s, #0\n"
                     "ldr d21, [%[b_ptr0], #0x30]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "str q30, [c_ptr4]\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "movi v30.4s, #0\n"
                     "ldr d22, [%[b_ptr0], #0x40]\n"
                     "ins v18.d[1], temploadreg2\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "str q31, [c_ptr5]\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "movi v31.4s, #0\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
+                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
                     "ldr d23, [%[b_ptr0], #0x50]\n"
+                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     "ins v19.d[1], temploadreg3\n"
+                    ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
                     "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
+                    ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
                     "ldr d24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
                     "ins v20.d[1], temploadreg0\n"
+                    ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
+                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     "ldr d25, [%[b_ptr0], #0x70]\n"
+                    ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
                     "ins v21.d[1], temploadreg1\n"
+                    ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
                     "ins v22.d[1], temploadreg2\n"
+                    ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
                     "ins v23.d[1], temploadreg3\n"
+                    ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
                     "ins v24.d[1], temploadreg0\n"
+                    ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
                     "ins v25.d[1], temploadreg1\n"
-                    "b.eq 7f\n"
-                    "8:\n"
-                    "str q26, [%[c_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "movi v26.4s, #0\n"
+                    ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+                    "ldr d18, [%[b_ptr0]]\n"
+                    ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+                    ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
+                    "ldr d19, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
                     "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+                    ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
+                    "ldr d20, [%[b_ptr0], #0x20]\n"
+                    ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+                    ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
+                    "ins v18.d[1], temploadreg2\n"
+                    ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
+                    "ins v19.d[1], temploadreg3\n"
+                    ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
+                    "ins v20.d[1], temploadreg0\n"
+                    ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
+                    ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
+                    ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
+                    ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
+                    ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
+                    ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
+                    ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
+                    ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
+                    ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
+                    ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
+                    ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
+                    ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+                    ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
+                    ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
+                    ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
+                    ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
+                    ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
+                    ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
+                    ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
+                    ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
+                    ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
+                    ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
+                    ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
+                    ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
+                    ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
+                    ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
+                    ".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n"
+                    ".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n"
+                    ".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n"
+                    ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
+                    ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
+                    ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
+                    "b.ne 8b\n"
+                    "7:\n"
+                    "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
+                    "ldr q21, [%[b_ptr0], #0x30]\n"
+                    "ldr q22, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+                    "ldr q24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
-                    "ldr d18, [%[b_ptr0]]\n"
-                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
-                    ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
-                    "ins v18.d[1], temploadreg2\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
-                    "ldr d19, [%[b_ptr0], #0x10]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
-                    "ins v19.d[1], temploadreg3\n"
                     ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
                     ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
                     ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
                     ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
-                    "ldr d20, [%[b_ptr0], #0x20]\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
                     "add %[b_ptr0], %[b_ptr0], #0x30\n"
                     ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
-                    "ins v20.d[1], temploadreg0\n"
                     ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
                     ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
                     ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
                     ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
-                    "ldr d21, [%[b_ptr0], #0x30]\n"
                     ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
                     ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
                     ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
                     ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
-                    "ins v21.d[1], temploadreg1\n"
                     ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
                     ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
-                    "ldr d22, [%[b_ptr0], #0x40]\n"
                     ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
                     ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
                     ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
                     ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
                     ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
                     ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
-                    "ldr d23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
                     ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
                     ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
                     ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
                     ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
                     ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
-                    "ldr d24, [%[b_ptr0], #0x60]\n"
                     ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
                     ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
                     ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
                     ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
                     ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
                     ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
-                    "ldr d25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
                     ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
                     ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
-                    "ins v25.d[1], temploadreg1\n"
                     ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
                     ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
                     ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
-                    "ldr d18, [%[b_ptr0]]\n"
                     ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
                     ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
                     ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
-                    "ins v18.d[1], temploadreg2\n"
                     ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
                     ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
                     ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
-                    "ldr d19, [%[b_ptr0], #0x10]\n"
                     ".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n"
-                    "ins v22.d[1], temploadreg2\n"
                     ".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n"
                     ".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n"
-                    "ins v19.d[1], temploadreg3\n"
                     ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
                     ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
                     ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
-                    "ldr d20, [%[b_ptr0], #0x20]\n"
-                    "ins v23.d[1], temploadreg3\n"
-                    "ins v20.d[1], temploadreg0\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                    "ins v24.d[1], temploadreg0\n"
-                    "b.ne 8b\n"
-                    "7:\n"
-                    "str q26, [%[c_ptr0]]\n"
-                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "b 9f\n"
+                    "6:\n"
                     "movi v26.4s, #0\n"
-                    "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
-                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
-                    "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
-                    "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
-                    "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
-                    "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
+                    ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+                    ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
                     ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
-                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
                     "ldr q18, [%[b_ptr0]]\n"
+                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
                     ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
                     ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
@@ -1350,19 +1529,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
                     ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
                     ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
-                    "6:\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -1429,6 +1603,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q3, [a_ptr1], #0x10\n"
                     "ldr q6, [a_ptr2], #0x10\n"
@@ -1441,7 +1616,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     "ldr q10, [a_ptr3], #0x10\n"
                     "ldr q13, [a_ptr4], #0x10\n"
                     "ldr q16, [a_ptr5], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr q2, [%[a_ptr0]]\n"
                     "ldr q5, [a_ptr1]\n"
                     "ldr q8, [a_ptr2]\n"
@@ -1450,8 +1624,21 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     "ldr q17, [a_ptr5]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "ldr q3, [a_ptr1], #0x10\n"
+                    "ldr q6, [a_ptr2], #0x10\n"
+                    "ldr q9, [a_ptr3], #0x10\n"
+                    "ldr q12, [a_ptr4], #0x10\n"
+                    "ldr q15, [a_ptr5], #0x10\n"
+                    "ldr q1, [%[a_ptr0]], #0x10\n"
+                    "ldr q4, [a_ptr1], #0x10\n"
+                    "ldr q7, [a_ptr2], #0x10\n"
+                    "ldr q10, [a_ptr3], #0x10\n"
                     "ldr d2, [%[a_ptr0]], #0x8\n"
+                    "ldr q13, [a_ptr4], #0x10\n"
                     "ldr d5, [a_ptr1], #0x8\n"
+                    "ldr q16, [a_ptr5], #0x10\n"
                     "ldr d8, [a_ptr2], #0x8\n"
                     "ldr d11, [a_ptr3], #0x8\n"
                     "ldr d14, [a_ptr4], #0x8\n"
@@ -1462,7 +1649,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     "ld1 {v11.s}[2], [a_ptr3], #4\n"
                     "ld1 {v14.s}[2], [a_ptr4], #4\n"
                     "ld1 {v17.s}[2], [a_ptr5], #4\n"
-                    "subs %[odds], %[odds], #0x1\n"
                     "b.ne 4f\n"
                     "ld1 {v2.b}[12], [%[a_ptr0]]\n"
                     "ld1 {v5.b}[12], [a_ptr1]\n"
@@ -1489,38 +1675,40 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     "ld1 {v14.b}[14], [a_ptr4]\n"
                     "ld1 {v17.b}[14], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q18, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "ldr q20, [%[b_ptr0], #0x20]\n"
-                    "movi v29.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "ldr q21, [%[b_ptr0], #0x30]\n"
-                    "movi v30.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     "ldr q22, [%[b_ptr0], #0x40]\n"
-                    "movi v31.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     "ldr q23, [%[b_ptr0], #0x50]\n"
-                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     "ldr q24, [%[b_ptr0], #0x60]\n"
-                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     "ldr q25, [%[b_ptr0], #0x70]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
                     ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
@@ -1587,198 +1775,265 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n"
                     ".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n"
                     ".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d18, [%[b_ptr0]]\n"
+                    "b.eq 7f\n"
+                    "8:\n"
+                    "str q26, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
+                    "movi v26.4s, #0\n"
+                    "ldr d18, [%[b_ptr0]]\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "str q27, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v27.4s, #0\n"
                     "ldr d19, [%[b_ptr0], #0x10]\n"
                     "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "str q28, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "movi v28.4s, #0\n"
                     "ldr d20, [%[b_ptr0], #0x20]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "str q29, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "movi v29.4s, #0\n"
                     "ldr d21, [%[b_ptr0], #0x30]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "str q30, [c_ptr4]\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "movi v30.4s, #0\n"
                     "ldr d22, [%[b_ptr0], #0x40]\n"
                     "ins v18.d[1], temploadreg2\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "str q31, [c_ptr5]\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "movi v31.4s, #0\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
+                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
                     "ldr d23, [%[b_ptr0], #0x50]\n"
+                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     "ins v19.d[1], temploadreg3\n"
+                    ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
                     "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
+                    ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
                     "ldr d24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
                     "ins v20.d[1], temploadreg0\n"
+                    ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
+                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     "ldr d25, [%[b_ptr0], #0x70]\n"
+                    ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
                     "ins v21.d[1], temploadreg1\n"
+                    ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
                     "ins v22.d[1], temploadreg2\n"
+                    ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
                     "ins v23.d[1], temploadreg3\n"
+                    ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
                     "ins v24.d[1], temploadreg0\n"
-                    "b.eq 7f\n"
-                    "8:\n"
-                    "str q26, [%[c_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "movi v26.4s, #0\n"
+                    ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
                     "ins v25.d[1], temploadreg1\n"
+                    ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+                    "ldr d18, [%[b_ptr0]]\n"
+                    ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+                    ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
+                    "ldr d19, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+                    ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
+                    "ldr d20, [%[b_ptr0], #0x20]\n"
+                    ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+                    ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+                    ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
+                    "ins v18.d[1], temploadreg2\n"
+                    ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
+                    "ins v19.d[1], temploadreg3\n"
+                    ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
+                    "ldr d21, [%[b_ptr0], #0x30]\n"
+                    ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
+                    "ins v20.d[1], temploadreg0\n"
+                    ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
+                    "ins v21.d[1], temploadreg1\n"
+                    ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
+                    ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
+                    ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
+                    ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
+                    ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
+                    ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
+                    ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
+                    ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+                    ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
+                    ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
+                    ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
+                    ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
+                    ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
+                    ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
+                    ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
+                    ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
+                    ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
+                    ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
+                    ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
+                    ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
+                    ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
+                    ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
+                    ".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n"
+                    ".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n"
+                    ".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n"
+                    ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
+                    ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
+                    ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
+                    ".inst 0x4fa2eaba // sdot v26.4s, v21.16b, v2.4b[3]\n"
+                    ".inst 0x4fa5eabb // sdot v27.4s, v21.16b, v5.4b[3]\n"
+                    ".inst 0x4fa8eabc // sdot v28.4s, v21.16b, v8.4b[3]\n"
+                    ".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n"
+                    ".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n"
+                    ".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n"
+                    "b.ne 8b\n"
+                    "7:\n"
+                    "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+                    "ldr q21, [%[b_ptr0], #0x30]\n"
+                    "ldr q22, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+                    "ldr q24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
-                    "ldr d18, [%[b_ptr0]]\n"
-                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
-                    ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
-                    "ins v18.d[1], temploadreg2\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
-                    "ldr d19, [%[b_ptr0], #0x10]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
-                    "ins v19.d[1], temploadreg3\n"
                     ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
                     ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
                     ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
                     ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
-                    "ldr d20, [%[b_ptr0], #0x20]\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
                     ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
                     ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
-                    "ins v20.d[1], temploadreg0\n"
                     ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
                     ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
                     ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
-                    "ldr d21, [%[b_ptr0], #0x30]\n"
+                    "ldr q21, [%[b_ptr0], #0x30]\n"
                     ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
                     "add %[b_ptr0], %[b_ptr0], #0x40\n"
                     ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
-                    "ins v21.d[1], temploadreg1\n"
                     ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
                     ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
                     ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
                     ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
                     ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
-                    "ldr d22, [%[b_ptr0], #0x40]\n"
                     ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
                     ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
                     ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
                     ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
                     ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
-                    "ldr d23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
                     ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
                     ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
                     ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
                     ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
                     ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
-                    "ldr d24, [%[b_ptr0], #0x60]\n"
                     ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
                     ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
                     ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
                     ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
                     ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
                     ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
-                    "ldr d25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
                     ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
                     ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
                     ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
                     ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
                     ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
-                    "ldr d18, [%[b_ptr0]]\n"
                     ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
                     ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
                     ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
-                    "ins v18.d[1], temploadreg2\n"
                     ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
                     ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
                     ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
-                    "ldr d19, [%[b_ptr0], #0x10]\n"
                     ".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n"
-                    "ins v22.d[1], temploadreg2\n"
                     ".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n"
                     ".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n"
-                    "ins v19.d[1], temploadreg3\n"
                     ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
                     ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
                     ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
-                    "ldr d20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4fa2eaba // sdot v26.4s, v21.16b, v2.4b[3]\n"
-                    "ins v23.d[1], temploadreg3\n"
                     ".inst 0x4fa5eabb // sdot v27.4s, v21.16b, v5.4b[3]\n"
                     ".inst 0x4fa8eabc // sdot v28.4s, v21.16b, v8.4b[3]\n"
-                    "ins v20.d[1], temploadreg0\n"
                     ".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
                     ".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n"
                     ".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n"
-                    "ldr d21, [%[b_ptr0], #0x30]\n"
-                    "ins v24.d[1], temploadreg0\n"
-                    "ins v21.d[1], temploadreg1\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                    "b.ne 8b\n"
-                    "7:\n"
-                    "str q26, [%[c_ptr0]]\n"
-                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "b 9f\n"
+                    "6:\n"
                     "movi v26.4s, #0\n"
-                    "ins v25.d[1], temploadreg1\n"
-                    "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
-                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
-                    "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
-                    "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
-                    "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
-                    "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
+                    ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+                    ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
                     ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
-                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
                     "ldr q18, [%[b_ptr0]]\n"
+                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
                     ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
                     ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
@@ -1848,19 +2103,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n"
                     ".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n"
                     ".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n"
-                    "6:\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -1927,6 +2177,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q4, [a_ptr1], #0x10\n"
                     "ldr q8, [a_ptr2], #0x10\n"
@@ -1943,18 +2194,35 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     "ldr q6, [a_ptr1], #0x10\n"
                     "ldr q10, [a_ptr2], #0x10\n"
                     "ldr q14, [a_ptr3], #0x10\n"
-                    "ldr q18, [a_ptr4], #0x10\n"
-                    "ldr q22, [a_ptr5], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr s3, [%[a_ptr0]]\n"
+                    "ldr q18, [a_ptr4], #0x10\n"
                     "ldr s7, [a_ptr1]\n"
+                    "ldr q22, [a_ptr5], #0x10\n"
                     "ldr s11, [a_ptr2]\n"
                     "ldr s15, [a_ptr3]\n"
                     "ldr s19, [a_ptr4]\n"
                     "ldr s23, [a_ptr5]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
                     "subs %[odds], %[odds], #0x1\n"
+                    "ldr q4, [a_ptr1], #0x10\n"
+                    "ldr q8, [a_ptr2], #0x10\n"
+                    "ldr q12, [a_ptr3], #0x10\n"
+                    "ldr q16, [a_ptr4], #0x10\n"
+                    "ldr q20, [a_ptr5], #0x10\n"
+                    "ldr q1, [%[a_ptr0]], #0x10\n"
+                    "ldr q5, [a_ptr1], #0x10\n"
+                    "ldr q9, [a_ptr2], #0x10\n"
+                    "ldr q13, [a_ptr3], #0x10\n"
+                    "ldr q17, [a_ptr4], #0x10\n"
+                    "ldr q21, [a_ptr5], #0x10\n"
+                    "ldr q2, [%[a_ptr0]], #0x10\n"
+                    "ldr q6, [a_ptr1], #0x10\n"
+                    "ldr q10, [a_ptr2], #0x10\n"
+                    "ldr q14, [a_ptr3], #0x10\n"
+                    "ldr q18, [a_ptr4], #0x10\n"
+                    "ldr q22, [a_ptr5], #0x10\n"
                     "b.ne 4f\n"
                     "ldr b3, [%[a_ptr0]]\n"
                     "ldr b7, [a_ptr1]\n"
@@ -1981,24 +2249,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     "ld1 {v19.b}[2], [a_ptr4]\n"
                     "ld1 {v23.b}[2], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q24, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q25, [%[b_ptr0], #0x10]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
                     "movi v28.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
                     ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
                     ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
@@ -2091,57 +2361,55 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
                     ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
                     ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d24, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    "ldr d25, [%[b_ptr0], #0x10]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    "ins v24.d[1], temploadreg0\n"
-                    "ins v25.d[1], temploadreg1\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v26.4s, #0\n"
+                    "ldr d24, [%[b_ptr0]]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
-                    ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+                    "ldr d25, [%[b_ptr0], #0x10]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q28, [c_ptr2]\n"
-                    "movi v28.4s, #0\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+                    "movi v28.4s, #0\n"
+                    "ins v24.d[1], temploadreg0\n"
+                    "ins v25.d[1], temploadreg1\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "str q29, [c_ptr3]\n"
-                    "movi v29.4s, #0\n"
                     "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+                    "movi v29.4s, #0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+                    ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
-                    ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+                    ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
                     "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
                     "ldr d24, [%[b_ptr0]]\n"
                     ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
                     "ins v24.d[1], temploadreg0\n"
                     ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
-                    ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
-                    ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+                    ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
                     ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
                     "ldr d25, [%[b_ptr0], #0x10]\n"
                     ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
@@ -2235,27 +2503,23 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
                     "ins v24.d[1], temploadreg0\n"
                     ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                     ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                     ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
                     ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
-                    "ldr d25, [%[b_ptr0], #0x10]\n"
                     ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
                     ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
                     ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
-                    "ins v25.d[1], temploadreg1\n"
                     ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
                     ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
                     ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
-                    "ldr d24, [%[b_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    "ins v24.d[1], temploadreg0\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v26.4s, #0\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
@@ -2366,19 +2630,117 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
                     ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
                     ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+                    ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+                    ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+                    ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+                    ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+                    ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
+                    ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
+                    ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
+                    ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
+                    ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+                    ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
+                    ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
+                    ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
+                    ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n"
+                    ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n"
+                    ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n"
+                    ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n"
+                    ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n"
+                    ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n"
+                    ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n"
+                    ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n"
+                    ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n"
+                    ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n"
+                    ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n"
+                    ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n"
+                    ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n"
+                    ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n"
+                    ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n"
+                    ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n"
+                    ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n"
+                    ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n"
+                    ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n"
+                    ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n"
+                    ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n"
+                    ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n"
+                    ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n"
+                    ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n"
+                    ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n"
+                    ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n"
+                    ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n"
+                    ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n"
+                    ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n"
+                    ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
+                    ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
+                    ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
+                    ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
+                    ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
+                    ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
+                    ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
+                    ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
+                    ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
+                    ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
+                    ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -2445,6 +2807,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q4, [a_ptr1], #0x10\n"
                     "ldr q8, [a_ptr2], #0x10\n"
@@ -2461,24 +2824,41 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     "ldr q6, [a_ptr1], #0x10\n"
                     "ldr q10, [a_ptr2], #0x10\n"
                     "ldr q14, [a_ptr3], #0x10\n"
-                    "ldr q18, [a_ptr4], #0x10\n"
-                    "ldr q22, [a_ptr5], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr d3, [%[a_ptr0]]\n"
+                    "ldr q18, [a_ptr4], #0x10\n"
                     "ldr d7, [a_ptr1]\n"
+                    "ldr q22, [a_ptr5], #0x10\n"
                     "ldr d11, [a_ptr2]\n"
                     "ldr d15, [a_ptr3]\n"
                     "ldr d19, [a_ptr4]\n"
                     "ldr d23, [a_ptr5]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "ldr q4, [a_ptr1], #0x10\n"
+                    "ldr q8, [a_ptr2], #0x10\n"
+                    "ldr q12, [a_ptr3], #0x10\n"
+                    "ldr q16, [a_ptr4], #0x10\n"
+                    "ldr q20, [a_ptr5], #0x10\n"
+                    "ldr q1, [%[a_ptr0]], #0x10\n"
+                    "ldr q5, [a_ptr1], #0x10\n"
+                    "ldr q9, [a_ptr2], #0x10\n"
+                    "ldr q13, [a_ptr3], #0x10\n"
+                    "ldr q17, [a_ptr4], #0x10\n"
+                    "ldr q21, [a_ptr5], #0x10\n"
+                    "ldr q2, [%[a_ptr0]], #0x10\n"
+                    "ldr q6, [a_ptr1], #0x10\n"
+                    "ldr q10, [a_ptr2], #0x10\n"
+                    "ldr q14, [a_ptr3], #0x10\n"
                     "ldr s3, [%[a_ptr0]], #0x4\n"
+                    "ldr q18, [a_ptr4], #0x10\n"
                     "ldr s7, [a_ptr1], #0x4\n"
+                    "ldr q22, [a_ptr5], #0x10\n"
                     "ldr s11, [a_ptr2], #0x4\n"
                     "ldr s15, [a_ptr3], #0x4\n"
                     "ldr s19, [a_ptr4], #0x4\n"
                     "ldr s23, [a_ptr5], #0x4\n"
-                    "subs %[odds], %[odds], #0x1\n"
                     "b.ne 4f\n"
                     "ld1 {v3.b}[4], [%[a_ptr0]]\n"
                     "ld1 {v7.b}[4], [a_ptr1]\n"
@@ -2505,24 +2885,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     "ld1 {v19.b}[6], [a_ptr4]\n"
                     "ld1 {v23.b}[6], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q24, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q25, [%[b_ptr0], #0x10]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
                     "movi v28.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
                     ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
                     ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
@@ -2622,68 +3004,66 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
                     ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
                     ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d24, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    "ldr d25, [%[b_ptr0], #0x10]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    "ins v24.d[1], temploadreg0\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v26.4s, #0\n"
-                    "ins v25.d[1], temploadreg1\n"
+                    "ldr d24, [%[b_ptr0]]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
+                    "ldr d25, [%[b_ptr0], #0x10]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q28, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     "movi v28.4s, #0\n"
+                    "ins v24.d[1], temploadreg0\n"
+                    "ins v25.d[1], temploadreg1\n"
                     "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
-                    ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
                     "str q29, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
+                    ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
                     "ldr d24, [%[b_ptr0]]\n"
                     ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
                     "ins v24.d[1], temploadreg0\n"
                     ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
                     "ldr d25, [%[b_ptr0], #0x10]\n"
                     ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
                     "ins v25.d[1], temploadreg1\n"
                     ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
                     ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
                     "ldr d24, [%[b_ptr0]]\n"
                     ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
@@ -2775,27 +3155,23 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
                     "ins v25.d[1], temploadreg1\n"
                     ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                     ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                     ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
                     ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
-                    "ldr d24, [%[b_ptr0]]\n"
                     ".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n"
                     ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n"
                     ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n"
-                    "ins v24.d[1], temploadreg0\n"
                     ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
                     ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
                     ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
-                    "ldr d25, [%[b_ptr0], #0x10]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v26.4s, #0\n"
-                    "ins v25.d[1], temploadreg1\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
@@ -2913,19 +3289,124 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
                     ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
                     ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+                    ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+                    ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+                    ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+                    ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+                    ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
+                    ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
+                    ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
+                    ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
+                    ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+                    ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
+                    ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
+                    ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
+                    ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n"
+                    ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n"
+                    ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n"
+                    ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n"
+                    ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n"
+                    ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n"
+                    ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n"
+                    ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n"
+                    ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n"
+                    ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n"
+                    ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n"
+                    ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n"
+                    ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n"
+                    ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n"
+                    ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n"
+                    ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n"
+                    ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n"
+                    ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n"
+                    ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n"
+                    ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n"
+                    ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n"
+                    ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n"
+                    ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n"
+                    ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n"
+                    ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n"
+                    ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n"
+                    ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n"
+                    ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n"
+                    ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n"
+                    ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n"
+                    ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
+                    ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
+                    ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
+                    ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
+                    ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
+                    ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
+                    ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
+                    ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+                    ".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n"
+                    ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n"
+                    ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n"
+                    ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
+                    ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
+                    ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -3052,24 +3533,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     "ld1 {v19.b}[10], [a_ptr4]\n"
                     "ld1 {v23.b}[10], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q24, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q25, [%[b_ptr0], #0x10]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
                     "movi v28.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
                     ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
                     ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
@@ -3177,57 +3660,55 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
                     ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
                     ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d24, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    "ldr d25, [%[b_ptr0], #0x10]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    "ins v24.d[1], temploadreg0\n"
-                    "ins v25.d[1], temploadreg1\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v26.4s, #0\n"
+                    "ldr d24, [%[b_ptr0]]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
-                    ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+                    "ldr d25, [%[b_ptr0], #0x10]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q28, [c_ptr2]\n"
-                    "movi v28.4s, #0\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+                    "movi v28.4s, #0\n"
+                    "ins v24.d[1], temploadreg0\n"
+                    "ins v25.d[1], temploadreg1\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "str q29, [c_ptr3]\n"
-                    "movi v29.4s, #0\n"
                     "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+                    "movi v29.4s, #0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+                    ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
-                    ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+                    ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
                     "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
                     "ldr d24, [%[b_ptr0]]\n"
                     ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
                     "ins v24.d[1], temploadreg0\n"
                     ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
-                    ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
-                    ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+                    ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
                     ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
                     "ldr d25, [%[b_ptr0], #0x10]\n"
                     ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
@@ -3340,27 +3821,23 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n"
                     "ins v24.d[1], temploadreg0\n"
                     ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                     ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                     ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
                     ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
-                    "ldr d25, [%[b_ptr0], #0x10]\n"
                     ".inst 0x4f83eb1a // sdot v26.4s, v24.16b, v3.4b[2]\n"
                     ".inst 0x4f87eb1b // sdot v27.4s, v24.16b, v7.4b[2]\n"
                     ".inst 0x4f8beb1c // sdot v28.4s, v24.16b, v11.4b[2]\n"
-                    "ins v25.d[1], temploadreg1\n"
                     ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
                     ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
                     ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
-                    "ldr d24, [%[b_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    "ins v24.d[1], temploadreg0\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v26.4s, #0\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
@@ -3486,19 +3963,132 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
                     ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
                     ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+                    ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+                    ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+                    ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+                    ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+                    ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
+                    ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
+                    ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
+                    ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
+                    ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+                    ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
+                    ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
+                    ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
+                    ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n"
+                    ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n"
+                    ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n"
+                    ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n"
+                    ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n"
+                    ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n"
+                    ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n"
+                    ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n"
+                    ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n"
+                    ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n"
+                    ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n"
+                    ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n"
+                    ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n"
+                    ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n"
+                    ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n"
+                    ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n"
+                    ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n"
+                    ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n"
+                    ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n"
+                    ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n"
+                    ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n"
+                    ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n"
+                    ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n"
+                    ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n"
+                    ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n"
+                    ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n"
+                    ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n"
+                    ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n"
+                    ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n"
+                    ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n"
+                    ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
+                    ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
+                    ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
+                    ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
+                    ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
+                    ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
+                    ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
+                    ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
+                    ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n"
+                    ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n"
+                    ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
+                    ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
+                    ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
+                    ".inst 0x4f83eb1a // sdot v26.4s, v24.16b, v3.4b[2]\n"
+                    ".inst 0x4f87eb1b // sdot v27.4s, v24.16b, v7.4b[2]\n"
+                    ".inst 0x4f8beb1c // sdot v28.4s, v24.16b, v11.4b[2]\n"
+                    ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
+                    ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
+                    ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -3566,6 +4156,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q4, [a_ptr1], #0x10\n"
                     "ldr q8, [a_ptr2], #0x10\n"
@@ -3584,7 +4175,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     "ldr q14, [a_ptr3], #0x10\n"
                     "ldr q18, [a_ptr4], #0x10\n"
                     "ldr q22, [a_ptr5], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr q3, [%[a_ptr0]]\n"
                     "ldr q7, [a_ptr1]\n"
                     "ldr q11, [a_ptr2]\n"
@@ -3593,8 +4183,27 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     "ldr q23, [a_ptr5]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "ldr q4, [a_ptr1], #0x10\n"
+                    "ldr q8, [a_ptr2], #0x10\n"
+                    "ldr q12, [a_ptr3], #0x10\n"
+                    "ldr q16, [a_ptr4], #0x10\n"
+                    "ldr q20, [a_ptr5], #0x10\n"
+                    "ldr q1, [%[a_ptr0]], #0x10\n"
+                    "ldr q5, [a_ptr1], #0x10\n"
+                    "ldr q9, [a_ptr2], #0x10\n"
+                    "ldr q13, [a_ptr3], #0x10\n"
+                    "ldr q17, [a_ptr4], #0x10\n"
+                    "ldr q21, [a_ptr5], #0x10\n"
+                    "ldr q2, [%[a_ptr0]], #0x10\n"
+                    "ldr q6, [a_ptr1], #0x10\n"
+                    "ldr q10, [a_ptr2], #0x10\n"
+                    "ldr q14, [a_ptr3], #0x10\n"
                     "ldr d3, [%[a_ptr0]], #0x8\n"
+                    "ldr q18, [a_ptr4], #0x10\n"
                     "ldr d7, [a_ptr1], #0x8\n"
+                    "ldr q22, [a_ptr5], #0x10\n"
                     "ldr d11, [a_ptr2], #0x8\n"
                     "ldr d15, [a_ptr3], #0x8\n"
                     "ldr d19, [a_ptr4], #0x8\n"
@@ -3605,7 +4214,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     "ld1 {v15.s}[2], [a_ptr3], #4\n"
                     "ld1 {v19.s}[2], [a_ptr4], #4\n"
                     "ld1 {v23.s}[2], [a_ptr5], #4\n"
-                    "subs %[odds], %[odds], #0x1\n"
                     "b.ne 4f\n"
                     "ld1 {v3.b}[12], [%[a_ptr0]]\n"
                     "ld1 {v7.b}[12], [a_ptr1]\n"
@@ -3632,24 +4240,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     "ld1 {v19.b}[14], [a_ptr4]\n"
                     "ld1 {v23.b}[14], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q24, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q25, [%[b_ptr0], #0x10]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
                     "movi v28.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
                     ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
                     ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
@@ -3764,68 +4374,66 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4fafeb3d // sdot v29.4s, v25.16b, v15.4b[3]\n"
                     ".inst 0x4fb3eb3e // sdot v30.4s, v25.16b, v19.4b[3]\n"
                     ".inst 0x4fb7eb3f // sdot v31.4s, v25.16b, v23.4b[3]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d24, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    "ldr d25, [%[b_ptr0], #0x10]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    "ins v24.d[1], temploadreg0\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v26.4s, #0\n"
-                    "ins v25.d[1], temploadreg1\n"
+                    "ldr d24, [%[b_ptr0]]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
+                    "ldr d25, [%[b_ptr0], #0x10]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q28, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     "movi v28.4s, #0\n"
+                    "ins v24.d[1], temploadreg0\n"
+                    "ins v25.d[1], temploadreg1\n"
                     "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
-                    ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
                     "str q29, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
+                    ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
                     "ldr d24, [%[b_ptr0]]\n"
                     ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
                     "ins v24.d[1], temploadreg0\n"
                     ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
                     "ldr d25, [%[b_ptr0], #0x10]\n"
                     ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
                     "ins v25.d[1], temploadreg1\n"
                     ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
                     ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
                     "ldr d24, [%[b_ptr0]]\n"
                     ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
@@ -3936,27 +4544,23 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4f87eb1b // sdot v27.4s, v24.16b, v7.4b[2]\n"
                     "ins v25.d[1], temploadreg1\n"
                     ".inst 0x4f8beb1c // sdot v28.4s, v24.16b, v11.4b[2]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                     ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                     ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
                     ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
-                    "ldr d24, [%[b_ptr0]]\n"
                     ".inst 0x4fa3eb3a // sdot v26.4s, v25.16b, v3.4b[3]\n"
                     ".inst 0x4fa7eb3b // sdot v27.4s, v25.16b, v7.4b[3]\n"
                     ".inst 0x4fabeb3c // sdot v28.4s, v25.16b, v11.4b[3]\n"
-                    "ins v24.d[1], temploadreg0\n"
                     ".inst 0x4fafeb3d // sdot v29.4s, v25.16b, v15.4b[3]\n"
                     ".inst 0x4fb3eb3e // sdot v30.4s, v25.16b, v19.4b[3]\n"
                     ".inst 0x4fb7eb3f // sdot v31.4s, v25.16b, v23.4b[3]\n"
-                    "ldr d25, [%[b_ptr0], #0x10]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v26.4s, #0\n"
-                    "ins v25.d[1], temploadreg1\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
@@ -4089,19 +4693,139 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4fafeb3d // sdot v29.4s, v25.16b, v15.4b[3]\n"
                     ".inst 0x4fb3eb3e // sdot v30.4s, v25.16b, v19.4b[3]\n"
                     ".inst 0x4fb7eb3f // sdot v31.4s, v25.16b, v23.4b[3]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+                    ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+                    ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+                    ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+                    ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+                    ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
+                    ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
+                    ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
+                    ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
+                    ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+                    ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
+                    ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
+                    ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
+                    ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n"
+                    ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n"
+                    ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n"
+                    ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n"
+                    ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n"
+                    ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n"
+                    ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n"
+                    ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n"
+                    ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n"
+                    ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n"
+                    ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n"
+                    ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n"
+                    ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n"
+                    ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n"
+                    ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n"
+                    ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n"
+                    ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n"
+                    ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n"
+                    ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n"
+                    ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n"
+                    ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n"
+                    ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n"
+                    ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n"
+                    ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n"
+                    ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n"
+                    ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n"
+                    ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n"
+                    ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n"
+                    ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n"
+                    ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n"
+                    ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
+                    ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
+                    ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
+                    ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
+                    ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
+                    ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
+                    ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
+                    ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n"
+                    ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n"
+                    ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n"
+                    ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
+                    ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
+                    ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f83eb1a // sdot v26.4s, v24.16b, v3.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f87eb1b // sdot v27.4s, v24.16b, v7.4b[2]\n"
+                    ".inst 0x4f8beb1c // sdot v28.4s, v24.16b, v11.4b[2]\n"
+                    ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
+                    ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
+                    ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
+                    ".inst 0x4fa3eb3a // sdot v26.4s, v25.16b, v3.4b[3]\n"
+                    ".inst 0x4fa7eb3b // sdot v27.4s, v25.16b, v7.4b[3]\n"
+                    ".inst 0x4fabeb3c // sdot v28.4s, v25.16b, v11.4b[3]\n"
+                    ".inst 0x4fafeb3d // sdot v29.4s, v25.16b, v15.4b[3]\n"
+                    ".inst 0x4fb3eb3e // sdot v30.4s, v25.16b, v19.4b[3]\n"
+                    ".inst 0x4fb7eb3f // sdot v31.4s, v25.16b, v23.4b[3]\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/generic.cpp
index 88ad36a27a..9ff39719f7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,7 +33,7 @@
 
 namespace arm_gemm {
 
-void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
+void a64_smallK_hybrid_s8s32_dot_6x4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
     const long loops_count = iceildiv(N, (int)4) - 1;
     const long ldab = lda * sizeof(int8_t);
     const long ldcb = ldc * sizeof(int32_t);
@@ -93,6 +93,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q3, [a_ptr1], #0x10\n"
                     "ldr q6, [a_ptr2], #0x10\n"
@@ -103,18 +104,29 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     "ldr q4, [a_ptr1], #0x10\n"
                     "ldr q7, [a_ptr2], #0x10\n"
                     "ldr q10, [a_ptr3], #0x10\n"
-                    "ldr q13, [a_ptr4], #0x10\n"
-                    "ldr q16, [a_ptr5], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr s2, [%[a_ptr0]]\n"
+                    "ldr q13, [a_ptr4], #0x10\n"
                     "ldr s5, [a_ptr1]\n"
+                    "ldr q16, [a_ptr5], #0x10\n"
                     "ldr s8, [a_ptr2]\n"
                     "ldr s11, [a_ptr3]\n"
                     "ldr s14, [a_ptr4]\n"
                     "ldr s17, [a_ptr5]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
                     "subs %[odds], %[odds], #0x1\n"
+                    "ldr q3, [a_ptr1], #0x10\n"
+                    "ldr q6, [a_ptr2], #0x10\n"
+                    "ldr q9, [a_ptr3], #0x10\n"
+                    "ldr q12, [a_ptr4], #0x10\n"
+                    "ldr q15, [a_ptr5], #0x10\n"
+                    "ldr q1, [%[a_ptr0]], #0x10\n"
+                    "ldr q4, [a_ptr1], #0x10\n"
+                    "ldr q7, [a_ptr2], #0x10\n"
+                    "ldr q10, [a_ptr3], #0x10\n"
+                    "ldr q13, [a_ptr4], #0x10\n"
+                    "ldr q16, [a_ptr5], #0x10\n"
                     "b.ne 4f\n"
                     "ldr b2, [%[a_ptr0]]\n"
                     "ldr b5, [a_ptr1]\n"
@@ -141,40 +153,42 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     "ld1 {v14.b}[2], [a_ptr4]\n"
                     "ld1 {v17.b}[2], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q18, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "ldr q20, [%[b_ptr0], #0x20]\n"
-                    "movi v29.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "ldr q21, [%[b_ptr0], #0x30]\n"
-                    "movi v30.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     "ldr q22, [%[b_ptr0], #0x40]\n"
-                    "movi v31.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     "ldr q23, [%[b_ptr0], #0x50]\n"
-                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     "ldr q24, [%[b_ptr0], #0x60]\n"
-                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     "ldr q25, [%[b_ptr0], #0x70]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                     ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                     ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
                     ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
                     ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
@@ -218,139 +232,201 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
                     ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
                     ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q18, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr q19, [%[b_ptr0], #0x10]\n"
-                    "ldr q20, [%[b_ptr0], #0x20]\n"
-                    "ldr q21, [%[b_ptr0], #0x30]\n"
-                    "ldr q22, [%[b_ptr0], #0x40]\n"
-                    "ldr q23, [%[b_ptr0], #0x50]\n"
-                    "ldr q24, [%[b_ptr0], #0x60]\n"
-                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q26, [%[c_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                    "movi v26.4s, #0\n"
                     "subs %[loops], %[loops], #0x1\n"
-                    "str q27, [c_ptr1]\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v27.4s, #0\n"
+                    "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v27.4s, #0\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ldr q21, [%[b_ptr0], #0x30]\n"
+                    "ldr q22, [%[b_ptr0], #0x40]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+                    "ldr q24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
-                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
-                    ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                     ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
-                    "ldr q19, [%[b_ptr0], #0x10]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
                     ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
-                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
                     ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
                     ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
                     ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
                     ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
                     ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
-                    "ldr q21, [%[b_ptr0], #0x30]\n"
                     ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
                     ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
                     ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
                     ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
                     ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
                     ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
-                    "ldr q22, [%[b_ptr0], #0x40]\n"
                     ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
                     ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
                     ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
                     ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
                     ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
                     ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
-                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
                     ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
                     ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
                     ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
                     ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
                     ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
-                    "ldr q24, [%[b_ptr0], #0x60]\n"
                     ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
                     ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
                     ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
                     ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
                     ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
                     ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
-                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
                     ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
                     ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
                     ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
                     ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
                     ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q26, [%[c_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                    "movi v26.4s, #0\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
+                    "ldr q21, [%[b_ptr0], #0x30]\n"
+                    "ldr q22, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+                    "ldr q24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
+                    ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
+                    ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
+                    ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
+                    ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
+                    ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+                    ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+                    ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
+                    ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
+                    ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
+                    ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
+                    ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
+                    ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
+                    ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
+                    ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
+                    ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
+                    ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
+                    ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
+                    ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
+                    ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
+                    ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
+                    ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
+                    ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
+                    ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
+                    ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
+                    ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
+                    ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
+                    ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
+                    ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+                    ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
+                    ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
+                    ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
+                    ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
+                    ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
+                    ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
+                    ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
+                    ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
+                    "b 9f\n"
+                    "6:\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
+                    ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+                    ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
                     ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
-                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
                     "ldr q18, [%[b_ptr0]]\n"
-                    ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
+                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     "add %[b_ptr0], %[b_ptr0], #0x10\n"
+                    ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
                     ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
                     ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
                     ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
@@ -397,19 +473,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
                     ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
                     ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
-                    "6:\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -468,6 +539,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q3, [a_ptr1], #0x10\n"
                     "ldr q6, [a_ptr2], #0x10\n"
@@ -478,24 +550,35 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     "ldr q4, [a_ptr1], #0x10\n"
                     "ldr q7, [a_ptr2], #0x10\n"
                     "ldr q10, [a_ptr3], #0x10\n"
-                    "ldr q13, [a_ptr4], #0x10\n"
-                    "ldr q16, [a_ptr5], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr d2, [%[a_ptr0]]\n"
+                    "ldr q13, [a_ptr4], #0x10\n"
                     "ldr d5, [a_ptr1]\n"
+                    "ldr q16, [a_ptr5], #0x10\n"
                     "ldr d8, [a_ptr2]\n"
                     "ldr d11, [a_ptr3]\n"
                     "ldr d14, [a_ptr4]\n"
                     "ldr d17, [a_ptr5]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "ldr q3, [a_ptr1], #0x10\n"
+                    "ldr q6, [a_ptr2], #0x10\n"
+                    "ldr q9, [a_ptr3], #0x10\n"
+                    "ldr q12, [a_ptr4], #0x10\n"
+                    "ldr q15, [a_ptr5], #0x10\n"
+                    "ldr q1, [%[a_ptr0]], #0x10\n"
+                    "ldr q4, [a_ptr1], #0x10\n"
+                    "ldr q7, [a_ptr2], #0x10\n"
+                    "ldr q10, [a_ptr3], #0x10\n"
                     "ldr s2, [%[a_ptr0]], #0x4\n"
+                    "ldr q13, [a_ptr4], #0x10\n"
                     "ldr s5, [a_ptr1], #0x4\n"
+                    "ldr q16, [a_ptr5], #0x10\n"
                     "ldr s8, [a_ptr2], #0x4\n"
                     "ldr s11, [a_ptr3], #0x4\n"
                     "ldr s14, [a_ptr4], #0x4\n"
                     "ldr s17, [a_ptr5], #0x4\n"
-                    "subs %[odds], %[odds], #0x1\n"
                     "b.ne 4f\n"
                     "ld1 {v2.b}[4], [%[a_ptr0]]\n"
                     "ld1 {v5.b}[4], [a_ptr1]\n"
@@ -522,38 +605,40 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     "ld1 {v14.b}[6], [a_ptr4]\n"
                     "ld1 {v17.b}[6], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q18, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "ldr q20, [%[b_ptr0], #0x20]\n"
-                    "movi v29.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "ldr q21, [%[b_ptr0], #0x30]\n"
-                    "movi v30.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     "ldr q22, [%[b_ptr0], #0x40]\n"
-                    "movi v31.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     "ldr q23, [%[b_ptr0], #0x50]\n"
-                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     "ldr q24, [%[b_ptr0], #0x60]\n"
-                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     "ldr q25, [%[b_ptr0], #0x70]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
                     ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
@@ -606,144 +691,213 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
                     ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
                     ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q18, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr q19, [%[b_ptr0], #0x10]\n"
-                    "ldr q20, [%[b_ptr0], #0x20]\n"
-                    "ldr q21, [%[b_ptr0], #0x30]\n"
-                    "ldr q22, [%[b_ptr0], #0x40]\n"
-                    "ldr q23, [%[b_ptr0], #0x50]\n"
-                    "ldr q24, [%[b_ptr0], #0x60]\n"
-                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q26, [%[c_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                    "movi v26.4s, #0\n"
                     "subs %[loops], %[loops], #0x1\n"
-                    "str q27, [c_ptr1]\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v27.4s, #0\n"
+                    "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v27.4s, #0\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ldr q21, [%[b_ptr0], #0x30]\n"
+                    "ldr q22, [%[b_ptr0], #0x40]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+                    "ldr q24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
-                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
-                    ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
                     ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
                     "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
-                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
                     ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
                     ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
                     ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
                     ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
                     ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
-                    "ldr q21, [%[b_ptr0], #0x30]\n"
                     ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
                     ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
                     ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
                     ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
                     ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
                     ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
-                    "ldr q22, [%[b_ptr0], #0x40]\n"
                     ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
                     ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
                     ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
                     ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
                     ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
                     ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
-                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
                     ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
                     ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
                     ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
                     ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
                     ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
-                    "ldr q24, [%[b_ptr0], #0x60]\n"
                     ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
                     ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
                     ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
                     ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
                     ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
                     ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
-                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
                     ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
                     ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
                     ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
                     ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
                     ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
                     ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
                     ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
                     ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
                     ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
                     ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
-                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q26, [%[c_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                    "movi v26.4s, #0\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
+                    "ldr q21, [%[b_ptr0], #0x30]\n"
+                    "ldr q22, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+                    "ldr q24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
+                    ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
+                    ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
+                    ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+                    ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+                    ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
+                    ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
+                    ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
+                    ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
+                    ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
+                    ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
+                    ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
+                    ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
+                    ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
+                    ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
+                    ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
+                    ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
+                    ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
+                    ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
+                    ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
+                    ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
+                    ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
+                    ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
+                    ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
+                    ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
+                    ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
+                    ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+                    ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
+                    ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
+                    ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
+                    ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
+                    ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
+                    ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
+                    ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
+                    ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
+                    ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
+                    ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
+                    ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
+                    ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
+                    ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
+                    ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
+                    "b 9f\n"
+                    "6:\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
+                    ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+                    ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
                     ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
-                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
                     "ldr q18, [%[b_ptr0]]\n"
+                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
                     ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
                     ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
@@ -799,19 +953,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
                     ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
                     ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
-                    "6:\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -924,38 +1073,40 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     "ld1 {v14.b}[10], [a_ptr4]\n"
                     "ld1 {v17.b}[10], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q18, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "ldr q20, [%[b_ptr0], #0x20]\n"
-                    "movi v29.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "ldr q21, [%[b_ptr0], #0x30]\n"
-                    "movi v30.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     "ldr q22, [%[b_ptr0], #0x40]\n"
-                    "movi v31.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     "ldr q23, [%[b_ptr0], #0x50]\n"
-                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     "ldr q24, [%[b_ptr0], #0x60]\n"
-                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     "ldr q25, [%[b_ptr0], #0x70]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
                     ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
@@ -1015,62 +1166,60 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
                     ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
                     ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q18, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr q19, [%[b_ptr0], #0x10]\n"
-                    "ldr q20, [%[b_ptr0], #0x20]\n"
-                    "ldr q21, [%[b_ptr0], #0x30]\n"
-                    "ldr q22, [%[b_ptr0], #0x40]\n"
-                    "ldr q23, [%[b_ptr0], #0x50]\n"
-                    "ldr q24, [%[b_ptr0], #0x60]\n"
-                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q26, [%[c_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                    "movi v26.4s, #0\n"
                     "subs %[loops], %[loops], #0x1\n"
-                    "str q27, [c_ptr1]\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v27.4s, #0\n"
+                    "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v27.4s, #0\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ldr q21, [%[b_ptr0], #0x30]\n"
+                    "ldr q22, [%[b_ptr0], #0x40]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+                    "ldr q24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
-                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
-                    ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
                     ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
                     ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
                     "ldr q20, [%[b_ptr0], #0x20]\n"
@@ -1081,85 +1230,163 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
                     ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
                     ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
-                    "ldr q21, [%[b_ptr0], #0x30]\n"
                     ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
                     ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
                     ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
                     ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
                     ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
                     ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
-                    "ldr q22, [%[b_ptr0], #0x40]\n"
                     ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
                     ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
                     ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
                     ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
                     ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
                     ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
-                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
                     ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
                     ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
                     ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
                     ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
                     ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
-                    "ldr q24, [%[b_ptr0], #0x60]\n"
                     ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
                     ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
                     ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
                     ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
                     ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
                     ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
-                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
                     ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
                     ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
                     ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
                     ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
                     ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
                     ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
                     ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
                     ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
                     ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
                     ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
-                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     ".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n"
                     ".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n"
                     ".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n"
                     ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
                     ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
                     ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
-                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q26, [%[c_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                    "movi v26.4s, #0\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
+                    "ldr q21, [%[b_ptr0], #0x30]\n"
+                    "ldr q22, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+                    "ldr q24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
+                    ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
+                    ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
+                    ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
+                    ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+                    ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+                    ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
+                    ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
+                    ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
+                    ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
+                    ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
+                    ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
+                    ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
+                    ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
+                    ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
+                    ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
+                    ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
+                    ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
+                    ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
+                    ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
+                    ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
+                    ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
+                    ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
+                    ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
+                    ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
+                    ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
+                    ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
+                    ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+                    ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
+                    ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
+                    ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
+                    ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
+                    ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
+                    ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
+                    ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
+                    ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
+                    ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
+                    ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
+                    ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
+                    ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
+                    ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
+                    ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
+                    ".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n"
+                    ".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n"
+                    ".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n"
+                    ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
+                    ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
+                    ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
+                    "b 9f\n"
+                    "6:\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
+                    ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+                    ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
                     ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
-                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
                     "ldr q18, [%[b_ptr0]]\n"
+                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
                     ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
                     ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
@@ -1222,19 +1449,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
                     ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
                     ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
-                    "6:\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -1293,6 +1515,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q3, [a_ptr1], #0x10\n"
                     "ldr q6, [a_ptr2], #0x10\n"
@@ -1305,7 +1528,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     "ldr q10, [a_ptr3], #0x10\n"
                     "ldr q13, [a_ptr4], #0x10\n"
                     "ldr q16, [a_ptr5], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr q2, [%[a_ptr0]]\n"
                     "ldr q5, [a_ptr1]\n"
                     "ldr q8, [a_ptr2]\n"
@@ -1314,8 +1536,21 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     "ldr q17, [a_ptr5]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "ldr q3, [a_ptr1], #0x10\n"
+                    "ldr q6, [a_ptr2], #0x10\n"
+                    "ldr q9, [a_ptr3], #0x10\n"
+                    "ldr q12, [a_ptr4], #0x10\n"
+                    "ldr q15, [a_ptr5], #0x10\n"
+                    "ldr q1, [%[a_ptr0]], #0x10\n"
+                    "ldr q4, [a_ptr1], #0x10\n"
+                    "ldr q7, [a_ptr2], #0x10\n"
+                    "ldr q10, [a_ptr3], #0x10\n"
                     "ldr d2, [%[a_ptr0]], #0x8\n"
+                    "ldr q13, [a_ptr4], #0x10\n"
                     "ldr d5, [a_ptr1], #0x8\n"
+                    "ldr q16, [a_ptr5], #0x10\n"
                     "ldr d8, [a_ptr2], #0x8\n"
                     "ldr d11, [a_ptr3], #0x8\n"
                     "ldr d14, [a_ptr4], #0x8\n"
@@ -1326,7 +1561,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     "ld1 {v11.s}[2], [a_ptr3], #4\n"
                     "ld1 {v14.s}[2], [a_ptr4], #4\n"
                     "ld1 {v17.s}[2], [a_ptr5], #4\n"
-                    "subs %[odds], %[odds], #0x1\n"
                     "b.ne 4f\n"
                     "ld1 {v2.b}[12], [%[a_ptr0]]\n"
                     "ld1 {v5.b}[12], [a_ptr1]\n"
@@ -1353,38 +1587,40 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     "ld1 {v14.b}[14], [a_ptr4]\n"
                     "ld1 {v17.b}[14], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q18, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "ldr q20, [%[b_ptr0], #0x20]\n"
-                    "movi v29.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "ldr q21, [%[b_ptr0], #0x30]\n"
-                    "movi v30.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     "ldr q22, [%[b_ptr0], #0x40]\n"
-                    "movi v31.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     "ldr q23, [%[b_ptr0], #0x50]\n"
-                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     "ldr q24, [%[b_ptr0], #0x60]\n"
-                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     "ldr q25, [%[b_ptr0], #0x70]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
                     ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
@@ -1451,62 +1687,60 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n"
                     ".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n"
                     ".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q18, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr q19, [%[b_ptr0], #0x10]\n"
-                    "ldr q20, [%[b_ptr0], #0x20]\n"
-                    "ldr q21, [%[b_ptr0], #0x30]\n"
-                    "ldr q22, [%[b_ptr0], #0x40]\n"
-                    "ldr q23, [%[b_ptr0], #0x50]\n"
-                    "ldr q24, [%[b_ptr0], #0x60]\n"
-                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q26, [%[c_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                    "movi v26.4s, #0\n"
                     "subs %[loops], %[loops], #0x1\n"
-                    "str q27, [c_ptr1]\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v27.4s, #0\n"
+                    "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v27.4s, #0\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ldr q21, [%[b_ptr0], #0x30]\n"
+                    "ldr q22, [%[b_ptr0], #0x40]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+                    "ldr q24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
-                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
-                    ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
                     ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
                     ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
                     "ldr q20, [%[b_ptr0], #0x20]\n"
@@ -1524,85 +1758,170 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
                     ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
                     ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
-                    "ldr q22, [%[b_ptr0], #0x40]\n"
                     ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
                     ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
                     ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
                     ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
                     ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
                     ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
-                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
                     ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
                     ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
                     ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
                     ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
                     ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
-                    "ldr q24, [%[b_ptr0], #0x60]\n"
                     ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
                     ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
                     ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
                     ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
                     ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
                     ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
-                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
                     ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
                     ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
                     ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
                     ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
                     ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
                     ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
                     ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
                     ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
                     ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
                     ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
-                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     ".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n"
                     ".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n"
                     ".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n"
                     ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
                     ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
                     ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
-                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4fa2eaba // sdot v26.4s, v21.16b, v2.4b[3]\n"
                     ".inst 0x4fa5eabb // sdot v27.4s, v21.16b, v5.4b[3]\n"
                     ".inst 0x4fa8eabc // sdot v28.4s, v21.16b, v8.4b[3]\n"
                     ".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n"
                     ".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n"
                     ".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n"
-                    "ldr q21, [%[b_ptr0], #0x30]\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q26, [%[c_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                    "movi v26.4s, #0\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
+                    "ldr q21, [%[b_ptr0], #0x30]\n"
+                    "ldr q22, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+                    "ldr q24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
+                    ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
+                    ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
+                    ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
+                    ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+                    ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+                    ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
+                    ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
+                    ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
+                    ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
+                    ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
+                    ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
+                    ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
+                    ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
+                    ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
+                    "ldr q21, [%[b_ptr0], #0x30]\n"
+                    ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
+                    ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
+                    ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
+                    ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
+                    ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
+                    ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
+                    ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
+                    ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
+                    ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
+                    ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
+                    ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
+                    ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+                    ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
+                    ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
+                    ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
+                    ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
+                    ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
+                    ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
+                    ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
+                    ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
+                    ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
+                    ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
+                    ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
+                    ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
+                    ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
+                    ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
+                    ".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n"
+                    ".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n"
+                    ".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n"
+                    ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
+                    ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
+                    ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
+                    ".inst 0x4fa2eaba // sdot v26.4s, v21.16b, v2.4b[3]\n"
+                    ".inst 0x4fa5eabb // sdot v27.4s, v21.16b, v5.4b[3]\n"
+                    ".inst 0x4fa8eabc // sdot v28.4s, v21.16b, v8.4b[3]\n"
+                    ".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n"
+                    ".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n"
+                    ".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n"
+                    "b 9f\n"
+                    "6:\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
+                    ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+                    ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
                     ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
-                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
                     "ldr q18, [%[b_ptr0]]\n"
+                    ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
                     ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
                     ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
                     ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
@@ -1672,19 +1991,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n"
                     ".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n"
                     ".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n"
-                    "6:\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -1743,6 +2057,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q4, [a_ptr1], #0x10\n"
                     "ldr q8, [a_ptr2], #0x10\n"
@@ -1759,18 +2074,35 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     "ldr q6, [a_ptr1], #0x10\n"
                     "ldr q10, [a_ptr2], #0x10\n"
                     "ldr q14, [a_ptr3], #0x10\n"
-                    "ldr q18, [a_ptr4], #0x10\n"
-                    "ldr q22, [a_ptr5], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr s3, [%[a_ptr0]]\n"
+                    "ldr q18, [a_ptr4], #0x10\n"
                     "ldr s7, [a_ptr1]\n"
+                    "ldr q22, [a_ptr5], #0x10\n"
                     "ldr s11, [a_ptr2]\n"
                     "ldr s15, [a_ptr3]\n"
                     "ldr s19, [a_ptr4]\n"
                     "ldr s23, [a_ptr5]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
                     "subs %[odds], %[odds], #0x1\n"
+                    "ldr q4, [a_ptr1], #0x10\n"
+                    "ldr q8, [a_ptr2], #0x10\n"
+                    "ldr q12, [a_ptr3], #0x10\n"
+                    "ldr q16, [a_ptr4], #0x10\n"
+                    "ldr q20, [a_ptr5], #0x10\n"
+                    "ldr q1, [%[a_ptr0]], #0x10\n"
+                    "ldr q5, [a_ptr1], #0x10\n"
+                    "ldr q9, [a_ptr2], #0x10\n"
+                    "ldr q13, [a_ptr3], #0x10\n"
+                    "ldr q17, [a_ptr4], #0x10\n"
+                    "ldr q21, [a_ptr5], #0x10\n"
+                    "ldr q2, [%[a_ptr0]], #0x10\n"
+                    "ldr q6, [a_ptr1], #0x10\n"
+                    "ldr q10, [a_ptr2], #0x10\n"
+                    "ldr q14, [a_ptr3], #0x10\n"
+                    "ldr q18, [a_ptr4], #0x10\n"
+                    "ldr q22, [a_ptr5], #0x10\n"
                     "b.ne 4f\n"
                     "ldr b3, [%[a_ptr0]]\n"
                     "ldr b7, [a_ptr1]\n"
@@ -1797,24 +2129,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     "ld1 {v19.b}[2], [a_ptr4]\n"
                     "ld1 {v23.b}[2], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q24, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q25, [%[b_ptr0], #0x10]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
                     "movi v28.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
                     ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
                     ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
@@ -1907,38 +2241,36 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
                     ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
                     ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q24, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr q25, [%[b_ptr0], #0x10]\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q26, [%[c_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    "movi v26.4s, #0\n"
                     "subs %[loops], %[loops], #0x1\n"
-                    "str q27, [c_ptr1]\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v27.4s, #0\n"
+                    "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v27.4s, #0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
                     ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
                     "ldr q24, [%[b_ptr0]]\n"
                     ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
@@ -2028,20 +2360,20 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
                     ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
                     ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
-                    "ldr q25, [%[b_ptr0], #0x10]\n"
                     ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
                     ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
                     ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
                     ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
                     ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
                     ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
-                    "ldr q24, [%[b_ptr0]]\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q26, [%[c_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    "movi v26.4s, #0\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
@@ -2152,19 +2484,117 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
                     ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
                     ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+                    ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+                    ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+                    ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+                    ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+                    ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
+                    ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
+                    ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
+                    ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
+                    ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+                    ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
+                    ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
+                    ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
+                    ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n"
+                    ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n"
+                    ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n"
+                    ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n"
+                    ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n"
+                    ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n"
+                    ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n"
+                    ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n"
+                    ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n"
+                    ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n"
+                    ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n"
+                    ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n"
+                    ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n"
+                    ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n"
+                    ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n"
+                    ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n"
+                    ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n"
+                    ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n"
+                    ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n"
+                    ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n"
+                    ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n"
+                    ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n"
+                    ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n"
+                    ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n"
+                    ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n"
+                    ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n"
+                    ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n"
+                    ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n"
+                    ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n"
+                    ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
+                    ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
+                    ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
+                    ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
+                    ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
+                    ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
+                    ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
+                    ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
+                    ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
+                    ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
+                    ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -2223,6 +2653,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q4, [a_ptr1], #0x10\n"
                     "ldr q8, [a_ptr2], #0x10\n"
@@ -2239,24 +2670,41 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     "ldr q6, [a_ptr1], #0x10\n"
                     "ldr q10, [a_ptr2], #0x10\n"
                     "ldr q14, [a_ptr3], #0x10\n"
-                    "ldr q18, [a_ptr4], #0x10\n"
-                    "ldr q22, [a_ptr5], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr d3, [%[a_ptr0]]\n"
+                    "ldr q18, [a_ptr4], #0x10\n"
                     "ldr d7, [a_ptr1]\n"
+                    "ldr q22, [a_ptr5], #0x10\n"
                     "ldr d11, [a_ptr2]\n"
                     "ldr d15, [a_ptr3]\n"
                     "ldr d19, [a_ptr4]\n"
                     "ldr d23, [a_ptr5]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "ldr q4, [a_ptr1], #0x10\n"
+                    "ldr q8, [a_ptr2], #0x10\n"
+                    "ldr q12, [a_ptr3], #0x10\n"
+                    "ldr q16, [a_ptr4], #0x10\n"
+                    "ldr q20, [a_ptr5], #0x10\n"
+                    "ldr q1, [%[a_ptr0]], #0x10\n"
+                    "ldr q5, [a_ptr1], #0x10\n"
+                    "ldr q9, [a_ptr2], #0x10\n"
+                    "ldr q13, [a_ptr3], #0x10\n"
+                    "ldr q17, [a_ptr4], #0x10\n"
+                    "ldr q21, [a_ptr5], #0x10\n"
+                    "ldr q2, [%[a_ptr0]], #0x10\n"
+                    "ldr q6, [a_ptr1], #0x10\n"
+                    "ldr q10, [a_ptr2], #0x10\n"
+                    "ldr q14, [a_ptr3], #0x10\n"
                     "ldr s3, [%[a_ptr0]], #0x4\n"
+                    "ldr q18, [a_ptr4], #0x10\n"
                     "ldr s7, [a_ptr1], #0x4\n"
+                    "ldr q22, [a_ptr5], #0x10\n"
                     "ldr s11, [a_ptr2], #0x4\n"
                     "ldr s15, [a_ptr3], #0x4\n"
                     "ldr s19, [a_ptr4], #0x4\n"
                     "ldr s23, [a_ptr5], #0x4\n"
-                    "subs %[odds], %[odds], #0x1\n"
                     "b.ne 4f\n"
                     "ld1 {v3.b}[4], [%[a_ptr0]]\n"
                     "ld1 {v7.b}[4], [a_ptr1]\n"
@@ -2283,24 +2731,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     "ld1 {v19.b}[6], [a_ptr4]\n"
                     "ld1 {v23.b}[6], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q24, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q25, [%[b_ptr0], #0x10]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
                     "movi v28.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
                     ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
                     ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
@@ -2400,38 +2850,36 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
                     ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
                     ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q24, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v26.4s, #0\n"
+                    "ldr q24, [%[b_ptr0]]\n"
                     "ldr q25, [%[b_ptr0], #0x10]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    "str q27, [c_ptr1]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v27.4s, #0\n"
+                    "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v27.4s, #0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
                     ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
                     "ldr q24, [%[b_ptr0]]\n"
                     ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
@@ -2528,7 +2976,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
                     ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
                     ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
-                    "ldr q24, [%[b_ptr0]]\n"
                     ".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n"
                     ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n"
                     ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n"
@@ -2540,6 +2987,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v26.4s, #0\n"
+                    "ldr q24, [%[b_ptr0]]\n"
                     "ldr q25, [%[b_ptr0], #0x10]\n"
                     "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q27, [c_ptr1]\n"
@@ -2659,19 +3107,124 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
                     ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
                     ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+                    ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+                    ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+                    ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+                    ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+                    ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
+                    ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
+                    ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
+                    ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
+                    ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+                    ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
+                    ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
+                    ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
+                    ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n"
+                    ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n"
+                    ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n"
+                    ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n"
+                    ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n"
+                    ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n"
+                    ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n"
+                    ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n"
+                    ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n"
+                    ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n"
+                    ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n"
+                    ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n"
+                    ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n"
+                    ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n"
+                    ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n"
+                    ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n"
+                    ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n"
+                    ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n"
+                    ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n"
+                    ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n"
+                    ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n"
+                    ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n"
+                    ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n"
+                    ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n"
+                    ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n"
+                    ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n"
+                    ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n"
+                    ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n"
+                    ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n"
+                    ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n"
+                    ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
+                    ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
+                    ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
+                    ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
+                    ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
+                    ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
+                    ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
+                    ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+                    ".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n"
+                    ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n"
+                    ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n"
+                    ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
+                    ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
+                    ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -2790,24 +3343,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     "ld1 {v19.b}[10], [a_ptr4]\n"
                     "ld1 {v23.b}[10], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q24, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q25, [%[b_ptr0], #0x10]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
                     "movi v28.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
                     ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
                     ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
@@ -2915,38 +3470,36 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
                     ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
                     ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q24, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr q25, [%[b_ptr0], #0x10]\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q26, [%[c_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    "movi v26.4s, #0\n"
                     "subs %[loops], %[loops], #0x1\n"
-                    "str q27, [c_ptr1]\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v27.4s, #0\n"
+                    "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v27.4s, #0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
                     ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
                     "ldr q24, [%[b_ptr0]]\n"
                     ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
@@ -3051,20 +3604,20 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
                     ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
                     ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
-                    "ldr q25, [%[b_ptr0], #0x10]\n"
                     ".inst 0x4f83eb1a // sdot v26.4s, v24.16b, v3.4b[2]\n"
                     ".inst 0x4f87eb1b // sdot v27.4s, v24.16b, v7.4b[2]\n"
                     ".inst 0x4f8beb1c // sdot v28.4s, v24.16b, v11.4b[2]\n"
                     ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
                     ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
                     ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
-                    "ldr q24, [%[b_ptr0]]\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q26, [%[c_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    "movi v26.4s, #0\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
@@ -3190,19 +3743,132 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
                     ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
                     ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+                    ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+                    ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+                    ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+                    ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+                    ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
+                    ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
+                    ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
+                    ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
+                    ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+                    ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
+                    ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
+                    ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
+                    ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n"
+                    ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n"
+                    ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n"
+                    ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n"
+                    ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n"
+                    ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n"
+                    ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n"
+                    ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n"
+                    ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n"
+                    ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n"
+                    ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n"
+                    ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n"
+                    ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n"
+                    ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n"
+                    ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n"
+                    ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n"
+                    ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n"
+                    ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n"
+                    ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n"
+                    ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n"
+                    ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n"
+                    ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n"
+                    ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n"
+                    ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n"
+                    ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n"
+                    ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n"
+                    ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n"
+                    ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n"
+                    ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n"
+                    ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n"
+                    ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
+                    ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
+                    ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
+                    ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
+                    ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
+                    ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
+                    ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
+                    ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
+                    ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n"
+                    ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n"
+                    ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
+                    ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
+                    ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
+                    ".inst 0x4f83eb1a // sdot v26.4s, v24.16b, v3.4b[2]\n"
+                    ".inst 0x4f87eb1b // sdot v27.4s, v24.16b, v7.4b[2]\n"
+                    ".inst 0x4f8beb1c // sdot v28.4s, v24.16b, v11.4b[2]\n"
+                    ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
+                    ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
+                    ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -3262,6 +3928,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q4, [a_ptr1], #0x10\n"
                     "ldr q8, [a_ptr2], #0x10\n"
@@ -3280,7 +3947,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     "ldr q14, [a_ptr3], #0x10\n"
                     "ldr q18, [a_ptr4], #0x10\n"
                     "ldr q22, [a_ptr5], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr q3, [%[a_ptr0]]\n"
                     "ldr q7, [a_ptr1]\n"
                     "ldr q11, [a_ptr2]\n"
@@ -3289,8 +3955,27 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     "ldr q23, [a_ptr5]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "ldr q4, [a_ptr1], #0x10\n"
+                    "ldr q8, [a_ptr2], #0x10\n"
+                    "ldr q12, [a_ptr3], #0x10\n"
+                    "ldr q16, [a_ptr4], #0x10\n"
+                    "ldr q20, [a_ptr5], #0x10\n"
+                    "ldr q1, [%[a_ptr0]], #0x10\n"
+                    "ldr q5, [a_ptr1], #0x10\n"
+                    "ldr q9, [a_ptr2], #0x10\n"
+                    "ldr q13, [a_ptr3], #0x10\n"
+                    "ldr q17, [a_ptr4], #0x10\n"
+                    "ldr q21, [a_ptr5], #0x10\n"
+                    "ldr q2, [%[a_ptr0]], #0x10\n"
+                    "ldr q6, [a_ptr1], #0x10\n"
+                    "ldr q10, [a_ptr2], #0x10\n"
+                    "ldr q14, [a_ptr3], #0x10\n"
                     "ldr d3, [%[a_ptr0]], #0x8\n"
+                    "ldr q18, [a_ptr4], #0x10\n"
                     "ldr d7, [a_ptr1], #0x8\n"
+                    "ldr q22, [a_ptr5], #0x10\n"
                     "ldr d11, [a_ptr2], #0x8\n"
                     "ldr d15, [a_ptr3], #0x8\n"
                     "ldr d19, [a_ptr4], #0x8\n"
@@ -3301,7 +3986,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     "ld1 {v15.s}[2], [a_ptr3], #4\n"
                     "ld1 {v19.s}[2], [a_ptr4], #4\n"
                     "ld1 {v23.s}[2], [a_ptr5], #4\n"
-                    "subs %[odds], %[odds], #0x1\n"
                     "b.ne 4f\n"
                     "ld1 {v3.b}[12], [%[a_ptr0]]\n"
                     "ld1 {v7.b}[12], [a_ptr1]\n"
@@ -3328,24 +4012,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     "ld1 {v19.b}[14], [a_ptr4]\n"
                     "ld1 {v23.b}[14], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q24, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q25, [%[b_ptr0], #0x10]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
                     "movi v28.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
                     ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
                     ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
@@ -3460,38 +4146,36 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4fafeb3d // sdot v29.4s, v25.16b, v15.4b[3]\n"
                     ".inst 0x4fb3eb3e // sdot v30.4s, v25.16b, v19.4b[3]\n"
                     ".inst 0x4fb7eb3f // sdot v31.4s, v25.16b, v23.4b[3]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q24, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v26.4s, #0\n"
+                    "ldr q24, [%[b_ptr0]]\n"
                     "ldr q25, [%[b_ptr0], #0x10]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    "str q27, [c_ptr1]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v27.4s, #0\n"
+                    "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v27.4s, #0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
                     ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
                     "ldr q24, [%[b_ptr0]]\n"
                     ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
@@ -3603,7 +4287,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
                     ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
                     ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
-                    "ldr q24, [%[b_ptr0]]\n"
                     ".inst 0x4fa3eb3a // sdot v26.4s, v25.16b, v3.4b[3]\n"
                     ".inst 0x4fa7eb3b // sdot v27.4s, v25.16b, v7.4b[3]\n"
                     ".inst 0x4fabeb3c // sdot v28.4s, v25.16b, v11.4b[3]\n"
@@ -3615,6 +4298,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v26.4s, #0\n"
+                    "ldr q24, [%[b_ptr0]]\n"
                     "ldr q25, [%[b_ptr0], #0x10]\n"
                     "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q27, [c_ptr1]\n"
@@ -3749,19 +4433,139 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4fafeb3d // sdot v29.4s, v25.16b, v15.4b[3]\n"
                     ".inst 0x4fb3eb3e // sdot v30.4s, v25.16b, v19.4b[3]\n"
                     ".inst 0x4fb7eb3f // sdot v31.4s, v25.16b, v23.4b[3]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+                    ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+                    ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+                    ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+                    ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+                    ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
+                    ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
+                    ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
+                    ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
+                    ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+                    ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
+                    ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
+                    ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
+                    ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n"
+                    ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n"
+                    ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n"
+                    ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n"
+                    ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n"
+                    ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n"
+                    ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n"
+                    ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n"
+                    ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n"
+                    ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n"
+                    ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n"
+                    ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n"
+                    ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n"
+                    ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n"
+                    ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n"
+                    ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n"
+                    ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n"
+                    ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n"
+                    ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n"
+                    ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n"
+                    ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n"
+                    ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n"
+                    ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n"
+                    ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n"
+                    ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n"
+                    ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n"
+                    ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n"
+                    ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n"
+                    ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n"
+                    ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n"
+                    ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
+                    ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
+                    ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
+                    ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
+                    ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
+                    ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
+                    ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
+                    ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n"
+                    ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n"
+                    ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n"
+                    ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
+                    ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
+                    ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x4f83eb1a // sdot v26.4s, v24.16b, v3.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x4f87eb1b // sdot v27.4s, v24.16b, v7.4b[2]\n"
+                    ".inst 0x4f8beb1c // sdot v28.4s, v24.16b, v11.4b[2]\n"
+                    ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
+                    ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
+                    ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
+                    ".inst 0x4fa3eb3a // sdot v26.4s, v25.16b, v3.4b[3]\n"
+                    ".inst 0x4fa7eb3b // sdot v27.4s, v25.16b, v7.4b[3]\n"
+                    ".inst 0x4fabeb3c // sdot v28.4s, v25.16b, v11.4b[3]\n"
+                    ".inst 0x4fafeb3d // sdot v29.4s, v25.16b, v15.4b[3]\n"
+                    ".inst 0x4fb3eb3e // sdot v30.4s, v25.16b, v19.4b[3]\n"
+                    ".inst 0x4fb7eb3f // sdot v31.4s, v25.16b, v23.4b[3]\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp
index 3de708cc68..9f9c2a49db 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp
@@ -31,10 +31,10 @@ namespace arm_gemm
 {
 
 // Actual kernel implementations
-void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
-void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+void a64_smallK_hybrid_s8s32_dot_8x4(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+void a64_smallK_hybrid_s8s32_dot_8x4_a55(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
 
-class smallK_hybrid_s8s32_dot_4x8
+class cls_a64_smallK_hybrid_s8s32_dot_8x4
 {
 public:
     typedef int8_t operand_type;
@@ -76,12 +76,12 @@ public:
     StdTransformsFixed<operand_type, result_type, 8, 4, 4> transforms = {};
 
     // Default to the generic kernel
-    kern_type kernel=a64_smallK_hybrid_s8s32_dot_4x8;
+    kern_type kernel=a64_smallK_hybrid_s8s32_dot_8x4;
 
-    smallK_hybrid_s8s32_dot_4x8(const CPUInfo *ci)
+    cls_a64_smallK_hybrid_s8s32_dot_8x4(const CPUInfo *ci)
     {
         if (ci->get_cpu_model() == CPUModel::A55r1) {
-            kernel = a64_smallK_hybrid_s8s32_dot_4x8_a55;
+            kernel = a64_smallK_hybrid_s8s32_dot_8x4_a55;
         }
     }
 };
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/a55.cpp
index 7135f2eee6..aba6e0d100 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/a55.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,7 +33,7 @@
 
 namespace arm_gemm {
 
-void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
+void a64_smallK_hybrid_s8s32_dot_8x4_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
     const long loops_count = iceildiv(N, (int)4) - 1;
     const long ldab = lda * sizeof(int8_t);
     const long ldcb = ldc * sizeof(int32_t);
@@ -157,22 +157,24 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     "ld1 {v6.b}[2], [a_ptr6]\n"
                     "ld1 {v7.b}[2], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
                     "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
-                    "movi v26.4s, #0\n"
                     "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
-                    "movi v27.4s, #0\n"
                     "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
-                    "movi v28.4s, #0\n"
                     "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
-                    "movi v29.4s, #0\n"
                     "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
-                    "movi v30.4s, #0\n"
                     "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
-                    "movi v31.4s, #0\n"
                     "add %[b_ptr0], %[b_ptr0], #0x10\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
                     ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
@@ -181,55 +183,49 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
                     ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
-                    "ins v16.d[1], temploadreg0\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
+                    "ldr d16, [%[b_ptr0]]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
-                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+                    "ins v16.d[1], temploadreg0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "movi v26.4s, #0\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+                    "movi v26.4s, #0\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q27, [c_ptr3]\n"
                     "movi v27.4s, #0\n"
                     "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+                    ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
                     "str q29, [c_ptr5]\n"
                     "movi v29.4s, #0\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
-                    ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
                     "add c_ptr6, c_ptr6, #0x10\n"
-                    ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+                    ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
-                    ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
                     "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
-                    ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
-                    "ldr d16, [%[b_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
+                    ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
                     "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
                     "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
-                    "ins v16.d[1], temploadreg0\n"
                     "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
@@ -239,6 +235,8 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
@@ -268,23 +266,34 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+                    ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+                    ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+                    ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -423,24 +432,26 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     "ld1 {v6.b}[6], [a_ptr6]\n"
                     "ld1 {v7.b}[6], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
                     "movi v26.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "movi v27.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "movi v28.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
                     ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
                     ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
@@ -456,78 +467,72 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
                     ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
                     ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    "ldr d17, [%[b_ptr0], #0x10]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    "ins v16.d[1], temploadreg0\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
-                    "ins v17.d[1], temploadreg1\n"
+                    "ldr d16, [%[b_ptr0]]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr d17, [%[b_ptr0], #0x10]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q26, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     "movi v26.4s, #0\n"
+                    "ins v16.d[1], temploadreg0\n"
+                    "ins v17.d[1], temploadreg1\n"
                     "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
-                    ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
                     "str q27, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
                     "movi v27.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
+                    ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
                     "str q29, [c_ptr5]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
-                    ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+                    "add c_ptr6, c_ptr6, #0x10\n"
+                    ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
-                    ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
+                    ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
-                    "ldr d16, [%[b_ptr0]]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
-                    "ins v16.d[1], temploadreg0\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
-                    "ldr d17, [%[b_ptr0], #0x10]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
-                    "ins v17.d[1], temploadreg1\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
@@ -565,23 +570,42 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
                     ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
                     ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+                    ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+                    ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+                    ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+                    ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
+                    ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
+                    ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
+                    ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
+                    ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -720,26 +744,28 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     "ld1 {v6.b}[10], [a_ptr6]\n"
                     "ld1 {v7.b}[10], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "movi v26.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ldr q18, [%[b_ptr0], #0x20]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
                     "movi v27.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "movi v28.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
                     ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
                     ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
                     ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
@@ -762,95 +788,86 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
                     ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
                     ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    "ldr d17, [%[b_ptr0], #0x10]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    "ldr d18, [%[b_ptr0], #0x20]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
-                    "ins v16.d[1], temploadreg0\n"
-                    "ins v17.d[1], temploadreg1\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
-                    "ins v18.d[1], temploadreg2\n"
+                    "ldr d16, [%[b_ptr0]]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr d17, [%[b_ptr0], #0x10]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "str q26, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     "movi v26.4s, #0\n"
+                    "ldr d18, [%[b_ptr0], #0x20]\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
                     "str q27, [c_ptr3]\n"
-                    "movi v27.4s, #0\n"
                     "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+                    "movi v27.4s, #0\n"
+                    "ins v16.d[1], temploadreg0\n"
+                    "ins v17.d[1], temploadreg1\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "str q28, [c_ptr4]\n"
-                    "movi v28.4s, #0\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+                    "movi v28.4s, #0\n"
+                    "ins v18.d[1], temploadreg2\n"
+                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q29, [c_ptr5]\n"
                     "movi v29.4s, #0\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
-                    ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
                     "add c_ptr6, c_ptr6, #0x10\n"
-                    ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+                    ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
+                    ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
-                    "ldr d16, [%[b_ptr0]]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
-                    "ins v16.d[1], temploadreg0\n"
                     ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
-                    "ldr d17, [%[b_ptr0], #0x10]\n"
                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
-                    "ins v17.d[1], temploadreg1\n"
                     ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
                     ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
                     ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
                     ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
                     ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
                     ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
-                    "ldr d18, [%[b_ptr0], #0x20]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
-                    "ins v18.d[1], temploadreg2\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
@@ -876,8 +893,9 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     "movi v31.4s, #0\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
-                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                     ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
                     ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
                     ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
@@ -893,23 +911,50 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
                     ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
                     ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+                    ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+                    ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+                    ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+                    ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
+                    ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
+                    ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
+                    ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
+                    ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
+                    ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+                    ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
+                    ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
+                    ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
+                    ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
+                    ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
+                    ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
+                    ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -1056,28 +1101,30 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     "ld1 {v6.b}[14], [a_ptr6]\n"
                     "ld1 {v7.b}[14], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "movi v26.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
                     "movi v28.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
                     ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
                     ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
                     ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
@@ -1107,112 +1154,101 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
                     ".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
                     ".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    "ldr d17, [%[b_ptr0], #0x10]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    "ldr d18, [%[b_ptr0], #0x20]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                    "ldr d19, [%[b_ptr0], #0x30]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                    "ins v16.d[1], temploadreg0\n"
-                    "ins v17.d[1], temploadreg1\n"
-                    "ins v18.d[1], temploadreg2\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
-                    "ins v19.d[1], temploadreg3\n"
+                    "ldr d16, [%[b_ptr0]]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr d17, [%[b_ptr0], #0x10]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "str q26, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     "movi v26.4s, #0\n"
+                    "ldr d18, [%[b_ptr0], #0x20]\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "str q27, [c_ptr3]\n"
-                    "movi v27.4s, #0\n"
                     "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+                    "movi v27.4s, #0\n"
+                    "ldr d19, [%[b_ptr0], #0x30]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
                     "str q28, [c_ptr4]\n"
-                    "movi v28.4s, #0\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+                    "movi v28.4s, #0\n"
+                    "ins v16.d[1], temploadreg0\n"
+                    "ins v17.d[1], temploadreg1\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "str q29, [c_ptr5]\n"
-                    "movi v29.4s, #0\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
-                    ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+                    "movi v29.4s, #0\n"
+                    "ins v18.d[1], temploadreg2\n"
+                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
+                    "ins v19.d[1], temploadreg3\n"
+                    ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
                     "add c_ptr6, c_ptr6, #0x10\n"
-                    ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+                    ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
+                    ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
-                    "ldr d16, [%[b_ptr0]]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
-                    "ins v16.d[1], temploadreg0\n"
                     ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
-                    "ldr d17, [%[b_ptr0], #0x10]\n"
                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
-                    "ins v17.d[1], temploadreg1\n"
                     ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
                     ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
                     ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
                     ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
                     ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
                     ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
-                    "ldr d18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
                     ".inst 0x4fa1ea79 // sdot v25.4s, v19.16b, v1.4b[3]\n"
                     ".inst 0x4fa2ea7a // sdot v26.4s, v19.16b, v2.4b[3]\n"
-                    "ins v18.d[1], temploadreg2\n"
                     ".inst 0x4fa3ea7b // sdot v27.4s, v19.16b, v3.4b[3]\n"
                     ".inst 0x4fa4ea7c // sdot v28.4s, v19.16b, v4.4b[3]\n"
                     ".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
                     ".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
                     ".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
-                    "ldr d19, [%[b_ptr0], #0x30]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
-                    "ins v19.d[1], temploadreg3\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
                     "str q27, [c_ptr3]\n"
@@ -1235,8 +1271,9 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     "movi v31.4s, #0\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
-                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                     ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
                     ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
                     ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
@@ -1260,23 +1297,58 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
                     ".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
                     ".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+                    ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+                    ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+                    ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+                    ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
+                    ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
+                    ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
+                    ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
+                    ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
+                    ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+                    ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
+                    ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
+                    ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
+                    ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
+                    ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
+                    ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
+                    ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
+                    ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+                    ".inst 0x4fa1ea79 // sdot v25.4s, v19.16b, v1.4b[3]\n"
+                    ".inst 0x4fa2ea7a // sdot v26.4s, v19.16b, v2.4b[3]\n"
+                    ".inst 0x4fa3ea7b // sdot v27.4s, v19.16b, v3.4b[3]\n"
+                    ".inst 0x4fa4ea7c // sdot v28.4s, v19.16b, v4.4b[3]\n"
+                    ".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
+                    ".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
+                    ".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -1363,26 +1435,34 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q2, [a_ptr1], #0x10\n"
                     "ldr q4, [a_ptr2], #0x10\n"
                     "ldr q6, [a_ptr3], #0x10\n"
-                    "ldr q8, [a_ptr4], #0x10\n"
-                    "ldr q10, [a_ptr5], #0x10\n"
-                    "ldr q12, [a_ptr6], #0x10\n"
-                    "ldr q14, [a_ptr7], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr s1, [%[a_ptr0]]\n"
+                    "ldr q8, [a_ptr4], #0x10\n"
                     "ldr s3, [a_ptr1]\n"
+                    "ldr q10, [a_ptr5], #0x10\n"
                     "ldr s5, [a_ptr2]\n"
+                    "ldr q12, [a_ptr6], #0x10\n"
                     "ldr s7, [a_ptr3]\n"
+                    "ldr q14, [a_ptr7], #0x10\n"
                     "ldr s9, [a_ptr4]\n"
                     "ldr s11, [a_ptr5]\n"
                     "ldr s13, [a_ptr6]\n"
                     "ldr s15, [a_ptr7]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
                     "subs %[odds], %[odds], #0x1\n"
+                    "ldr q2, [a_ptr1], #0x10\n"
+                    "ldr q4, [a_ptr2], #0x10\n"
+                    "ldr q6, [a_ptr3], #0x10\n"
+                    "ldr q8, [a_ptr4], #0x10\n"
+                    "ldr q10, [a_ptr5], #0x10\n"
+                    "ldr q12, [a_ptr6], #0x10\n"
+                    "ldr q14, [a_ptr7], #0x10\n"
                     "b.ne 4f\n"
                     "ldr b1, [%[a_ptr0]]\n"
                     "ldr b3, [a_ptr1]\n"
@@ -1415,30 +1495,32 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     "ld1 {v13.b}[2], [a_ptr6]\n"
                     "ld1 {v15.b}[2], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "movi v26.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ldr q19, [%[b_ptr0], #0x30]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ldr q20, [%[b_ptr0], #0x40]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x50\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x50\n"
                     ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
                     ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
@@ -1475,126 +1557,113 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
                     ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
                     ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    "ldr d17, [%[b_ptr0], #0x10]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    "ldr d18, [%[b_ptr0], #0x20]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                    "ldr d19, [%[b_ptr0], #0x30]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                    "ldr d20, [%[b_ptr0], #0x40]\n"
-                    "ins v16.d[1], temploadreg0\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x50\n"
-                    "ins v17.d[1], temploadreg1\n"
-                    "ins v18.d[1], temploadreg2\n"
-                    "ins v19.d[1], temploadreg3\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
-                    "ins v20.d[1], temploadreg0\n"
+                    "ldr d16, [%[b_ptr0]]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr d17, [%[b_ptr0], #0x10]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "str q26, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     "movi v26.4s, #0\n"
+                    "ldr d18, [%[b_ptr0], #0x20]\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "str q27, [c_ptr3]\n"
-                    "movi v27.4s, #0\n"
                     "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+                    "movi v27.4s, #0\n"
+                    "ldr d19, [%[b_ptr0], #0x30]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "str q28, [c_ptr4]\n"
-                    "movi v28.4s, #0\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+                    "movi v28.4s, #0\n"
+                    "ldr d20, [%[b_ptr0], #0x40]\n"
+                    "ins v16.d[1], temploadreg0\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "str q29, [c_ptr5]\n"
-                    "movi v29.4s, #0\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
-                    ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+                    "movi v29.4s, #0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
-                    ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+                    "ins v17.d[1], temploadreg1\n"
+                    ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+                    "ins v18.d[1], temploadreg2\n"
+                    ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
+                    "ins v19.d[1], temploadreg3\n"
+                    ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+                    "ins v20.d[1], temploadreg0\n"
+                    ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+                    "add c_ptr6, c_ptr6, #0x10\n"
+                    ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x50\n"
                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
-                    "ldr d16, [%[b_ptr0]]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
-                    "ins v16.d[1], temploadreg0\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
-                    "ldr d17, [%[b_ptr0], #0x10]\n"
                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
-                    "ins v17.d[1], temploadreg1\n"
                     ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
                     ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
                     ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
                     ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
                     ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
-                    "ldr d18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
                     ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
                     ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
-                    "ins v18.d[1], temploadreg2\n"
                     ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
                     ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
                     ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
                     ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
                     ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
-                    "ldr d19, [%[b_ptr0], #0x30]\n"
                     ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
                     ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
                     ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
-                    "ins v19.d[1], temploadreg3\n"
                     ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
                     ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
                     ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
                     ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
                     ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
-                    "ldr d20, [%[b_ptr0], #0x40]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x50\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
-                    "ins v20.d[1], temploadreg0\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "ldr q20, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
                     "str q27, [c_ptr3]\n"
@@ -1617,8 +1686,9 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     "movi v31.4s, #0\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
-                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x50\n"
                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
                     ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
                     ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
@@ -1650,23 +1720,66 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
                     ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
                     ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+                    ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+                    ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
+                    ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+                    ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
+                    ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
+                    ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
+                    ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
+                    ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
+                    ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+                    ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
+                    ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
+                    ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
+                    ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
+                    ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
+                    ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
+                    ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
+                    ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+                    ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
+                    ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
+                    ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
+                    ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
+                    ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
+                    ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
+                    ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
+                    ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
+                    ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
+                    ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
+                    ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
+                    ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
+                    ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
+                    ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
+                    ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -1753,34 +1866,42 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q2, [a_ptr1], #0x10\n"
                     "ldr q4, [a_ptr2], #0x10\n"
                     "ldr q6, [a_ptr3], #0x10\n"
-                    "ldr q8, [a_ptr4], #0x10\n"
-                    "ldr q10, [a_ptr5], #0x10\n"
-                    "ldr q12, [a_ptr6], #0x10\n"
-                    "ldr q14, [a_ptr7], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr d1, [%[a_ptr0]]\n"
+                    "ldr q8, [a_ptr4], #0x10\n"
                     "ldr d3, [a_ptr1]\n"
+                    "ldr q10, [a_ptr5], #0x10\n"
                     "ldr d5, [a_ptr2]\n"
+                    "ldr q12, [a_ptr6], #0x10\n"
                     "ldr d7, [a_ptr3]\n"
+                    "ldr q14, [a_ptr7], #0x10\n"
                     "ldr d9, [a_ptr4]\n"
                     "ldr d11, [a_ptr5]\n"
                     "ldr d13, [a_ptr6]\n"
                     "ldr d15, [a_ptr7]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "ldr q2, [a_ptr1], #0x10\n"
+                    "ldr q4, [a_ptr2], #0x10\n"
                     "ldr s1, [%[a_ptr0]], #0x4\n"
+                    "ldr q6, [a_ptr3], #0x10\n"
                     "ldr s3, [a_ptr1], #0x4\n"
+                    "ldr q8, [a_ptr4], #0x10\n"
                     "ldr s5, [a_ptr2], #0x4\n"
+                    "ldr q10, [a_ptr5], #0x10\n"
                     "ldr s7, [a_ptr3], #0x4\n"
+                    "ldr q12, [a_ptr6], #0x10\n"
                     "ldr s9, [a_ptr4], #0x4\n"
+                    "ldr q14, [a_ptr7], #0x10\n"
                     "ldr s11, [a_ptr5], #0x4\n"
                     "ldr s13, [a_ptr6], #0x4\n"
                     "ldr s15, [a_ptr7], #0x4\n"
-                    "subs %[odds], %[odds], #0x1\n"
                     "b.ne 4f\n"
                     "ld1 {v1.b}[4], [%[a_ptr0]]\n"
                     "ld1 {v3.b}[4], [a_ptr1]\n"
@@ -1813,32 +1934,34 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     "ld1 {v13.b}[6], [a_ptr6]\n"
                     "ld1 {v15.b}[6], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "movi v26.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ldr q19, [%[b_ptr0], #0x30]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ldr q20, [%[b_ptr0], #0x40]\n"
-                    "movi v29.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ldr q21, [%[b_ptr0], #0x50]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x60\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x60\n"
                     ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
@@ -1882,146 +2005,132 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
                     ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
                     ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    "ldr d17, [%[b_ptr0], #0x10]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    "ldr d18, [%[b_ptr0], #0x20]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                    "ldr d19, [%[b_ptr0], #0x30]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                    "ldr d20, [%[b_ptr0], #0x40]\n"
-                    "ins v16.d[1], temploadreg0\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                    "ldr d21, [%[b_ptr0], #0x50]\n"
-                    "ins v17.d[1], temploadreg1\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x60\n"
-                    "ins v18.d[1], temploadreg2\n"
-                    "ins v19.d[1], temploadreg3\n"
-                    "ins v20.d[1], temploadreg0\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
-                    "ins v21.d[1], temploadreg1\n"
+                    "ldr d16, [%[b_ptr0]]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr d17, [%[b_ptr0], #0x10]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "str q26, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     "movi v26.4s, #0\n"
+                    "ldr d18, [%[b_ptr0], #0x20]\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "str q27, [c_ptr3]\n"
-                    "movi v27.4s, #0\n"
                     "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+                    "movi v27.4s, #0\n"
+                    "ldr d19, [%[b_ptr0], #0x30]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "str q28, [c_ptr4]\n"
-                    "movi v28.4s, #0\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+                    "movi v28.4s, #0\n"
+                    "ldr d20, [%[b_ptr0], #0x40]\n"
+                    "ins v16.d[1], temploadreg0\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "str q29, [c_ptr5]\n"
-                    "movi v29.4s, #0\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
-                    ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+                    "movi v29.4s, #0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
-                    ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+                    "ldr d21, [%[b_ptr0], #0x50]\n"
+                    ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+                    "ins v17.d[1], temploadreg1\n"
+                    ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+                    "ins v18.d[1], temploadreg2\n"
+                    ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+                    "ins v19.d[1], temploadreg3\n"
+                    ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+                    "ins v20.d[1], temploadreg0\n"
                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ins v21.d[1], temploadreg1\n"
                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
-                    "ldr d16, [%[b_ptr0]]\n"
+                    "add c_ptr6, c_ptr6, #0x10\n"
                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
-                    "ins v16.d[1], temploadreg0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x60\n"
                     ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
-                    "ldr d17, [%[b_ptr0], #0x10]\n"
                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
-                    "ins v17.d[1], temploadreg1\n"
                     ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
                     ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
                     ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
                     ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
                     ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
-                    "ldr d18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
                     ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
                     ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
-                    "ins v18.d[1], temploadreg2\n"
                     ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
                     ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
                     ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
                     ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
                     ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
-                    "ldr d19, [%[b_ptr0], #0x30]\n"
                     ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
                     ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
                     ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
-                    "ins v19.d[1], temploadreg3\n"
                     ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
                     ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
                     ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
                     ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
                     ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
-                    "ldr d20, [%[b_ptr0], #0x40]\n"
                     ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
                     ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
                     ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
-                    "ins v20.d[1], temploadreg0\n"
                     ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
                     ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
                     ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
                     ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
                     ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
-                    "ldr d21, [%[b_ptr0], #0x50]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x60\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
-                    "ins v21.d[1], temploadreg1\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "ldr q20, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
                     "str q27, [c_ptr3]\n"
                     "movi v27.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q21, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
@@ -2038,7 +2147,7 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     "movi v31.4s, #0\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
-                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x60\n"
                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
                     ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
@@ -2079,23 +2188,74 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
                     ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
                     ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+                    ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+                    ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
+                    ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+                    ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
+                    ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
+                    ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
+                    ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
+                    ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
+                    ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+                    ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
+                    ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
+                    ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
+                    ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
+                    ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
+                    ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
+                    ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
+                    ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+                    ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
+                    ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
+                    ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
+                    ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
+                    ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
+                    ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
+                    ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
+                    ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
+                    ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
+                    ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
+                    ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
+                    ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
+                    ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
+                    ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
+                    ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+                    ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
+                    ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
+                    ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
+                    ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
+                    ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
+                    ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
+                    ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
+                    ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -2242,34 +2402,36 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     "ld1 {v13.b}[10], [a_ptr6]\n"
                     "ld1 {v15.b}[10], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "movi v26.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ldr q19, [%[b_ptr0], #0x30]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ldr q20, [%[b_ptr0], #0x40]\n"
-                    "movi v29.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ldr q21, [%[b_ptr0], #0x50]\n"
-                    "movi v30.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ldr q22, [%[b_ptr0], #0x60]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
@@ -2320,178 +2482,162 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
                     ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
                     ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    "ldr d17, [%[b_ptr0], #0x10]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    "ldr d18, [%[b_ptr0], #0x20]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                    "ldr d19, [%[b_ptr0], #0x30]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                    "ldr d20, [%[b_ptr0], #0x40]\n"
-                    "ins v16.d[1], temploadreg0\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                    "ldr d21, [%[b_ptr0], #0x50]\n"
-                    "ins v17.d[1], temploadreg1\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                    "ldr d22, [%[b_ptr0], #0x60]\n"
-                    "ins v18.d[1], temploadreg2\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
-                    "ins v19.d[1], temploadreg3\n"
-                    "ins v20.d[1], temploadreg0\n"
-                    "ins v21.d[1], temploadreg1\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
-                    "ins v22.d[1], temploadreg2\n"
+                    "ldr d16, [%[b_ptr0]]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr d17, [%[b_ptr0], #0x10]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "str q26, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     "movi v26.4s, #0\n"
+                    "ldr d18, [%[b_ptr0], #0x20]\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "str q27, [c_ptr3]\n"
-                    "movi v27.4s, #0\n"
                     "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+                    "movi v27.4s, #0\n"
+                    "ldr d19, [%[b_ptr0], #0x30]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "str q28, [c_ptr4]\n"
-                    "movi v28.4s, #0\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+                    "movi v28.4s, #0\n"
+                    "ldr d20, [%[b_ptr0], #0x40]\n"
+                    "ins v16.d[1], temploadreg0\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "str q29, [c_ptr5]\n"
-                    "movi v29.4s, #0\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
-                    ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+                    "movi v29.4s, #0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
-                    ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+                    "ldr d21, [%[b_ptr0], #0x50]\n"
+                    ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+                    "ins v17.d[1], temploadreg1\n"
+                    ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+                    "ldr d22, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+                    "ins v18.d[1], temploadreg2\n"
+                    ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ins v19.d[1], temploadreg3\n"
                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
-                    "ldr d16, [%[b_ptr0]]\n"
+                    "ins v20.d[1], temploadreg0\n"
                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "ins v21.d[1], temploadreg1\n"
                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
-                    "ins v16.d[1], temploadreg0\n"
+                    "ins v22.d[1], temploadreg2\n"
                     ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "add c_ptr6, c_ptr6, #0x10\n"
                     ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
                     ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
-                    "ldr d17, [%[b_ptr0], #0x10]\n"
-                    ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
                     "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+                    ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
-                    "ins v17.d[1], temploadreg1\n"
                     ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
                     ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
                     ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
                     ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
                     ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
-                    "ldr d18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
                     ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
                     ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
-                    "ins v18.d[1], temploadreg2\n"
                     ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                     ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
                     ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
                     ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
                     ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
-                    "ldr d19, [%[b_ptr0], #0x30]\n"
                     ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
                     ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
                     ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
-                    "ins v19.d[1], temploadreg3\n"
                     ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
                     ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
                     ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
                     ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
                     ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
-                    "ldr d20, [%[b_ptr0], #0x40]\n"
                     ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
                     ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
                     ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
-                    "ins v20.d[1], temploadreg0\n"
                     ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
                     ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
                     ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
                     ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
                     ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
-                    "ldr d21, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
                     ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
                     ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
-                    "ins v21.d[1], temploadreg1\n"
                     ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
                     ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
                     ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
                     ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
                     ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
-                    "ldr d22, [%[b_ptr0], #0x60]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
-                    "ins v22.d[1], temploadreg2\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "ldr q20, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
                     "str q27, [c_ptr3]\n"
                     "movi v27.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q21, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+                    "ldr q22, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
                     ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
                     "str q29, [c_ptr5]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
+                    "add c_ptr6, c_ptr6, #0x10\n"
                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
-                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
                     ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
                     ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
@@ -2539,23 +2685,82 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
                     ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
                     ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+                    ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+                    ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
+                    ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+                    ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
+                    ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
+                    ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
+                    ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
+                    ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
+                    ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+                    ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
+                    ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
+                    ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
+                    ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
+                    ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
+                    ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
+                    ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
+                    ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+                    ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
+                    ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
+                    ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
+                    ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
+                    ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
+                    ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
+                    ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
+                    ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
+                    ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
+                    ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
+                    ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
+                    ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
+                    ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
+                    ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
+                    ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+                    ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
+                    ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
+                    ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
+                    ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
+                    ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
+                    ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
+                    ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
+                    ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
+                    ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
+                    ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
+                    ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
+                    ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
+                    ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
+                    ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
+                    ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
+                    ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -2643,6 +2848,7 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q2, [a_ptr1], #0x10\n"
                     "ldr q4, [a_ptr2], #0x10\n"
@@ -2651,7 +2857,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     "ldr q10, [a_ptr5], #0x10\n"
                     "ldr q12, [a_ptr6], #0x10\n"
                     "ldr q14, [a_ptr7], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr q1, [%[a_ptr0]]\n"
                     "ldr q3, [a_ptr1]\n"
                     "ldr q5, [a_ptr2]\n"
@@ -2662,15 +2867,24 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     "ldr q15, [a_ptr7]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "ldr q2, [a_ptr1], #0x10\n"
+                    "ldr q4, [a_ptr2], #0x10\n"
                     "ldr d1, [%[a_ptr0]], #0x8\n"
+                    "ldr q6, [a_ptr3], #0x10\n"
                     "ldr d3, [a_ptr1], #0x8\n"
+                    "ldr q8, [a_ptr4], #0x10\n"
                     "ldr d5, [a_ptr2], #0x8\n"
+                    "ldr q10, [a_ptr5], #0x10\n"
                     "ldr d7, [a_ptr3], #0x8\n"
+                    "ldr q12, [a_ptr6], #0x10\n"
                     "ldr d9, [a_ptr4], #0x8\n"
+                    "ldr q14, [a_ptr7], #0x10\n"
                     "ldr d11, [a_ptr5], #0x8\n"
                     "ldr d13, [a_ptr6], #0x8\n"
-                    "ldr d15, [a_ptr7], #0x8\n"
                     "ld1 {v1.s}[2], [%[a_ptr0]], #4\n"
+                    "ldr d15, [a_ptr7], #0x8\n"
                     "ld1 {v3.s}[2], [a_ptr1], #4\n"
                     "ld1 {v5.s}[2], [a_ptr2], #4\n"
                     "ld1 {v7.s}[2], [a_ptr3], #4\n"
@@ -2678,7 +2892,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     "ld1 {v11.s}[2], [a_ptr5], #4\n"
                     "ld1 {v13.s}[2], [a_ptr6], #4\n"
                     "ld1 {v15.s}[2], [a_ptr7], #4\n"
-                    "subs %[odds], %[odds], #0x1\n"
                     "b.ne 4f\n"
                     "ld1 {v1.b}[12], [%[a_ptr0]]\n"
                     "ld1 {v3.b}[12], [a_ptr1]\n"
@@ -2711,36 +2924,38 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     "ld1 {v13.b}[14], [a_ptr6]\n"
                     "ld1 {v15.b}[14], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "movi v26.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ldr q19, [%[b_ptr0], #0x30]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ldr q20, [%[b_ptr0], #0x40]\n"
-                    "movi v29.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ldr q21, [%[b_ptr0], #0x50]\n"
-                    "movi v30.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ldr q22, [%[b_ptr0], #0x60]\n"
-                    "movi v31.4s, #0\n"
                     "ldr q23, [%[b_ptr0], #0x70]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
@@ -2798,192 +3013,248 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
                     ".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
                     ".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    "ldr d17, [%[b_ptr0], #0x10]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    "ldr d18, [%[b_ptr0], #0x20]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                    "ldr d19, [%[b_ptr0], #0x30]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                    "ldr d20, [%[b_ptr0], #0x40]\n"
-                    "ins v16.d[1], temploadreg0\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                    "ldr d21, [%[b_ptr0], #0x50]\n"
-                    "ins v17.d[1], temploadreg1\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                    "ldr d22, [%[b_ptr0], #0x60]\n"
-                    "ins v18.d[1], temploadreg2\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                    "ldr d23, [%[b_ptr0], #0x70]\n"
-                    "ins v19.d[1], temploadreg3\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                    "ins v20.d[1], temploadreg0\n"
-                    "ins v21.d[1], temploadreg1\n"
-                    "ins v22.d[1], temploadreg2\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
-                    "ins v23.d[1], temploadreg3\n"
+                    "ldr d16, [%[b_ptr0]]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr d17, [%[b_ptr0], #0x10]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "str q26, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     "movi v26.4s, #0\n"
+                    "ldr d18, [%[b_ptr0], #0x20]\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "str q27, [c_ptr3]\n"
-                    "movi v27.4s, #0\n"
                     "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+                    "movi v27.4s, #0\n"
+                    "ldr d19, [%[b_ptr0], #0x30]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "str q28, [c_ptr4]\n"
-                    "movi v28.4s, #0\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+                    "movi v28.4s, #0\n"
+                    "ldr d20, [%[b_ptr0], #0x40]\n"
+                    "ins v16.d[1], temploadreg0\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "str q29, [c_ptr5]\n"
-                    "movi v29.4s, #0\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
-                    ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+                    "movi v29.4s, #0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
-                    ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+                    "ldr d21, [%[b_ptr0], #0x50]\n"
+                    ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+                    "ins v17.d[1], temploadreg1\n"
+                    ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+                    "ldr d22, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+                    "ins v18.d[1], temploadreg2\n"
+                    ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ldr d23, [%[b_ptr0], #0x70]\n"
                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
-                    "ldr d16, [%[b_ptr0]]\n"
+                    "ins v19.d[1], temploadreg3\n"
                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
-                    "ins v16.d[1], temploadreg0\n"
+                    "ins v20.d[1], temploadreg0\n"
                     ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "ins v21.d[1], temploadreg1\n"
                     ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "ins v22.d[1], temploadreg2\n"
                     ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "ins v23.d[1], temploadreg3\n"
                     ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    "add c_ptr6, c_ptr6, #0x10\n"
                     ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
-                    "ldr d17, [%[b_ptr0], #0x10]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
-                    "ins v17.d[1], temploadreg1\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
                     "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
                     ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
                     ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
                     ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
-                    "ldr d18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
                     ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
                     ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
-                    "ins v18.d[1], temploadreg2\n"
                     ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                     ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
                     ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
                     ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
                     ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
-                    "ldr d19, [%[b_ptr0], #0x30]\n"
                     ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
                     ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
                     ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
-                    "ins v19.d[1], temploadreg3\n"
                     ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
                     ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
                     ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
                     ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
                     ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
-                    "ldr d20, [%[b_ptr0], #0x40]\n"
                     ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
                     ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
                     ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
-                    "ins v20.d[1], temploadreg0\n"
                     ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
                     ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
                     ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
                     ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
                     ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
-                    "ldr d21, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
                     ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
                     ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
-                    "ins v21.d[1], temploadreg1\n"
                     ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
                     ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
                     ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
                     ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
                     ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
-                    "ldr d22, [%[b_ptr0], #0x60]\n"
                     ".inst 0x4fa1eaf8 // sdot v24.4s, v23.16b, v1.4b[3]\n"
                     ".inst 0x4fa3eaf9 // sdot v25.4s, v23.16b, v3.4b[3]\n"
                     ".inst 0x4fa5eafa // sdot v26.4s, v23.16b, v5.4b[3]\n"
-                    "ins v22.d[1], temploadreg2\n"
                     ".inst 0x4fa7eafb // sdot v27.4s, v23.16b, v7.4b[3]\n"
                     ".inst 0x4fa9eafc // sdot v28.4s, v23.16b, v9.4b[3]\n"
                     ".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
                     ".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
                     ".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
-                    "ldr d23, [%[b_ptr0], #0x70]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
-                    "ins v23.d[1], temploadreg3\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "ldr q20, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
                     "str q27, [c_ptr3]\n"
                     "movi v27.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q21, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+                    "ldr q22, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x70]\n"
                     ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
                     "str q29, [c_ptr5]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
+                    "add c_ptr6, c_ptr6, #0x10\n"
+                    ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
+                    ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
+                    ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
+                    ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
+                    ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
+                    ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+                    ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
+                    ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
+                    ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
+                    ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
+                    ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
+                    ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
+                    ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
+                    ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+                    ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
+                    ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
+                    ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
+                    ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
+                    ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
+                    ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
+                    ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
+                    ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
+                    ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
+                    ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
+                    ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
+                    ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
+                    ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
+                    ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
+                    ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+                    ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
+                    ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
+                    ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
+                    ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
+                    ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
+                    ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
+                    ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
+                    ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
+                    ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
+                    ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
+                    ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
+                    ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
+                    ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
+                    ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
+                    ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
+                    ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
+                    ".inst 0x4fa1eaf8 // sdot v24.4s, v23.16b, v1.4b[3]\n"
+                    ".inst 0x4fa3eaf9 // sdot v25.4s, v23.16b, v3.4b[3]\n"
+                    ".inst 0x4fa5eafa // sdot v26.4s, v23.16b, v5.4b[3]\n"
+                    ".inst 0x4fa7eafb // sdot v27.4s, v23.16b, v7.4b[3]\n"
+                    ".inst 0x4fa9eafc // sdot v28.4s, v23.16b, v9.4b[3]\n"
+                    ".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
+                    ".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
+                    ".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
+                    "b 9f\n"
+                    "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+                    ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
-                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
                     ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
                     ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
@@ -3039,23 +3310,16 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
                     ".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
                     ".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
                     ".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
-                    "6:\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/generic.cpp
index c94e975754..7fcf853d2e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,7 +33,7 @@
 
 namespace arm_gemm {
 
-void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
+void a64_smallK_hybrid_s8s32_dot_8x4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
     const long loops_count = iceildiv(N, (int)4) - 1;
     const long ldab = lda * sizeof(int8_t);
     const long ldcb = ldc * sizeof(int32_t);
@@ -153,22 +153,24 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     "ld1 {v6.b}[2], [a_ptr6]\n"
                     "ld1 {v7.b}[2], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
                     "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
-                    "movi v26.4s, #0\n"
                     "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
-                    "movi v27.4s, #0\n"
                     "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
-                    "movi v28.4s, #0\n"
                     "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
-                    "movi v29.4s, #0\n"
                     "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
-                    "movi v30.4s, #0\n"
                     "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
-                    "movi v31.4s, #0\n"
                     "add %[b_ptr0], %[b_ptr0], #0x10\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
                     ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
@@ -177,20 +179,17 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
                     ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
+                    "ldr q16, [%[b_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
@@ -216,10 +215,9 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     "movi v31.4s, #0\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
-                    "ldr q16, [%[b_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
@@ -231,6 +229,8 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
@@ -260,23 +260,34 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+                    ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+                    ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+                    ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -407,24 +418,26 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     "ld1 {v6.b}[6], [a_ptr6]\n"
                     "ld1 {v7.b}[6], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
                     "movi v26.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "movi v27.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "movi v28.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
                     ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
                     ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
@@ -440,68 +453,66 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
                     ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
                     ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
+                    "ldr q16, [%[b_ptr0]]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    "str q25, [c_ptr1]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v25.4s, #0\n"
+                    "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v25.4s, #0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
                     "str q27, [c_ptr3]\n"
                     "movi v27.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
                     ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
                     "str q29, [c_ptr5]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "add c_ptr6, c_ptr6, #0x10\n"
                     ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
-                    ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
+                    ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
-                    "ldr q16, [%[b_ptr0]]\n"
-                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
-                    ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
-                    ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
+                    ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
-                    ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
+                    ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
-                    ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
-                    ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
+                    ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
-                    ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+                    ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
                     ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
+                    "ldr q16, [%[b_ptr0]]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
                     "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q25, [c_ptr1]\n"
@@ -541,23 +552,42 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
                     ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
                     ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+                    ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+                    ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+                    ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+                    ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
+                    ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
+                    ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
+                    ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
+                    ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -688,26 +718,28 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     "ld1 {v6.b}[10], [a_ptr6]\n"
                     "ld1 {v7.b}[10], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "movi v26.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ldr q18, [%[b_ptr0], #0x20]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
                     "movi v27.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "movi v28.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
                     ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
                     ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
                     ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
@@ -730,49 +762,46 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
                     ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
                     ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
-                    "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
-                    "str q25, [c_ptr1]\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v25.4s, #0\n"
+                    "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
                     "str q27, [c_ptr3]\n"
                     "movi v27.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
                     ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
                     "str q29, [c_ptr5]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "add c_ptr6, c_ptr6, #0x10\n"
                     ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
-                    ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
+                    ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
-                    "ldr q16, [%[b_ptr0]]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
@@ -788,7 +817,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
-                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
                     ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
                     ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
@@ -802,11 +830,12 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
-                    "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
@@ -832,8 +861,9 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     "movi v31.4s, #0\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
-                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                     ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
                     ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
                     ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
@@ -849,23 +879,50 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
                     ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
                     ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+                    ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+                    ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+                    ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+                    ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
+                    ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
+                    ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
+                    ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
+                    ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
+                    ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+                    ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
+                    ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
+                    ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
+                    ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
+                    ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
+                    ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
+                    ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -1004,28 +1061,30 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     "ld1 {v6.b}[14], [a_ptr6]\n"
                     "ld1 {v7.b}[14], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "movi v26.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
                     "movi v28.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
                     ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
                     ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
                     ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
@@ -1055,50 +1114,47 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
                     ".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
                     ".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
-                    "ldr q19, [%[b_ptr0], #0x30]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                    "str q25, [c_ptr1]\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v25.4s, #0\n"
+                    "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
                     "str q27, [c_ptr3]\n"
                     "movi v27.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
                     ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
                     "str q29, [c_ptr5]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "add c_ptr6, c_ptr6, #0x10\n"
                     ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
-                    ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
+                    ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
-                    "ldr q16, [%[b_ptr0]]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
@@ -1114,7 +1170,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
-                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
                     ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
                     ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
@@ -1123,7 +1178,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
                     ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
                     ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
-                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
                     ".inst 0x4fa1ea79 // sdot v25.4s, v19.16b, v1.4b[3]\n"
                     ".inst 0x4fa2ea7a // sdot v26.4s, v19.16b, v2.4b[3]\n"
@@ -1137,14 +1191,16 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
-                    "ldr q19, [%[b_ptr0], #0x30]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
                     "str q27, [c_ptr3]\n"
@@ -1167,8 +1223,9 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     "movi v31.4s, #0\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
-                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                     ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
                     ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
                     ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
@@ -1192,23 +1249,58 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
                     ".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
                     ".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+                    ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+                    ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+                    ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+                    ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
+                    ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
+                    ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
+                    ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
+                    ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
+                    ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+                    ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
+                    ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
+                    ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
+                    ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
+                    ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
+                    ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
+                    ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
+                    ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+                    ".inst 0x4fa1ea79 // sdot v25.4s, v19.16b, v1.4b[3]\n"
+                    ".inst 0x4fa2ea7a // sdot v26.4s, v19.16b, v2.4b[3]\n"
+                    ".inst 0x4fa3ea7b // sdot v27.4s, v19.16b, v3.4b[3]\n"
+                    ".inst 0x4fa4ea7c // sdot v28.4s, v19.16b, v4.4b[3]\n"
+                    ".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
+                    ".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
+                    ".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -1287,26 +1379,34 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q2, [a_ptr1], #0x10\n"
                     "ldr q4, [a_ptr2], #0x10\n"
                     "ldr q6, [a_ptr3], #0x10\n"
-                    "ldr q8, [a_ptr4], #0x10\n"
-                    "ldr q10, [a_ptr5], #0x10\n"
-                    "ldr q12, [a_ptr6], #0x10\n"
-                    "ldr q14, [a_ptr7], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr s1, [%[a_ptr0]]\n"
+                    "ldr q8, [a_ptr4], #0x10\n"
                     "ldr s3, [a_ptr1]\n"
+                    "ldr q10, [a_ptr5], #0x10\n"
                     "ldr s5, [a_ptr2]\n"
+                    "ldr q12, [a_ptr6], #0x10\n"
                     "ldr s7, [a_ptr3]\n"
+                    "ldr q14, [a_ptr7], #0x10\n"
                     "ldr s9, [a_ptr4]\n"
                     "ldr s11, [a_ptr5]\n"
                     "ldr s13, [a_ptr6]\n"
                     "ldr s15, [a_ptr7]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
                     "subs %[odds], %[odds], #0x1\n"
+                    "ldr q2, [a_ptr1], #0x10\n"
+                    "ldr q4, [a_ptr2], #0x10\n"
+                    "ldr q6, [a_ptr3], #0x10\n"
+                    "ldr q8, [a_ptr4], #0x10\n"
+                    "ldr q10, [a_ptr5], #0x10\n"
+                    "ldr q12, [a_ptr6], #0x10\n"
+                    "ldr q14, [a_ptr7], #0x10\n"
                     "b.ne 4f\n"
                     "ldr b1, [%[a_ptr0]]\n"
                     "ldr b3, [a_ptr1]\n"
@@ -1339,30 +1439,32 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     "ld1 {v13.b}[2], [a_ptr6]\n"
                     "ld1 {v15.b}[2], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "movi v26.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ldr q19, [%[b_ptr0], #0x30]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ldr q20, [%[b_ptr0], #0x40]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x50\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x50\n"
                     ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
                     ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
@@ -1399,51 +1501,48 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
                     ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
                     ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "ldr q19, [%[b_ptr0], #0x30]\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
-                    "ldr q20, [%[b_ptr0], #0x40]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x50\n"
-                    "str q25, [c_ptr1]\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v25.4s, #0\n"
+                    "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "ldr q20, [%[b_ptr0], #0x40]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
                     "str q27, [c_ptr3]\n"
                     "movi v27.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
                     "str q29, [c_ptr5]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "add c_ptr6, c_ptr6, #0x10\n"
                     ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
-                    ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
+                    ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x50\n"
                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
-                    "ldr q16, [%[b_ptr0]]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
@@ -1459,7 +1558,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
-                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
                     ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
                     ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
@@ -1468,7 +1566,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
                     ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
                     ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
-                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
                     ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
                     ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
@@ -1477,7 +1574,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
                     ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
                     ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
-                    "ldr q19, [%[b_ptr0], #0x30]\n"
                     ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
                     ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
                     ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
@@ -1491,14 +1587,17 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
-                    "ldr q20, [%[b_ptr0], #0x40]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x50\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "ldr q20, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
                     "str q27, [c_ptr3]\n"
@@ -1521,8 +1620,9 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     "movi v31.4s, #0\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
-                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x50\n"
                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
                     ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
                     ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
@@ -1554,23 +1654,66 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
                     ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
                     ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+                    ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+                    ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
+                    ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+                    ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
+                    ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
+                    ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
+                    ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
+                    ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
+                    ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+                    ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
+                    ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
+                    ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
+                    ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
+                    ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
+                    ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
+                    ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
+                    ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+                    ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
+                    ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
+                    ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
+                    ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
+                    ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
+                    ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
+                    ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
+                    ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
+                    ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
+                    ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
+                    ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
+                    ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
+                    ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
+                    ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
+                    ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -1649,34 +1792,42 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q2, [a_ptr1], #0x10\n"
                     "ldr q4, [a_ptr2], #0x10\n"
                     "ldr q6, [a_ptr3], #0x10\n"
-                    "ldr q8, [a_ptr4], #0x10\n"
-                    "ldr q10, [a_ptr5], #0x10\n"
-                    "ldr q12, [a_ptr6], #0x10\n"
-                    "ldr q14, [a_ptr7], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr d1, [%[a_ptr0]]\n"
+                    "ldr q8, [a_ptr4], #0x10\n"
                     "ldr d3, [a_ptr1]\n"
+                    "ldr q10, [a_ptr5], #0x10\n"
                     "ldr d5, [a_ptr2]\n"
+                    "ldr q12, [a_ptr6], #0x10\n"
                     "ldr d7, [a_ptr3]\n"
+                    "ldr q14, [a_ptr7], #0x10\n"
                     "ldr d9, [a_ptr4]\n"
                     "ldr d11, [a_ptr5]\n"
                     "ldr d13, [a_ptr6]\n"
                     "ldr d15, [a_ptr7]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "ldr q2, [a_ptr1], #0x10\n"
+                    "ldr q4, [a_ptr2], #0x10\n"
                     "ldr s1, [%[a_ptr0]], #0x4\n"
+                    "ldr q6, [a_ptr3], #0x10\n"
                     "ldr s3, [a_ptr1], #0x4\n"
+                    "ldr q8, [a_ptr4], #0x10\n"
                     "ldr s5, [a_ptr2], #0x4\n"
+                    "ldr q10, [a_ptr5], #0x10\n"
                     "ldr s7, [a_ptr3], #0x4\n"
+                    "ldr q12, [a_ptr6], #0x10\n"
                     "ldr s9, [a_ptr4], #0x4\n"
+                    "ldr q14, [a_ptr7], #0x10\n"
                     "ldr s11, [a_ptr5], #0x4\n"
                     "ldr s13, [a_ptr6], #0x4\n"
                     "ldr s15, [a_ptr7], #0x4\n"
-                    "subs %[odds], %[odds], #0x1\n"
                     "b.ne 4f\n"
                     "ld1 {v1.b}[4], [%[a_ptr0]]\n"
                     "ld1 {v3.b}[4], [a_ptr1]\n"
@@ -1709,32 +1860,34 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     "ld1 {v13.b}[6], [a_ptr6]\n"
                     "ld1 {v15.b}[6], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "movi v26.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ldr q19, [%[b_ptr0], #0x30]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ldr q20, [%[b_ptr0], #0x40]\n"
-                    "movi v29.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ldr q21, [%[b_ptr0], #0x50]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x60\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x60\n"
                     ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
@@ -1778,68 +1931,64 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
                     ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
                     ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "ldr q19, [%[b_ptr0], #0x30]\n"
-                    "ldr q20, [%[b_ptr0], #0x40]\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
-                    "ldr q21, [%[b_ptr0], #0x50]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x60\n"
-                    "str q25, [c_ptr1]\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v25.4s, #0\n"
+                    "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "ldr q20, [%[b_ptr0], #0x40]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
                     "str q27, [c_ptr3]\n"
                     "movi v27.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "ldr q21, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
                     "str q29, [c_ptr5]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "add c_ptr6, c_ptr6, #0x10\n"
                     ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
-                    ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
+                    ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x60\n"
                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
-                    "ldr q16, [%[b_ptr0]]\n"
-                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
-                    "ldr q17, [%[b_ptr0], #0x10]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
                     ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
                     ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
@@ -1848,7 +1997,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
                     ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
                     ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
-                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
                     ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
                     ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
@@ -1857,7 +2005,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
                     ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
                     ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
-                    "ldr q19, [%[b_ptr0], #0x30]\n"
                     ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
                     ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
                     ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
@@ -1866,7 +2013,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
                     ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
                     ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
-                    "ldr q20, [%[b_ptr0], #0x40]\n"
                     ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
                     ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
                     ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
@@ -1880,20 +2026,25 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
-                    "ldr q21, [%[b_ptr0], #0x50]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x60\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "ldr q20, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
                     "str q27, [c_ptr3]\n"
                     "movi v27.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q21, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
@@ -1910,7 +2061,7 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     "movi v31.4s, #0\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
-                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x60\n"
                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
                     ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
@@ -1951,23 +2102,74 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
                     ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
                     ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+                    ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+                    ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
+                    ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+                    ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
+                    ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
+                    ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
+                    ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
+                    ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
+                    ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+                    ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
+                    ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
+                    ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
+                    ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
+                    ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
+                    ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
+                    ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
+                    ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+                    ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
+                    ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
+                    ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
+                    ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
+                    ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
+                    ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
+                    ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
+                    ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
+                    ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
+                    ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
+                    ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
+                    ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
+                    ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
+                    ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
+                    ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+                    ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
+                    ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
+                    ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
+                    ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
+                    ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
+                    ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
+                    ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
+                    ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -2106,34 +2308,36 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     "ld1 {v13.b}[10], [a_ptr6]\n"
                     "ld1 {v15.b}[10], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "movi v26.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ldr q19, [%[b_ptr0], #0x30]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ldr q20, [%[b_ptr0], #0x40]\n"
-                    "movi v29.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ldr q21, [%[b_ptr0], #0x50]\n"
-                    "movi v30.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ldr q22, [%[b_ptr0], #0x60]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
@@ -2184,34 +2388,31 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
                     ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
                     ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "ldr q19, [%[b_ptr0], #0x30]\n"
-                    "ldr q20, [%[b_ptr0], #0x40]\n"
-                    "ldr q21, [%[b_ptr0], #0x50]\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
-                    "ldr q22, [%[b_ptr0], #0x60]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
-                    "str q25, [c_ptr1]\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v25.4s, #0\n"
+                    "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "ldr q20, [%[b_ptr0], #0x40]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
                     "str q27, [c_ptr3]\n"
                     "movi v27.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "ldr q21, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+                    "ldr q22, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
                     "add c_ptr3, c_ptr3, #0x10\n"
@@ -2230,24 +2431,23 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
-                    "ldr q16, [%[b_ptr0]]\n"
-                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
-                    "ldr q17, [%[b_ptr0], #0x10]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
                     ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
                     ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
@@ -2255,7 +2455,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
                     ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
                     ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
-                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
                     ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
                     ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
@@ -2264,7 +2463,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
                     ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
                     ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
-                    "ldr q19, [%[b_ptr0], #0x30]\n"
                     ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
                     ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
                     ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
@@ -2273,7 +2471,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
                     ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
                     ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
-                    "ldr q20, [%[b_ptr0], #0x40]\n"
                     ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
                     ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
                     ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
@@ -2282,7 +2479,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
                     ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
                     ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
-                    "ldr q21, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
                     ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
                     ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
@@ -2296,38 +2492,44 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
-                    "ldr q22, [%[b_ptr0], #0x60]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "ldr q20, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
                     "str q27, [c_ptr3]\n"
                     "movi v27.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q21, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+                    "ldr q22, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
                     ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
                     "str q29, [c_ptr5]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
+                    "add c_ptr6, c_ptr6, #0x10\n"
                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
-                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
                     ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
                     ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
@@ -2375,23 +2577,82 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
                     ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
                     ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+                    ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+                    ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
+                    ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+                    ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
+                    ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
+                    ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
+                    ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
+                    ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
+                    ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+                    ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
+                    ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
+                    ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
+                    ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
+                    ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
+                    ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
+                    ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
+                    ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+                    ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
+                    ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
+                    ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
+                    ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
+                    ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
+                    ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
+                    ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
+                    ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
+                    ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
+                    ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
+                    ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
+                    ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
+                    ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
+                    ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
+                    ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+                    ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
+                    ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
+                    ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
+                    ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
+                    ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
+                    ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
+                    ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
+                    ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
+                    ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
+                    ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
+                    ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
+                    ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
+                    ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
+                    ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
+                    ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
+                    ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -2471,6 +2732,7 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q2, [a_ptr1], #0x10\n"
                     "ldr q4, [a_ptr2], #0x10\n"
@@ -2479,7 +2741,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     "ldr q10, [a_ptr5], #0x10\n"
                     "ldr q12, [a_ptr6], #0x10\n"
                     "ldr q14, [a_ptr7], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr q1, [%[a_ptr0]]\n"
                     "ldr q3, [a_ptr1]\n"
                     "ldr q5, [a_ptr2]\n"
@@ -2490,15 +2751,24 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     "ldr q15, [a_ptr7]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "ldr q2, [a_ptr1], #0x10\n"
+                    "ldr q4, [a_ptr2], #0x10\n"
                     "ldr d1, [%[a_ptr0]], #0x8\n"
+                    "ldr q6, [a_ptr3], #0x10\n"
                     "ldr d3, [a_ptr1], #0x8\n"
+                    "ldr q8, [a_ptr4], #0x10\n"
                     "ldr d5, [a_ptr2], #0x8\n"
+                    "ldr q10, [a_ptr5], #0x10\n"
                     "ldr d7, [a_ptr3], #0x8\n"
+                    "ldr q12, [a_ptr6], #0x10\n"
                     "ldr d9, [a_ptr4], #0x8\n"
+                    "ldr q14, [a_ptr7], #0x10\n"
                     "ldr d11, [a_ptr5], #0x8\n"
                     "ldr d13, [a_ptr6], #0x8\n"
-                    "ldr d15, [a_ptr7], #0x8\n"
                     "ld1 {v1.s}[2], [%[a_ptr0]], #4\n"
+                    "ldr d15, [a_ptr7], #0x8\n"
                     "ld1 {v3.s}[2], [a_ptr1], #4\n"
                     "ld1 {v5.s}[2], [a_ptr2], #4\n"
                     "ld1 {v7.s}[2], [a_ptr3], #4\n"
@@ -2506,7 +2776,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     "ld1 {v11.s}[2], [a_ptr5], #4\n"
                     "ld1 {v13.s}[2], [a_ptr6], #4\n"
                     "ld1 {v15.s}[2], [a_ptr7], #4\n"
-                    "subs %[odds], %[odds], #0x1\n"
                     "b.ne 4f\n"
                     "ld1 {v1.b}[12], [%[a_ptr0]]\n"
                     "ld1 {v3.b}[12], [a_ptr1]\n"
@@ -2539,36 +2808,38 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     "ld1 {v13.b}[14], [a_ptr6]\n"
                     "ld1 {v15.b}[14], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "movi v26.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ldr q19, [%[b_ptr0], #0x30]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ldr q20, [%[b_ptr0], #0x40]\n"
-                    "movi v29.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ldr q21, [%[b_ptr0], #0x50]\n"
-                    "movi v30.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ldr q22, [%[b_ptr0], #0x60]\n"
-                    "movi v31.4s, #0\n"
                     "ldr q23, [%[b_ptr0], #0x70]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
@@ -2626,39 +2897,37 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
                     ".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
                     ".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "ldr q19, [%[b_ptr0], #0x30]\n"
-                    "ldr q20, [%[b_ptr0], #0x40]\n"
-                    "ldr q21, [%[b_ptr0], #0x50]\n"
-                    "ldr q22, [%[b_ptr0], #0x60]\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
-                    "ldr q23, [%[b_ptr0], #0x70]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                    "str q25, [c_ptr1]\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v25.4s, #0\n"
+                    "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "ldr q20, [%[b_ptr0], #0x40]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
                     "str q27, [c_ptr3]\n"
                     "movi v27.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "ldr q21, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+                    "ldr q22, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x70]\n"
                     ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
                     "str q29, [c_ptr5]\n"
                     "movi v29.4s, #0\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
@@ -2673,32 +2942,29 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
-                    "ldr q16, [%[b_ptr0]]\n"
-                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
-                    ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
-                    "ldr q17, [%[b_ptr0], #0x10]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
                     ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
                     ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
                     ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
                     ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
                     ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
-                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
                     ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
                     ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
@@ -2707,7 +2973,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
                     ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
                     ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
-                    "ldr q19, [%[b_ptr0], #0x30]\n"
                     ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
                     ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
                     ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
@@ -2716,7 +2981,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
                     ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
                     ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
-                    "ldr q20, [%[b_ptr0], #0x40]\n"
                     ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
                     ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
                     ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
@@ -2725,7 +2989,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
                     ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
                     ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
-                    "ldr q21, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
                     ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
                     ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
@@ -2734,7 +2997,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
                     ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
                     ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
-                    "ldr q22, [%[b_ptr0], #0x60]\n"
                     ".inst 0x4fa1eaf8 // sdot v24.4s, v23.16b, v1.4b[3]\n"
                     ".inst 0x4fa3eaf9 // sdot v25.4s, v23.16b, v3.4b[3]\n"
                     ".inst 0x4fa5eafa // sdot v26.4s, v23.16b, v5.4b[3]\n"
@@ -2748,38 +3010,119 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
-                    "ldr q23, [%[b_ptr0], #0x70]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "ldr q20, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
                     "str q27, [c_ptr3]\n"
                     "movi v27.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q21, [%[b_ptr0], #0x50]\n"
                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+                    "ldr q22, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x70]\n"
                     ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
                     "str q29, [c_ptr5]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
+                    "add c_ptr6, c_ptr6, #0x10\n"
+                    ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
+                    ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
+                    ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
+                    ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
+                    ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
+                    ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+                    ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
+                    ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
+                    ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
+                    ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
+                    ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
+                    ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
+                    ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
+                    ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+                    ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
+                    ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
+                    ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
+                    ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
+                    ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
+                    ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
+                    ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
+                    ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
+                    ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
+                    ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
+                    ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
+                    ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
+                    ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
+                    ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
+                    ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+                    ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
+                    ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
+                    ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
+                    ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
+                    ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
+                    ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
+                    ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
+                    ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
+                    ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
+                    ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
+                    ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
+                    ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
+                    ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
+                    ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
+                    ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
+                    ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
+                    ".inst 0x4fa1eaf8 // sdot v24.4s, v23.16b, v1.4b[3]\n"
+                    ".inst 0x4fa3eaf9 // sdot v25.4s, v23.16b, v3.4b[3]\n"
+                    ".inst 0x4fa5eafa // sdot v26.4s, v23.16b, v5.4b[3]\n"
+                    ".inst 0x4fa7eafb // sdot v27.4s, v23.16b, v7.4b[3]\n"
+                    ".inst 0x4fa9eafc // sdot v28.4s, v23.16b, v9.4b[3]\n"
+                    ".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
+                    ".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
+                    ".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
+                    "b 9f\n"
+                    "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+                    ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
-                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
                     ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
                     ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
@@ -2835,23 +3178,16 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
                     ".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
                     ".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
                     ".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
-                    "6:\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp
index 76931db4dd..5d48a52d42 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp
@@ -31,10 +31,10 @@ namespace arm_gemm
 {
 
 // Actual kernel implementations
-void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
-void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+void a64_smallK_hybrid_u8u32_dot_6x4(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+void a64_smallK_hybrid_u8u32_dot_6x4_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
 
-class smallK_hybrid_u8u32_dot_4x6
+class cls_a64_smallK_hybrid_u8u32_dot_6x4
 {
 public:
     typedef uint8_t operand_type;
@@ -76,12 +76,12 @@ public:
     StdTransformsFixed<operand_type, result_type, 6, 4, 4> transforms = {};
 
     // Default to the generic kernel
-    kern_type kernel=a64_smallK_hybrid_u8u32_dot_4x6;
+    kern_type kernel=a64_smallK_hybrid_u8u32_dot_6x4;
 
-    smallK_hybrid_u8u32_dot_4x6(const CPUInfo *ci)
+    cls_a64_smallK_hybrid_u8u32_dot_6x4(const CPUInfo *ci)
     {
         if (ci->get_cpu_model() == CPUModel::A55r1) {
-            kernel = a64_smallK_hybrid_u8u32_dot_4x6_a55;
+            kernel = a64_smallK_hybrid_u8u32_dot_6x4_a55;
         }
     }
 };
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/a55.cpp
index 02894d8327..dddf4c5aa2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/a55.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,7 +33,7 @@
 
 namespace arm_gemm {
 
-void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) {
+void a64_smallK_hybrid_u8u32_dot_6x4_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) {
     const long loops_count = iceildiv(N, (int)4) - 1;
     const long ldab = lda * sizeof(uint8_t);
     const long ldcb = ldc * sizeof(uint32_t);
@@ -97,6 +97,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q3, [a_ptr1], #0x10\n"
                     "ldr q6, [a_ptr2], #0x10\n"
@@ -107,18 +108,29 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     "ldr q4, [a_ptr1], #0x10\n"
                     "ldr q7, [a_ptr2], #0x10\n"
                     "ldr q10, [a_ptr3], #0x10\n"
-                    "ldr q13, [a_ptr4], #0x10\n"
-                    "ldr q16, [a_ptr5], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr s2, [%[a_ptr0]]\n"
+                    "ldr q13, [a_ptr4], #0x10\n"
                     "ldr s5, [a_ptr1]\n"
+                    "ldr q16, [a_ptr5], #0x10\n"
                     "ldr s8, [a_ptr2]\n"
                     "ldr s11, [a_ptr3]\n"
                     "ldr s14, [a_ptr4]\n"
                     "ldr s17, [a_ptr5]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
                     "subs %[odds], %[odds], #0x1\n"
+                    "ldr q3, [a_ptr1], #0x10\n"
+                    "ldr q6, [a_ptr2], #0x10\n"
+                    "ldr q9, [a_ptr3], #0x10\n"
+                    "ldr q12, [a_ptr4], #0x10\n"
+                    "ldr q15, [a_ptr5], #0x10\n"
+                    "ldr q1, [%[a_ptr0]], #0x10\n"
+                    "ldr q4, [a_ptr1], #0x10\n"
+                    "ldr q7, [a_ptr2], #0x10\n"
+                    "ldr q10, [a_ptr3], #0x10\n"
+                    "ldr q13, [a_ptr4], #0x10\n"
+                    "ldr q16, [a_ptr5], #0x10\n"
                     "b.ne 4f\n"
                     "ldr b2, [%[a_ptr0]]\n"
                     "ldr b5, [a_ptr1]\n"
@@ -145,40 +157,42 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     "ld1 {v14.b}[2], [a_ptr4]\n"
                     "ld1 {v17.b}[2], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q18, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "ldr q20, [%[b_ptr0], #0x20]\n"
-                    "movi v29.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "ldr q21, [%[b_ptr0], #0x30]\n"
-                    "movi v30.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     "ldr q22, [%[b_ptr0], #0x40]\n"
-                    "movi v31.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     "ldr q23, [%[b_ptr0], #0x50]\n"
-                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     "ldr q24, [%[b_ptr0], #0x60]\n"
-                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     "ldr q25, [%[b_ptr0], #0x70]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                     ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                     ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
                     ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
                     ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
@@ -222,173 +236,219 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
                     ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
                     ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d18, [%[b_ptr0]]\n"
+                    "b.eq 7f\n"
+                    "8:\n"
+                    "str q26, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
+                    "movi v26.4s, #0\n"
+                    "ldr d18, [%[b_ptr0]]\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "str q27, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v27.4s, #0\n"
                     "ldr d19, [%[b_ptr0], #0x10]\n"
                     "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "str q28, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "movi v28.4s, #0\n"
                     "ldr d20, [%[b_ptr0], #0x20]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "str q29, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "movi v29.4s, #0\n"
                     "ldr d21, [%[b_ptr0], #0x30]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "str q30, [c_ptr4]\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "movi v30.4s, #0\n"
                     "ldr d22, [%[b_ptr0], #0x40]\n"
                     "ins v18.d[1], temploadreg2\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "str q31, [c_ptr5]\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "movi v31.4s, #0\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
+                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
                     "ldr d23, [%[b_ptr0], #0x50]\n"
+                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     "ins v19.d[1], temploadreg3\n"
+                    ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
                     "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
+                    ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
                     "ldr d24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
                     "ins v20.d[1], temploadreg0\n"
+                    ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
+                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     "ldr d25, [%[b_ptr0], #0x70]\n"
+                    ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
                     "ins v21.d[1], temploadreg1\n"
+                    ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
                     "ins v22.d[1], temploadreg2\n"
+                    ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
                     "ins v23.d[1], temploadreg3\n"
+                    ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
                     "ins v24.d[1], temploadreg0\n"
+                    ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
                     "ins v25.d[1], temploadreg1\n"
-                    "b.eq 7f\n"
-                    "8:\n"
-                    "str q26, [%[c_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "movi v26.4s, #0\n"
+                    ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+                    "ldr d18, [%[b_ptr0]]\n"
+                    ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+                    ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
+                    ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
+                    "ins v18.d[1], temploadreg2\n"
+                    ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
+                    ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
+                    ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
+                    ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
+                    ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
+                    ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
+                    ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
+                    ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
+                    ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
+                    ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
+                    ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
+                    ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
+                    ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
+                    ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
+                    ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
+                    ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
+                    ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+                    ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
+                    ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
+                    ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
+                    ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
+                    ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
+                    ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
+                    ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
+                    ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
+                    "b.ne 8b\n"
+                    "7:\n"
+                    "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
+                    "ldr q21, [%[b_ptr0], #0x30]\n"
+                    "ldr q22, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+                    "ldr q24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
-                    "ldr d18, [%[b_ptr0]]\n"
-                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
-                    ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
-                    "ins v18.d[1], temploadreg2\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                     ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
                     ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
-                    "ldr d19, [%[b_ptr0], #0x10]\n"
                     ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
                     ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
-                    "ins v19.d[1], temploadreg3\n"
                     ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
                     ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
-                    "ldr d20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
-                    "ins v20.d[1], temploadreg0\n"
                     ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
                     ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
                     ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
-                    "ldr d21, [%[b_ptr0], #0x30]\n"
                     ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
                     ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
                     ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
-                    "ins v21.d[1], temploadreg1\n"
                     ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
                     ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
                     ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
-                    "ldr d22, [%[b_ptr0], #0x40]\n"
                     ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
                     ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
                     ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
                     ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
                     ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
                     ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
-                    "ldr d23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
                     ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
                     ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
-                    "ins v23.d[1], temploadreg3\n"
                     ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
                     ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
                     ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
-                    "ldr d24, [%[b_ptr0], #0x60]\n"
                     ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
                     ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
                     ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
-                    "ins v24.d[1], temploadreg0\n"
                     ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
                     ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
                     ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
-                    "ldr d25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
                     ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
                     ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
-                    "ins v25.d[1], temploadreg1\n"
                     ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
                     ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
                     ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
-                    "ldr d18, [%[b_ptr0]]\n"
-                    "ins v18.d[1], temploadreg2\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                    "ins v22.d[1], temploadreg2\n"
-                    "b.ne 8b\n"
-                    "7:\n"
-                    "str q26, [%[c_ptr0]]\n"
-                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "b 9f\n"
+                    "6:\n"
                     "movi v26.4s, #0\n"
-                    "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
-                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
-                    "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
-                    "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
-                    "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
-                    "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
+                    ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+                    ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
                     ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
-                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
                     "ldr q18, [%[b_ptr0]]\n"
-                    ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
+                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     "add %[b_ptr0], %[b_ptr0], #0x10\n"
+                    ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
                     ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
                     ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
                     ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
@@ -435,19 +495,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
                     ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
                     ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
-                    "6:\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -514,6 +569,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q3, [a_ptr1], #0x10\n"
                     "ldr q6, [a_ptr2], #0x10\n"
@@ -524,24 +580,35 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     "ldr q4, [a_ptr1], #0x10\n"
                     "ldr q7, [a_ptr2], #0x10\n"
                     "ldr q10, [a_ptr3], #0x10\n"
-                    "ldr q13, [a_ptr4], #0x10\n"
-                    "ldr q16, [a_ptr5], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr d2, [%[a_ptr0]]\n"
+                    "ldr q13, [a_ptr4], #0x10\n"
                     "ldr d5, [a_ptr1]\n"
+                    "ldr q16, [a_ptr5], #0x10\n"
                     "ldr d8, [a_ptr2]\n"
                     "ldr d11, [a_ptr3]\n"
                     "ldr d14, [a_ptr4]\n"
                     "ldr d17, [a_ptr5]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "ldr q3, [a_ptr1], #0x10\n"
+                    "ldr q6, [a_ptr2], #0x10\n"
+                    "ldr q9, [a_ptr3], #0x10\n"
+                    "ldr q12, [a_ptr4], #0x10\n"
+                    "ldr q15, [a_ptr5], #0x10\n"
+                    "ldr q1, [%[a_ptr0]], #0x10\n"
+                    "ldr q4, [a_ptr1], #0x10\n"
+                    "ldr q7, [a_ptr2], #0x10\n"
+                    "ldr q10, [a_ptr3], #0x10\n"
                     "ldr s2, [%[a_ptr0]], #0x4\n"
+                    "ldr q13, [a_ptr4], #0x10\n"
                     "ldr s5, [a_ptr1], #0x4\n"
+                    "ldr q16, [a_ptr5], #0x10\n"
                     "ldr s8, [a_ptr2], #0x4\n"
                     "ldr s11, [a_ptr3], #0x4\n"
                     "ldr s14, [a_ptr4], #0x4\n"
                     "ldr s17, [a_ptr5], #0x4\n"
-                    "subs %[odds], %[odds], #0x1\n"
                     "b.ne 4f\n"
                     "ld1 {v2.b}[4], [%[a_ptr0]]\n"
                     "ld1 {v5.b}[4], [a_ptr1]\n"
@@ -568,38 +635,40 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     "ld1 {v14.b}[6], [a_ptr4]\n"
                     "ld1 {v17.b}[6], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q18, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "ldr q20, [%[b_ptr0], #0x20]\n"
-                    "movi v29.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "ldr q21, [%[b_ptr0], #0x30]\n"
-                    "movi v30.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     "ldr q22, [%[b_ptr0], #0x40]\n"
-                    "movi v31.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     "ldr q23, [%[b_ptr0], #0x50]\n"
-                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     "ldr q24, [%[b_ptr0], #0x60]\n"
-                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     "ldr q25, [%[b_ptr0], #0x70]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
                     ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
@@ -652,180 +721,233 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
                     ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
                     ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d18, [%[b_ptr0]]\n"
+                    "b.eq 7f\n"
+                    "8:\n"
+                    "str q26, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
+                    "movi v26.4s, #0\n"
+                    "ldr d18, [%[b_ptr0]]\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "str q27, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v27.4s, #0\n"
                     "ldr d19, [%[b_ptr0], #0x10]\n"
                     "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "str q28, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "movi v28.4s, #0\n"
                     "ldr d20, [%[b_ptr0], #0x20]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "str q29, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "movi v29.4s, #0\n"
                     "ldr d21, [%[b_ptr0], #0x30]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "str q30, [c_ptr4]\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "movi v30.4s, #0\n"
                     "ldr d22, [%[b_ptr0], #0x40]\n"
                     "ins v18.d[1], temploadreg2\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "str q31, [c_ptr5]\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "movi v31.4s, #0\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
+                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
                     "ldr d23, [%[b_ptr0], #0x50]\n"
+                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     "ins v19.d[1], temploadreg3\n"
+                    ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
                     "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
+                    ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
                     "ldr d24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
                     "ins v20.d[1], temploadreg0\n"
+                    ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
+                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     "ldr d25, [%[b_ptr0], #0x70]\n"
+                    ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
                     "ins v21.d[1], temploadreg1\n"
+                    ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
                     "ins v22.d[1], temploadreg2\n"
+                    ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
                     "ins v23.d[1], temploadreg3\n"
+                    ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
                     "ins v24.d[1], temploadreg0\n"
+                    ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
                     "ins v25.d[1], temploadreg1\n"
-                    "b.eq 7f\n"
-                    "8:\n"
-                    "str q26, [%[c_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "movi v26.4s, #0\n"
+                    ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+                    "ldr d18, [%[b_ptr0]]\n"
+                    ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+                    ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
+                    "ldr d19, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
                     "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+                    ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
+                    "ins v18.d[1], temploadreg2\n"
+                    ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
+                    "ins v19.d[1], temploadreg3\n"
+                    ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
+                    ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
+                    ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
+                    ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
+                    ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
+                    ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
+                    ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
+                    ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
+                    ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
+                    ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
+                    ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
+                    ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
+                    ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
+                    ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+                    ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
+                    ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
+                    ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
+                    ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
+                    ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
+                    ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
+                    ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
+                    ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
+                    ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
+                    ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
+                    ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
+                    ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
+                    ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
+                    ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
+                    "b.ne 8b\n"
+                    "7:\n"
+                    "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
+                    "ldr q21, [%[b_ptr0], #0x30]\n"
+                    "ldr q22, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+                    "ldr q24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
-                    "ldr d18, [%[b_ptr0]]\n"
-                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
-                    ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
-                    "ins v18.d[1], temploadreg2\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
-                    "ldr d19, [%[b_ptr0], #0x10]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
                     "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
-                    "ins v19.d[1], temploadreg3\n"
                     ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
                     ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
                     ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
                     ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
-                    "ldr d20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
                     ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
                     ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
-                    "ins v20.d[1], temploadreg0\n"
                     ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
                     ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
                     ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
-                    "ldr d21, [%[b_ptr0], #0x30]\n"
                     ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
                     ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
                     ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
-                    "ins v21.d[1], temploadreg1\n"
                     ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
                     ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
                     ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
-                    "ldr d22, [%[b_ptr0], #0x40]\n"
                     ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
                     ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
                     ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
                     ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
                     ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
                     ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
-                    "ldr d23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
                     ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
                     ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
                     ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
                     ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
                     ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
-                    "ldr d24, [%[b_ptr0], #0x60]\n"
                     ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
                     ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
                     ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
-                    "ins v24.d[1], temploadreg0\n"
                     ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
                     ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
                     ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
-                    "ldr d25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
                     ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
                     ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
-                    "ins v25.d[1], temploadreg1\n"
                     ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
                     ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
                     ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
-                    "ldr d18, [%[b_ptr0]]\n"
                     ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
                     ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
                     ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
-                    "ins v18.d[1], temploadreg2\n"
                     ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
                     ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
                     ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
-                    "ldr d19, [%[b_ptr0], #0x10]\n"
-                    "ins v22.d[1], temploadreg2\n"
-                    "ins v19.d[1], temploadreg3\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                    "ins v23.d[1], temploadreg3\n"
-                    "b.ne 8b\n"
-                    "7:\n"
-                    "str q26, [%[c_ptr0]]\n"
-                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "b 9f\n"
+                    "6:\n"
                     "movi v26.4s, #0\n"
-                    "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
-                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
-                    "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
-                    "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
-                    "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
-                    "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
+                    ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+                    ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
                     ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
-                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
                     "ldr q18, [%[b_ptr0]]\n"
+                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
                     ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
                     ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
@@ -881,19 +1003,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
                     ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
                     ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
-                    "6:\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -1014,38 +1131,40 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     "ld1 {v14.b}[10], [a_ptr4]\n"
                     "ld1 {v17.b}[10], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q18, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "ldr q20, [%[b_ptr0], #0x20]\n"
-                    "movi v29.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "ldr q21, [%[b_ptr0], #0x30]\n"
-                    "movi v30.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     "ldr q22, [%[b_ptr0], #0x40]\n"
-                    "movi v31.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     "ldr q23, [%[b_ptr0], #0x50]\n"
-                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     "ldr q24, [%[b_ptr0], #0x60]\n"
-                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     "ldr q25, [%[b_ptr0], #0x70]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
                     ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
@@ -1105,189 +1224,249 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
                     ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
                     ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d18, [%[b_ptr0]]\n"
+                    "b.eq 7f\n"
+                    "8:\n"
+                    "str q26, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
+                    "movi v26.4s, #0\n"
+                    "ldr d18, [%[b_ptr0]]\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "str q27, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v27.4s, #0\n"
                     "ldr d19, [%[b_ptr0], #0x10]\n"
                     "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "str q28, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "movi v28.4s, #0\n"
                     "ldr d20, [%[b_ptr0], #0x20]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "str q29, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "movi v29.4s, #0\n"
                     "ldr d21, [%[b_ptr0], #0x30]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "str q30, [c_ptr4]\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "movi v30.4s, #0\n"
                     "ldr d22, [%[b_ptr0], #0x40]\n"
                     "ins v18.d[1], temploadreg2\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "str q31, [c_ptr5]\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "movi v31.4s, #0\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
+                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
                     "ldr d23, [%[b_ptr0], #0x50]\n"
+                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     "ins v19.d[1], temploadreg3\n"
+                    ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
                     "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
+                    ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
                     "ldr d24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
                     "ins v20.d[1], temploadreg0\n"
+                    ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
+                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     "ldr d25, [%[b_ptr0], #0x70]\n"
+                    ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
                     "ins v21.d[1], temploadreg1\n"
+                    ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
                     "ins v22.d[1], temploadreg2\n"
+                    ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
                     "ins v23.d[1], temploadreg3\n"
+                    ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
                     "ins v24.d[1], temploadreg0\n"
+                    ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
                     "ins v25.d[1], temploadreg1\n"
-                    "b.eq 7f\n"
-                    "8:\n"
-                    "str q26, [%[c_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "movi v26.4s, #0\n"
+                    ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+                    "ldr d18, [%[b_ptr0]]\n"
+                    ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+                    ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
+                    "ldr d19, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
                     "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+                    ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
+                    "ldr d20, [%[b_ptr0], #0x20]\n"
+                    ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+                    ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
+                    "ins v18.d[1], temploadreg2\n"
+                    ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
+                    "ins v19.d[1], temploadreg3\n"
+                    ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
+                    "ins v20.d[1], temploadreg0\n"
+                    ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
+                    ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
+                    ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
+                    ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
+                    ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
+                    ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
+                    ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
+                    ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
+                    ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
+                    ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
+                    ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
+                    ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+                    ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
+                    ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
+                    ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
+                    ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
+                    ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
+                    ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
+                    ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
+                    ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
+                    ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
+                    ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
+                    ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
+                    ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
+                    ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
+                    ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
+                    ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
+                    ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
+                    ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
+                    ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
+                    ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
+                    ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
+                    "b.ne 8b\n"
+                    "7:\n"
+                    "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
+                    "ldr q21, [%[b_ptr0], #0x30]\n"
+                    "ldr q22, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+                    "ldr q24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
-                    "ldr d18, [%[b_ptr0]]\n"
-                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
-                    ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
-                    "ins v18.d[1], temploadreg2\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
-                    "ldr d19, [%[b_ptr0], #0x10]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
-                    "ins v19.d[1], temploadreg3\n"
                     ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
                     ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
                     ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
                     ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
-                    "ldr d20, [%[b_ptr0], #0x20]\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
                     "add %[b_ptr0], %[b_ptr0], #0x30\n"
                     ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
-                    "ins v20.d[1], temploadreg0\n"
                     ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
                     ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
                     ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
                     ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
-                    "ldr d21, [%[b_ptr0], #0x30]\n"
                     ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
                     ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
                     ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
                     ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
-                    "ins v21.d[1], temploadreg1\n"
                     ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
                     ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
-                    "ldr d22, [%[b_ptr0], #0x40]\n"
                     ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
                     ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
                     ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
                     ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
                     ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
                     ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
-                    "ldr d23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
                     ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
                     ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
                     ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
                     ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
                     ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
-                    "ldr d24, [%[b_ptr0], #0x60]\n"
                     ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
                     ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
                     ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
                     ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
                     ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
                     ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
-                    "ldr d25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
                     ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
                     ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
-                    "ins v25.d[1], temploadreg1\n"
                     ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
                     ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
                     ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
-                    "ldr d18, [%[b_ptr0]]\n"
                     ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
                     ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
                     ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
-                    "ins v18.d[1], temploadreg2\n"
                     ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
                     ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
                     ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
-                    "ldr d19, [%[b_ptr0], #0x10]\n"
                     ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
-                    "ins v22.d[1], temploadreg2\n"
                     ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
                     ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
-                    "ins v19.d[1], temploadreg3\n"
                     ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
                     ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
                     ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
-                    "ldr d20, [%[b_ptr0], #0x20]\n"
-                    "ins v23.d[1], temploadreg3\n"
-                    "ins v20.d[1], temploadreg0\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                    "ins v24.d[1], temploadreg0\n"
-                    "b.ne 8b\n"
-                    "7:\n"
-                    "str q26, [%[c_ptr0]]\n"
-                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "b 9f\n"
+                    "6:\n"
                     "movi v26.4s, #0\n"
-                    "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
-                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
-                    "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
-                    "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
-                    "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
-                    "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
+                    ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+                    ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
                     ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
-                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
                     "ldr q18, [%[b_ptr0]]\n"
+                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
                     ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
                     ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
@@ -1350,19 +1529,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
                     ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
                     ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
-                    "6:\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -1429,6 +1603,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q3, [a_ptr1], #0x10\n"
                     "ldr q6, [a_ptr2], #0x10\n"
@@ -1441,7 +1616,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     "ldr q10, [a_ptr3], #0x10\n"
                     "ldr q13, [a_ptr4], #0x10\n"
                     "ldr q16, [a_ptr5], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr q2, [%[a_ptr0]]\n"
                     "ldr q5, [a_ptr1]\n"
                     "ldr q8, [a_ptr2]\n"
@@ -1450,8 +1624,21 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     "ldr q17, [a_ptr5]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "ldr q3, [a_ptr1], #0x10\n"
+                    "ldr q6, [a_ptr2], #0x10\n"
+                    "ldr q9, [a_ptr3], #0x10\n"
+                    "ldr q12, [a_ptr4], #0x10\n"
+                    "ldr q15, [a_ptr5], #0x10\n"
+                    "ldr q1, [%[a_ptr0]], #0x10\n"
+                    "ldr q4, [a_ptr1], #0x10\n"
+                    "ldr q7, [a_ptr2], #0x10\n"
+                    "ldr q10, [a_ptr3], #0x10\n"
                     "ldr d2, [%[a_ptr0]], #0x8\n"
+                    "ldr q13, [a_ptr4], #0x10\n"
                     "ldr d5, [a_ptr1], #0x8\n"
+                    "ldr q16, [a_ptr5], #0x10\n"
                     "ldr d8, [a_ptr2], #0x8\n"
                     "ldr d11, [a_ptr3], #0x8\n"
                     "ldr d14, [a_ptr4], #0x8\n"
@@ -1462,7 +1649,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     "ld1 {v11.s}[2], [a_ptr3], #4\n"
                     "ld1 {v14.s}[2], [a_ptr4], #4\n"
                     "ld1 {v17.s}[2], [a_ptr5], #4\n"
-                    "subs %[odds], %[odds], #0x1\n"
                     "b.ne 4f\n"
                     "ld1 {v2.b}[12], [%[a_ptr0]]\n"
                     "ld1 {v5.b}[12], [a_ptr1]\n"
@@ -1489,38 +1675,40 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     "ld1 {v14.b}[14], [a_ptr4]\n"
                     "ld1 {v17.b}[14], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q18, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "ldr q20, [%[b_ptr0], #0x20]\n"
-                    "movi v29.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "ldr q21, [%[b_ptr0], #0x30]\n"
-                    "movi v30.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     "ldr q22, [%[b_ptr0], #0x40]\n"
-                    "movi v31.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     "ldr q23, [%[b_ptr0], #0x50]\n"
-                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     "ldr q24, [%[b_ptr0], #0x60]\n"
-                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     "ldr q25, [%[b_ptr0], #0x70]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
                     ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
@@ -1587,198 +1775,265 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
                     ".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
                     ".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d18, [%[b_ptr0]]\n"
+                    "b.eq 7f\n"
+                    "8:\n"
+                    "str q26, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
+                    "movi v26.4s, #0\n"
+                    "ldr d18, [%[b_ptr0]]\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "str q27, [c_ptr1]\n"
+                    "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v27.4s, #0\n"
                     "ldr d19, [%[b_ptr0], #0x10]\n"
                     "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "str q28, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "movi v28.4s, #0\n"
                     "ldr d20, [%[b_ptr0], #0x20]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "str q29, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "movi v29.4s, #0\n"
                     "ldr d21, [%[b_ptr0], #0x30]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "str q30, [c_ptr4]\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "movi v30.4s, #0\n"
                     "ldr d22, [%[b_ptr0], #0x40]\n"
                     "ins v18.d[1], temploadreg2\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "str q31, [c_ptr5]\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "movi v31.4s, #0\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
+                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
                     "ldr d23, [%[b_ptr0], #0x50]\n"
+                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     "ins v19.d[1], temploadreg3\n"
+                    ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
                     "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
+                    ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
                     "ldr d24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
                     "ins v20.d[1], temploadreg0\n"
+                    ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
+                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     "ldr d25, [%[b_ptr0], #0x70]\n"
+                    ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
                     "ins v21.d[1], temploadreg1\n"
+                    ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
                     "ins v22.d[1], temploadreg2\n"
+                    ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
                     "ins v23.d[1], temploadreg3\n"
+                    ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
                     "ins v24.d[1], temploadreg0\n"
-                    "b.eq 7f\n"
-                    "8:\n"
-                    "str q26, [%[c_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "movi v26.4s, #0\n"
+                    ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
                     "ins v25.d[1], temploadreg1\n"
+                    ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+                    "ldr d18, [%[b_ptr0]]\n"
+                    ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+                    ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
+                    "ldr d19, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+                    ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
+                    "ldr d20, [%[b_ptr0], #0x20]\n"
+                    ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+                    ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+                    ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
+                    "ins v18.d[1], temploadreg2\n"
+                    ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
+                    "ins v19.d[1], temploadreg3\n"
+                    ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
+                    "ldr d21, [%[b_ptr0], #0x30]\n"
+                    ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
+                    "ins v20.d[1], temploadreg0\n"
+                    ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
+                    "ins v21.d[1], temploadreg1\n"
+                    ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
+                    ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
+                    ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
+                    ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
+                    ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
+                    ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
+                    ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
+                    ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+                    ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
+                    ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
+                    ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
+                    ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
+                    ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
+                    ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
+                    ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
+                    ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
+                    ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
+                    ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
+                    ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
+                    ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
+                    ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
+                    ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
+                    ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
+                    ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
+                    ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
+                    ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
+                    ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
+                    ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
+                    ".inst 0x6fa2eaba // udot v26.4s, v21.16b, v2.4b[3]\n"
+                    ".inst 0x6fa5eabb // udot v27.4s, v21.16b, v5.4b[3]\n"
+                    ".inst 0x6fa8eabc // udot v28.4s, v21.16b, v8.4b[3]\n"
+                    ".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
+                    ".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
+                    ".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
+                    "b.ne 8b\n"
+                    "7:\n"
+                    "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+                    "ldr q21, [%[b_ptr0], #0x30]\n"
+                    "ldr q22, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+                    "ldr q24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
-                    "ldr d18, [%[b_ptr0]]\n"
-                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
-                    ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
-                    "ins v18.d[1], temploadreg2\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
-                    "ldr d19, [%[b_ptr0], #0x10]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
-                    "ins v19.d[1], temploadreg3\n"
                     ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
                     ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
                     ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
                     ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
-                    "ldr d20, [%[b_ptr0], #0x20]\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
                     ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
                     ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
-                    "ins v20.d[1], temploadreg0\n"
                     ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
                     ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
                     ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
-                    "ldr d21, [%[b_ptr0], #0x30]\n"
+                    "ldr q21, [%[b_ptr0], #0x30]\n"
                     ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
                     "add %[b_ptr0], %[b_ptr0], #0x40\n"
                     ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
-                    "ins v21.d[1], temploadreg1\n"
                     ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
                     ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
                     ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
                     ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
                     ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
-                    "ldr d22, [%[b_ptr0], #0x40]\n"
                     ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
                     ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
                     ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
                     ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
                     ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
-                    "ldr d23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
                     ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
                     ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
                     ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
                     ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
                     ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
-                    "ldr d24, [%[b_ptr0], #0x60]\n"
                     ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
                     ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
                     ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
                     ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
                     ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
                     ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
-                    "ldr d25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
                     ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
                     ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
                     ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
                     ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
                     ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
-                    "ldr d18, [%[b_ptr0]]\n"
                     ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
                     ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
                     ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
-                    "ins v18.d[1], temploadreg2\n"
                     ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
                     ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
                     ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
-                    "ldr d19, [%[b_ptr0], #0x10]\n"
                     ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
-                    "ins v22.d[1], temploadreg2\n"
                     ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
                     ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
-                    "ins v19.d[1], temploadreg3\n"
                     ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
                     ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
                     ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
-                    "ldr d20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6fa2eaba // udot v26.4s, v21.16b, v2.4b[3]\n"
-                    "ins v23.d[1], temploadreg3\n"
                     ".inst 0x6fa5eabb // udot v27.4s, v21.16b, v5.4b[3]\n"
                     ".inst 0x6fa8eabc // udot v28.4s, v21.16b, v8.4b[3]\n"
-                    "ins v20.d[1], temploadreg0\n"
                     ".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
                     ".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
                     ".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
-                    "ldr d21, [%[b_ptr0], #0x30]\n"
-                    "ins v24.d[1], temploadreg0\n"
-                    "ins v21.d[1], temploadreg1\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                    "b.ne 8b\n"
-                    "7:\n"
-                    "str q26, [%[c_ptr0]]\n"
-                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "b 9f\n"
+                    "6:\n"
                     "movi v26.4s, #0\n"
-                    "ins v25.d[1], temploadreg1\n"
-                    "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
-                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
-                    "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
-                    "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
-                    "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
-                    "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
+                    ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+                    ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
                     ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
-                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
                     "ldr q18, [%[b_ptr0]]\n"
+                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
                     ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
                     ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
@@ -1848,19 +2103,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
                     ".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
                     ".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
-                    "6:\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -1927,6 +2177,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q4, [a_ptr1], #0x10\n"
                     "ldr q8, [a_ptr2], #0x10\n"
@@ -1943,18 +2194,35 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     "ldr q6, [a_ptr1], #0x10\n"
                     "ldr q10, [a_ptr2], #0x10\n"
                     "ldr q14, [a_ptr3], #0x10\n"
-                    "ldr q18, [a_ptr4], #0x10\n"
-                    "ldr q22, [a_ptr5], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr s3, [%[a_ptr0]]\n"
+                    "ldr q18, [a_ptr4], #0x10\n"
                     "ldr s7, [a_ptr1]\n"
+                    "ldr q22, [a_ptr5], #0x10\n"
                     "ldr s11, [a_ptr2]\n"
                     "ldr s15, [a_ptr3]\n"
                     "ldr s19, [a_ptr4]\n"
                     "ldr s23, [a_ptr5]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
                     "subs %[odds], %[odds], #0x1\n"
+                    "ldr q4, [a_ptr1], #0x10\n"
+                    "ldr q8, [a_ptr2], #0x10\n"
+                    "ldr q12, [a_ptr3], #0x10\n"
+                    "ldr q16, [a_ptr4], #0x10\n"
+                    "ldr q20, [a_ptr5], #0x10\n"
+                    "ldr q1, [%[a_ptr0]], #0x10\n"
+                    "ldr q5, [a_ptr1], #0x10\n"
+                    "ldr q9, [a_ptr2], #0x10\n"
+                    "ldr q13, [a_ptr3], #0x10\n"
+                    "ldr q17, [a_ptr4], #0x10\n"
+                    "ldr q21, [a_ptr5], #0x10\n"
+                    "ldr q2, [%[a_ptr0]], #0x10\n"
+                    "ldr q6, [a_ptr1], #0x10\n"
+                    "ldr q10, [a_ptr2], #0x10\n"
+                    "ldr q14, [a_ptr3], #0x10\n"
+                    "ldr q18, [a_ptr4], #0x10\n"
+                    "ldr q22, [a_ptr5], #0x10\n"
                     "b.ne 4f\n"
                     "ldr b3, [%[a_ptr0]]\n"
                     "ldr b7, [a_ptr1]\n"
@@ -1981,24 +2249,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     "ld1 {v19.b}[2], [a_ptr4]\n"
                     "ld1 {v23.b}[2], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q24, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q25, [%[b_ptr0], #0x10]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
                     "movi v28.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
                     ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
                     ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
@@ -2091,57 +2361,55 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
                     ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
                     ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d24, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    "ldr d25, [%[b_ptr0], #0x10]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    "ins v24.d[1], temploadreg0\n"
-                    "ins v25.d[1], temploadreg1\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v26.4s, #0\n"
+                    "ldr d24, [%[b_ptr0]]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
-                    ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+                    "ldr d25, [%[b_ptr0], #0x10]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q28, [c_ptr2]\n"
-                    "movi v28.4s, #0\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+                    "movi v28.4s, #0\n"
+                    "ins v24.d[1], temploadreg0\n"
+                    "ins v25.d[1], temploadreg1\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "str q29, [c_ptr3]\n"
-                    "movi v29.4s, #0\n"
                     "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+                    "movi v29.4s, #0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+                    ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
-                    ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+                    ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
                     "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
                     "ldr d24, [%[b_ptr0]]\n"
                     ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
                     "ins v24.d[1], temploadreg0\n"
                     ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
-                    ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
-                    ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+                    ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
                     ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
                     "ldr d25, [%[b_ptr0], #0x10]\n"
                     ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
@@ -2235,27 +2503,23 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
                     "ins v24.d[1], temploadreg0\n"
                     ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                     ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                     ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
                     ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
-                    "ldr d25, [%[b_ptr0], #0x10]\n"
                     ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
                     ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
                     ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
-                    "ins v25.d[1], temploadreg1\n"
                     ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
                     ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
                     ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
-                    "ldr d24, [%[b_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    "ins v24.d[1], temploadreg0\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v26.4s, #0\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
@@ -2366,19 +2630,117 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
                     ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
                     ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+                    ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+                    ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+                    ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+                    ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+                    ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
+                    ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
+                    ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
+                    ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
+                    ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+                    ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
+                    ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
+                    ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
+                    ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
+                    ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
+                    ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
+                    ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
+                    ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
+                    ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
+                    ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
+                    ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
+                    ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
+                    ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
+                    ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
+                    ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
+                    ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
+                    ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
+                    ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
+                    ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
+                    ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
+                    ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
+                    ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
+                    ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
+                    ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
+                    ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
+                    ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
+                    ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
+                    ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
+                    ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
+                    ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
+                    ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
+                    ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
+                    ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
+                    ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
+                    ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
+                    ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
+                    ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
+                    ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
+                    ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
+                    ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
+                    ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
+                    ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
+                    ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -2445,6 +2807,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q4, [a_ptr1], #0x10\n"
                     "ldr q8, [a_ptr2], #0x10\n"
@@ -2461,24 +2824,41 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     "ldr q6, [a_ptr1], #0x10\n"
                     "ldr q10, [a_ptr2], #0x10\n"
                     "ldr q14, [a_ptr3], #0x10\n"
-                    "ldr q18, [a_ptr4], #0x10\n"
-                    "ldr q22, [a_ptr5], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr d3, [%[a_ptr0]]\n"
+                    "ldr q18, [a_ptr4], #0x10\n"
                     "ldr d7, [a_ptr1]\n"
+                    "ldr q22, [a_ptr5], #0x10\n"
                     "ldr d11, [a_ptr2]\n"
                     "ldr d15, [a_ptr3]\n"
                     "ldr d19, [a_ptr4]\n"
                     "ldr d23, [a_ptr5]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "ldr q4, [a_ptr1], #0x10\n"
+                    "ldr q8, [a_ptr2], #0x10\n"
+                    "ldr q12, [a_ptr3], #0x10\n"
+                    "ldr q16, [a_ptr4], #0x10\n"
+                    "ldr q20, [a_ptr5], #0x10\n"
+                    "ldr q1, [%[a_ptr0]], #0x10\n"
+                    "ldr q5, [a_ptr1], #0x10\n"
+                    "ldr q9, [a_ptr2], #0x10\n"
+                    "ldr q13, [a_ptr3], #0x10\n"
+                    "ldr q17, [a_ptr4], #0x10\n"
+                    "ldr q21, [a_ptr5], #0x10\n"
+                    "ldr q2, [%[a_ptr0]], #0x10\n"
+                    "ldr q6, [a_ptr1], #0x10\n"
+                    "ldr q10, [a_ptr2], #0x10\n"
+                    "ldr q14, [a_ptr3], #0x10\n"
                     "ldr s3, [%[a_ptr0]], #0x4\n"
+                    "ldr q18, [a_ptr4], #0x10\n"
                     "ldr s7, [a_ptr1], #0x4\n"
+                    "ldr q22, [a_ptr5], #0x10\n"
                     "ldr s11, [a_ptr2], #0x4\n"
                     "ldr s15, [a_ptr3], #0x4\n"
                     "ldr s19, [a_ptr4], #0x4\n"
                     "ldr s23, [a_ptr5], #0x4\n"
-                    "subs %[odds], %[odds], #0x1\n"
                     "b.ne 4f\n"
                     "ld1 {v3.b}[4], [%[a_ptr0]]\n"
                     "ld1 {v7.b}[4], [a_ptr1]\n"
@@ -2505,24 +2885,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     "ld1 {v19.b}[6], [a_ptr4]\n"
                     "ld1 {v23.b}[6], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q24, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q25, [%[b_ptr0], #0x10]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
                     "movi v28.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
                     ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
                     ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
@@ -2622,68 +3004,66 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
                     ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
                     ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d24, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    "ldr d25, [%[b_ptr0], #0x10]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    "ins v24.d[1], temploadreg0\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v26.4s, #0\n"
-                    "ins v25.d[1], temploadreg1\n"
+                    "ldr d24, [%[b_ptr0]]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
+                    "ldr d25, [%[b_ptr0], #0x10]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q28, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     "movi v28.4s, #0\n"
+                    "ins v24.d[1], temploadreg0\n"
+                    "ins v25.d[1], temploadreg1\n"
                     "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
-                    ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
                     "str q29, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
+                    ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
                     "ldr d24, [%[b_ptr0]]\n"
                     ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
                     "ins v24.d[1], temploadreg0\n"
                     ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
                     "ldr d25, [%[b_ptr0], #0x10]\n"
                     ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
                     "ins v25.d[1], temploadreg1\n"
                     ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
                     ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
                     "ldr d24, [%[b_ptr0]]\n"
                     ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
@@ -2775,27 +3155,23 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
                     "ins v25.d[1], temploadreg1\n"
                     ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                     ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                     ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
                     ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
-                    "ldr d24, [%[b_ptr0]]\n"
                     ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
                     ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
                     ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
-                    "ins v24.d[1], temploadreg0\n"
                     ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
                     ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
                     ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
-                    "ldr d25, [%[b_ptr0], #0x10]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v26.4s, #0\n"
-                    "ins v25.d[1], temploadreg1\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
@@ -2913,19 +3289,124 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
                     ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
                     ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+                    ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+                    ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+                    ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+                    ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+                    ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
+                    ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
+                    ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
+                    ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
+                    ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+                    ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
+                    ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
+                    ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
+                    ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
+                    ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
+                    ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
+                    ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
+                    ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
+                    ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
+                    ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
+                    ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
+                    ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
+                    ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
+                    ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
+                    ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
+                    ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
+                    ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
+                    ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
+                    ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
+                    ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
+                    ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
+                    ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
+                    ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
+                    ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
+                    ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
+                    ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
+                    ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
+                    ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
+                    ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
+                    ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
+                    ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
+                    ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
+                    ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
+                    ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
+                    ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
+                    ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
+                    ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
+                    ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
+                    ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
+                    ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
+                    ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+                    ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
+                    ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
+                    ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
+                    ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
+                    ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
+                    ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -3052,24 +3533,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     "ld1 {v19.b}[10], [a_ptr4]\n"
                     "ld1 {v23.b}[10], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q24, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q25, [%[b_ptr0], #0x10]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
                     "movi v28.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
                     ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
                     ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
@@ -3177,57 +3660,55 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
                     ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
                     ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d24, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    "ldr d25, [%[b_ptr0], #0x10]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    "ins v24.d[1], temploadreg0\n"
-                    "ins v25.d[1], temploadreg1\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v26.4s, #0\n"
+                    "ldr d24, [%[b_ptr0]]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
-                    ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+                    "ldr d25, [%[b_ptr0], #0x10]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q28, [c_ptr2]\n"
-                    "movi v28.4s, #0\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+                    "movi v28.4s, #0\n"
+                    "ins v24.d[1], temploadreg0\n"
+                    "ins v25.d[1], temploadreg1\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "str q29, [c_ptr3]\n"
-                    "movi v29.4s, #0\n"
                     "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+                    "movi v29.4s, #0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+                    ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
-                    ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+                    ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
                     "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
                     "ldr d24, [%[b_ptr0]]\n"
                     ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
                     "ins v24.d[1], temploadreg0\n"
                     ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
-                    ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
-                    ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+                    ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
                     ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
                     "ldr d25, [%[b_ptr0], #0x10]\n"
                     ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
@@ -3340,27 +3821,23 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
                     "ins v24.d[1], temploadreg0\n"
                     ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                     ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                     ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
                     ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
-                    "ldr d25, [%[b_ptr0], #0x10]\n"
                     ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
                     ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
                     ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
-                    "ins v25.d[1], temploadreg1\n"
                     ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
                     ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
                     ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
-                    "ldr d24, [%[b_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    "ins v24.d[1], temploadreg0\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v26.4s, #0\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
@@ -3486,19 +3963,132 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
                     ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
                     ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+                    ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+                    ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+                    ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+                    ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+                    ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
+                    ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
+                    ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
+                    ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
+                    ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+                    ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
+                    ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
+                    ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
+                    ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
+                    ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
+                    ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
+                    ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
+                    ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
+                    ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
+                    ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
+                    ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
+                    ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
+                    ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
+                    ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
+                    ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
+                    ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
+                    ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
+                    ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
+                    ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
+                    ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
+                    ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
+                    ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
+                    ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
+                    ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
+                    ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
+                    ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
+                    ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
+                    ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
+                    ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
+                    ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
+                    ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
+                    ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
+                    ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
+                    ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
+                    ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
+                    ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
+                    ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
+                    ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
+                    ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
+                    ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
+                    ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
+                    ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
+                    ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
+                    ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
+                    ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
+                    ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
+                    ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
+                    ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
+                    ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
+                    ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
+                    ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
+                    ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -3566,6 +4156,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q4, [a_ptr1], #0x10\n"
                     "ldr q8, [a_ptr2], #0x10\n"
@@ -3584,7 +4175,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     "ldr q14, [a_ptr3], #0x10\n"
                     "ldr q18, [a_ptr4], #0x10\n"
                     "ldr q22, [a_ptr5], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr q3, [%[a_ptr0]]\n"
                     "ldr q7, [a_ptr1]\n"
                     "ldr q11, [a_ptr2]\n"
@@ -3593,8 +4183,27 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     "ldr q23, [a_ptr5]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "ldr q4, [a_ptr1], #0x10\n"
+                    "ldr q8, [a_ptr2], #0x10\n"
+                    "ldr q12, [a_ptr3], #0x10\n"
+                    "ldr q16, [a_ptr4], #0x10\n"
+                    "ldr q20, [a_ptr5], #0x10\n"
+                    "ldr q1, [%[a_ptr0]], #0x10\n"
+                    "ldr q5, [a_ptr1], #0x10\n"
+                    "ldr q9, [a_ptr2], #0x10\n"
+                    "ldr q13, [a_ptr3], #0x10\n"
+                    "ldr q17, [a_ptr4], #0x10\n"
+                    "ldr q21, [a_ptr5], #0x10\n"
+                    "ldr q2, [%[a_ptr0]], #0x10\n"
+                    "ldr q6, [a_ptr1], #0x10\n"
+                    "ldr q10, [a_ptr2], #0x10\n"
+                    "ldr q14, [a_ptr3], #0x10\n"
                     "ldr d3, [%[a_ptr0]], #0x8\n"
+                    "ldr q18, [a_ptr4], #0x10\n"
                     "ldr d7, [a_ptr1], #0x8\n"
+                    "ldr q22, [a_ptr5], #0x10\n"
                     "ldr d11, [a_ptr2], #0x8\n"
                     "ldr d15, [a_ptr3], #0x8\n"
                     "ldr d19, [a_ptr4], #0x8\n"
@@ -3605,7 +4214,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     "ld1 {v15.s}[2], [a_ptr3], #4\n"
                     "ld1 {v19.s}[2], [a_ptr4], #4\n"
                     "ld1 {v23.s}[2], [a_ptr5], #4\n"
-                    "subs %[odds], %[odds], #0x1\n"
                     "b.ne 4f\n"
                     "ld1 {v3.b}[12], [%[a_ptr0]]\n"
                     "ld1 {v7.b}[12], [a_ptr1]\n"
@@ -3632,24 +4240,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     "ld1 {v19.b}[14], [a_ptr4]\n"
                     "ld1 {v23.b}[14], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q24, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q25, [%[b_ptr0], #0x10]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
                     "movi v28.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
                     ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
                     ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
@@ -3764,68 +4374,66 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
                     ".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
                     ".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d24, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    "ldr d25, [%[b_ptr0], #0x10]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    "ins v24.d[1], temploadreg0\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v26.4s, #0\n"
-                    "ins v25.d[1], temploadreg1\n"
+                    "ldr d24, [%[b_ptr0]]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
+                    "ldr d25, [%[b_ptr0], #0x10]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q28, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     "movi v28.4s, #0\n"
+                    "ins v24.d[1], temploadreg0\n"
+                    "ins v25.d[1], temploadreg1\n"
                     "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
-                    ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
                     "str q29, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
+                    ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
                     "ldr d24, [%[b_ptr0]]\n"
                     ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
                     "ins v24.d[1], temploadreg0\n"
                     ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
                     "ldr d25, [%[b_ptr0], #0x10]\n"
                     ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
                     "ins v25.d[1], temploadreg1\n"
                     ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
                     ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
                     "ldr d24, [%[b_ptr0]]\n"
                     ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
@@ -3936,27 +4544,23 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
                     "ins v25.d[1], temploadreg1\n"
                     ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                     ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                     ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
                     ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
-                    "ldr d24, [%[b_ptr0]]\n"
                     ".inst 0x6fa3eb3a // udot v26.4s, v25.16b, v3.4b[3]\n"
                     ".inst 0x6fa7eb3b // udot v27.4s, v25.16b, v7.4b[3]\n"
                     ".inst 0x6fabeb3c // udot v28.4s, v25.16b, v11.4b[3]\n"
-                    "ins v24.d[1], temploadreg0\n"
                     ".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
                     ".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
                     ".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
-                    "ldr d25, [%[b_ptr0], #0x10]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v26.4s, #0\n"
-                    "ins v25.d[1], temploadreg1\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
@@ -4089,19 +4693,139 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
                     ".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
                     ".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+                    ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+                    ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+                    ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+                    ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+                    ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
+                    ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
+                    ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
+                    ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
+                    ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+                    ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
+                    ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
+                    ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
+                    ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
+                    ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
+                    ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
+                    ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
+                    ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
+                    ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
+                    ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
+                    ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
+                    ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
+                    ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
+                    ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
+                    ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
+                    ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
+                    ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
+                    ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
+                    ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
+                    ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
+                    ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
+                    ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
+                    ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
+                    ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
+                    ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
+                    ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
+                    ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
+                    ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
+                    ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
+                    ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
+                    ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
+                    ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
+                    ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
+                    ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
+                    ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
+                    ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
+                    ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
+                    ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
+                    ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
+                    ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
+                    ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
+                    ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
+                    ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
+                    ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
+                    ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
+                    ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
+                    ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
+                    ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
+                    ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
+                    ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
+                    ".inst 0x6fa3eb3a // udot v26.4s, v25.16b, v3.4b[3]\n"
+                    ".inst 0x6fa7eb3b // udot v27.4s, v25.16b, v7.4b[3]\n"
+                    ".inst 0x6fabeb3c // udot v28.4s, v25.16b, v11.4b[3]\n"
+                    ".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
+                    ".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
+                    ".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/generic.cpp
index fe69f744e2..10bd16aa59 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,7 +33,7 @@
 
 namespace arm_gemm {
 
-void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) {
+void a64_smallK_hybrid_u8u32_dot_6x4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) {
     const long loops_count = iceildiv(N, (int)4) - 1;
     const long ldab = lda * sizeof(uint8_t);
     const long ldcb = ldc * sizeof(uint32_t);
@@ -93,6 +93,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q3, [a_ptr1], #0x10\n"
                     "ldr q6, [a_ptr2], #0x10\n"
@@ -103,18 +104,29 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     "ldr q4, [a_ptr1], #0x10\n"
                     "ldr q7, [a_ptr2], #0x10\n"
                     "ldr q10, [a_ptr3], #0x10\n"
-                    "ldr q13, [a_ptr4], #0x10\n"
-                    "ldr q16, [a_ptr5], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr s2, [%[a_ptr0]]\n"
+                    "ldr q13, [a_ptr4], #0x10\n"
                     "ldr s5, [a_ptr1]\n"
+                    "ldr q16, [a_ptr5], #0x10\n"
                     "ldr s8, [a_ptr2]\n"
                     "ldr s11, [a_ptr3]\n"
                     "ldr s14, [a_ptr4]\n"
                     "ldr s17, [a_ptr5]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
                     "subs %[odds], %[odds], #0x1\n"
+                    "ldr q3, [a_ptr1], #0x10\n"
+                    "ldr q6, [a_ptr2], #0x10\n"
+                    "ldr q9, [a_ptr3], #0x10\n"
+                    "ldr q12, [a_ptr4], #0x10\n"
+                    "ldr q15, [a_ptr5], #0x10\n"
+                    "ldr q1, [%[a_ptr0]], #0x10\n"
+                    "ldr q4, [a_ptr1], #0x10\n"
+                    "ldr q7, [a_ptr2], #0x10\n"
+                    "ldr q10, [a_ptr3], #0x10\n"
+                    "ldr q13, [a_ptr4], #0x10\n"
+                    "ldr q16, [a_ptr5], #0x10\n"
                     "b.ne 4f\n"
                     "ldr b2, [%[a_ptr0]]\n"
                     "ldr b5, [a_ptr1]\n"
@@ -141,40 +153,42 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     "ld1 {v14.b}[2], [a_ptr4]\n"
                     "ld1 {v17.b}[2], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q18, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "ldr q20, [%[b_ptr0], #0x20]\n"
-                    "movi v29.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "ldr q21, [%[b_ptr0], #0x30]\n"
-                    "movi v30.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     "ldr q22, [%[b_ptr0], #0x40]\n"
-                    "movi v31.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     "ldr q23, [%[b_ptr0], #0x50]\n"
-                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     "ldr q24, [%[b_ptr0], #0x60]\n"
-                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     "ldr q25, [%[b_ptr0], #0x70]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                     ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                     ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
                     ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
                     ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
@@ -218,139 +232,201 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
                     ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
                     ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q18, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr q19, [%[b_ptr0], #0x10]\n"
-                    "ldr q20, [%[b_ptr0], #0x20]\n"
-                    "ldr q21, [%[b_ptr0], #0x30]\n"
-                    "ldr q22, [%[b_ptr0], #0x40]\n"
-                    "ldr q23, [%[b_ptr0], #0x50]\n"
-                    "ldr q24, [%[b_ptr0], #0x60]\n"
-                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q26, [%[c_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                    "movi v26.4s, #0\n"
                     "subs %[loops], %[loops], #0x1\n"
-                    "str q27, [c_ptr1]\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v27.4s, #0\n"
+                    "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v27.4s, #0\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ldr q21, [%[b_ptr0], #0x30]\n"
+                    "ldr q22, [%[b_ptr0], #0x40]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+                    "ldr q24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
-                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
-                    ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                     ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
-                    "ldr q19, [%[b_ptr0], #0x10]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
                     ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
-                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
                     ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
                     ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
                     ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
                     ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
                     ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
-                    "ldr q21, [%[b_ptr0], #0x30]\n"
                     ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
                     ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
                     ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
                     ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
                     ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
                     ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
-                    "ldr q22, [%[b_ptr0], #0x40]\n"
                     ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
                     ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
                     ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
                     ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
                     ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
                     ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
-                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
                     ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
                     ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
                     ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
                     ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
                     ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
-                    "ldr q24, [%[b_ptr0], #0x60]\n"
                     ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
                     ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
                     ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
                     ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
                     ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
                     ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
-                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
                     ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
                     ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
                     ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
                     ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
                     ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q26, [%[c_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                    "movi v26.4s, #0\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
+                    "ldr q21, [%[b_ptr0], #0x30]\n"
+                    "ldr q22, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+                    "ldr q24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
+                    ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
+                    ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
+                    ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
+                    ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
+                    ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+                    ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+                    ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
+                    ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
+                    ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
+                    ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
+                    ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
+                    ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
+                    ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
+                    ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
+                    ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
+                    ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
+                    ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
+                    ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
+                    ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
+                    ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
+                    ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
+                    ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
+                    ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
+                    ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
+                    ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
+                    ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
+                    ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
+                    ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+                    ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
+                    ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
+                    ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
+                    ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
+                    ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
+                    ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
+                    ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
+                    ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
+                    "b 9f\n"
+                    "6:\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
+                    ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+                    ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
                     ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
-                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
                     "ldr q18, [%[b_ptr0]]\n"
-                    ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
+                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     "add %[b_ptr0], %[b_ptr0], #0x10\n"
+                    ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
                     ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
                     ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
                     ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
@@ -397,19 +473,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
                     ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
                     ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
-                    "6:\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -468,6 +539,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q3, [a_ptr1], #0x10\n"
                     "ldr q6, [a_ptr2], #0x10\n"
@@ -478,24 +550,35 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     "ldr q4, [a_ptr1], #0x10\n"
                     "ldr q7, [a_ptr2], #0x10\n"
                     "ldr q10, [a_ptr3], #0x10\n"
-                    "ldr q13, [a_ptr4], #0x10\n"
-                    "ldr q16, [a_ptr5], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr d2, [%[a_ptr0]]\n"
+                    "ldr q13, [a_ptr4], #0x10\n"
                     "ldr d5, [a_ptr1]\n"
+                    "ldr q16, [a_ptr5], #0x10\n"
                     "ldr d8, [a_ptr2]\n"
                     "ldr d11, [a_ptr3]\n"
                     "ldr d14, [a_ptr4]\n"
                     "ldr d17, [a_ptr5]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "ldr q3, [a_ptr1], #0x10\n"
+                    "ldr q6, [a_ptr2], #0x10\n"
+                    "ldr q9, [a_ptr3], #0x10\n"
+                    "ldr q12, [a_ptr4], #0x10\n"
+                    "ldr q15, [a_ptr5], #0x10\n"
+                    "ldr q1, [%[a_ptr0]], #0x10\n"
+                    "ldr q4, [a_ptr1], #0x10\n"
+                    "ldr q7, [a_ptr2], #0x10\n"
+                    "ldr q10, [a_ptr3], #0x10\n"
                     "ldr s2, [%[a_ptr0]], #0x4\n"
+                    "ldr q13, [a_ptr4], #0x10\n"
                     "ldr s5, [a_ptr1], #0x4\n"
+                    "ldr q16, [a_ptr5], #0x10\n"
                     "ldr s8, [a_ptr2], #0x4\n"
                     "ldr s11, [a_ptr3], #0x4\n"
                     "ldr s14, [a_ptr4], #0x4\n"
                     "ldr s17, [a_ptr5], #0x4\n"
-                    "subs %[odds], %[odds], #0x1\n"
                     "b.ne 4f\n"
                     "ld1 {v2.b}[4], [%[a_ptr0]]\n"
                     "ld1 {v5.b}[4], [a_ptr1]\n"
@@ -522,38 +605,40 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     "ld1 {v14.b}[6], [a_ptr4]\n"
                     "ld1 {v17.b}[6], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q18, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "ldr q20, [%[b_ptr0], #0x20]\n"
-                    "movi v29.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "ldr q21, [%[b_ptr0], #0x30]\n"
-                    "movi v30.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     "ldr q22, [%[b_ptr0], #0x40]\n"
-                    "movi v31.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     "ldr q23, [%[b_ptr0], #0x50]\n"
-                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     "ldr q24, [%[b_ptr0], #0x60]\n"
-                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     "ldr q25, [%[b_ptr0], #0x70]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
                     ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
@@ -606,144 +691,213 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
                     ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
                     ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q18, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr q19, [%[b_ptr0], #0x10]\n"
-                    "ldr q20, [%[b_ptr0], #0x20]\n"
-                    "ldr q21, [%[b_ptr0], #0x30]\n"
-                    "ldr q22, [%[b_ptr0], #0x40]\n"
-                    "ldr q23, [%[b_ptr0], #0x50]\n"
-                    "ldr q24, [%[b_ptr0], #0x60]\n"
-                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q26, [%[c_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                    "movi v26.4s, #0\n"
                     "subs %[loops], %[loops], #0x1\n"
-                    "str q27, [c_ptr1]\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v27.4s, #0\n"
+                    "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v27.4s, #0\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ldr q21, [%[b_ptr0], #0x30]\n"
+                    "ldr q22, [%[b_ptr0], #0x40]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+                    "ldr q24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
-                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
-                    ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
                     ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
                     "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
-                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
                     ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
                     ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
                     ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
                     ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
                     ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
-                    "ldr q21, [%[b_ptr0], #0x30]\n"
                     ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
                     ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
                     ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
                     ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
                     ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
                     ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
-                    "ldr q22, [%[b_ptr0], #0x40]\n"
                     ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
                     ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
                     ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
                     ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
                     ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
                     ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
-                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
                     ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
                     ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
                     ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
                     ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
                     ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
-                    "ldr q24, [%[b_ptr0], #0x60]\n"
                     ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
                     ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
                     ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
                     ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
                     ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
                     ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
-                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
                     ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
                     ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
                     ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
                     ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
                     ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
                     ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
                     ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
                     ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
                     ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
                     ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
-                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q26, [%[c_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                    "movi v26.4s, #0\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
+                    "ldr q21, [%[b_ptr0], #0x30]\n"
+                    "ldr q22, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+                    "ldr q24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
+                    ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
+                    ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
+                    ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+                    ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+                    ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
+                    ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
+                    ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
+                    ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
+                    ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
+                    ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
+                    ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
+                    ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
+                    ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
+                    ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
+                    ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
+                    ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
+                    ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
+                    ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
+                    ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
+                    ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
+                    ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
+                    ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
+                    ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
+                    ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
+                    ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
+                    ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+                    ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
+                    ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
+                    ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
+                    ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
+                    ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
+                    ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
+                    ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
+                    ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
+                    ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
+                    ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
+                    ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
+                    ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
+                    ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
+                    ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
+                    "b 9f\n"
+                    "6:\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
+                    ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+                    ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
                     ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
-                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
                     "ldr q18, [%[b_ptr0]]\n"
+                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
                     ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
                     ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
@@ -799,19 +953,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
                     ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
                     ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
-                    "6:\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -924,38 +1073,40 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     "ld1 {v14.b}[10], [a_ptr4]\n"
                     "ld1 {v17.b}[10], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q18, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "ldr q20, [%[b_ptr0], #0x20]\n"
-                    "movi v29.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "ldr q21, [%[b_ptr0], #0x30]\n"
-                    "movi v30.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     "ldr q22, [%[b_ptr0], #0x40]\n"
-                    "movi v31.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     "ldr q23, [%[b_ptr0], #0x50]\n"
-                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     "ldr q24, [%[b_ptr0], #0x60]\n"
-                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     "ldr q25, [%[b_ptr0], #0x70]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
                     ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
@@ -1015,62 +1166,60 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
                     ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
                     ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q18, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr q19, [%[b_ptr0], #0x10]\n"
-                    "ldr q20, [%[b_ptr0], #0x20]\n"
-                    "ldr q21, [%[b_ptr0], #0x30]\n"
-                    "ldr q22, [%[b_ptr0], #0x40]\n"
-                    "ldr q23, [%[b_ptr0], #0x50]\n"
-                    "ldr q24, [%[b_ptr0], #0x60]\n"
-                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q26, [%[c_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                    "movi v26.4s, #0\n"
                     "subs %[loops], %[loops], #0x1\n"
-                    "str q27, [c_ptr1]\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v27.4s, #0\n"
+                    "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v27.4s, #0\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ldr q21, [%[b_ptr0], #0x30]\n"
+                    "ldr q22, [%[b_ptr0], #0x40]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+                    "ldr q24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
-                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
-                    ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
                     ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
                     ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
                     "ldr q20, [%[b_ptr0], #0x20]\n"
@@ -1081,85 +1230,163 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
                     ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
                     ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
-                    "ldr q21, [%[b_ptr0], #0x30]\n"
                     ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
                     ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
                     ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
                     ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
                     ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
                     ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
-                    "ldr q22, [%[b_ptr0], #0x40]\n"
                     ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
                     ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
                     ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
                     ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
                     ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
                     ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
-                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
                     ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
                     ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
                     ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
                     ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
                     ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
-                    "ldr q24, [%[b_ptr0], #0x60]\n"
                     ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
                     ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
                     ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
                     ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
                     ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
                     ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
-                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
                     ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
                     ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
                     ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
                     ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
                     ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
                     ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
                     ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
                     ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
                     ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
                     ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
-                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
                     ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
                     ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
                     ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
                     ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
                     ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
-                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q26, [%[c_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                    "movi v26.4s, #0\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
+                    "ldr q21, [%[b_ptr0], #0x30]\n"
+                    "ldr q22, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+                    "ldr q24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
+                    ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
+                    ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
+                    ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
+                    ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+                    ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+                    ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
+                    ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
+                    ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
+                    ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
+                    ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
+                    ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
+                    ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
+                    ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
+                    ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
+                    ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
+                    ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
+                    ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
+                    ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
+                    ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
+                    ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
+                    ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
+                    ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
+                    ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
+                    ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
+                    ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
+                    ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
+                    ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+                    ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
+                    ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
+                    ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
+                    ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
+                    ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
+                    ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
+                    ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
+                    ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
+                    ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
+                    ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
+                    ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
+                    ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
+                    ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
+                    ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
+                    ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
+                    ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
+                    ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
+                    ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
+                    ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
+                    ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
+                    "b 9f\n"
+                    "6:\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
+                    ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+                    ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
                     ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
-                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
                     "ldr q18, [%[b_ptr0]]\n"
+                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
                     ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
                     ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
@@ -1222,19 +1449,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
                     ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
                     ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
-                    "6:\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -1293,6 +1515,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q3, [a_ptr1], #0x10\n"
                     "ldr q6, [a_ptr2], #0x10\n"
@@ -1305,7 +1528,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     "ldr q10, [a_ptr3], #0x10\n"
                     "ldr q13, [a_ptr4], #0x10\n"
                     "ldr q16, [a_ptr5], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr q2, [%[a_ptr0]]\n"
                     "ldr q5, [a_ptr1]\n"
                     "ldr q8, [a_ptr2]\n"
@@ -1314,8 +1536,21 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     "ldr q17, [a_ptr5]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "ldr q3, [a_ptr1], #0x10\n"
+                    "ldr q6, [a_ptr2], #0x10\n"
+                    "ldr q9, [a_ptr3], #0x10\n"
+                    "ldr q12, [a_ptr4], #0x10\n"
+                    "ldr q15, [a_ptr5], #0x10\n"
+                    "ldr q1, [%[a_ptr0]], #0x10\n"
+                    "ldr q4, [a_ptr1], #0x10\n"
+                    "ldr q7, [a_ptr2], #0x10\n"
+                    "ldr q10, [a_ptr3], #0x10\n"
                     "ldr d2, [%[a_ptr0]], #0x8\n"
+                    "ldr q13, [a_ptr4], #0x10\n"
                     "ldr d5, [a_ptr1], #0x8\n"
+                    "ldr q16, [a_ptr5], #0x10\n"
                     "ldr d8, [a_ptr2], #0x8\n"
                     "ldr d11, [a_ptr3], #0x8\n"
                     "ldr d14, [a_ptr4], #0x8\n"
@@ -1326,7 +1561,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     "ld1 {v11.s}[2], [a_ptr3], #4\n"
                     "ld1 {v14.s}[2], [a_ptr4], #4\n"
                     "ld1 {v17.s}[2], [a_ptr5], #4\n"
-                    "subs %[odds], %[odds], #0x1\n"
                     "b.ne 4f\n"
                     "ld1 {v2.b}[12], [%[a_ptr0]]\n"
                     "ld1 {v5.b}[12], [a_ptr1]\n"
@@ -1353,38 +1587,40 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     "ld1 {v14.b}[14], [a_ptr4]\n"
                     "ld1 {v17.b}[14], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q18, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "ldr q20, [%[b_ptr0], #0x20]\n"
-                    "movi v29.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "ldr q21, [%[b_ptr0], #0x30]\n"
-                    "movi v30.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     "ldr q22, [%[b_ptr0], #0x40]\n"
-                    "movi v31.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     "ldr q23, [%[b_ptr0], #0x50]\n"
-                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     "ldr q24, [%[b_ptr0], #0x60]\n"
-                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     "ldr q25, [%[b_ptr0], #0x70]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
                     ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
@@ -1451,62 +1687,60 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
                     ".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
                     ".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q18, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr q19, [%[b_ptr0], #0x10]\n"
-                    "ldr q20, [%[b_ptr0], #0x20]\n"
-                    "ldr q21, [%[b_ptr0], #0x30]\n"
-                    "ldr q22, [%[b_ptr0], #0x40]\n"
-                    "ldr q23, [%[b_ptr0], #0x50]\n"
-                    "ldr q24, [%[b_ptr0], #0x60]\n"
-                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q26, [%[c_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                    "movi v26.4s, #0\n"
                     "subs %[loops], %[loops], #0x1\n"
-                    "str q27, [c_ptr1]\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v27.4s, #0\n"
+                    "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v27.4s, #0\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ldr q21, [%[b_ptr0], #0x30]\n"
+                    "ldr q22, [%[b_ptr0], #0x40]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+                    "ldr q24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
-                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
-                    ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
                     "ldr q19, [%[b_ptr0], #0x10]\n"
                     ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
                     ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
                     "ldr q20, [%[b_ptr0], #0x20]\n"
@@ -1524,85 +1758,170 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
                     ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
                     ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
-                    "ldr q22, [%[b_ptr0], #0x40]\n"
                     ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
                     ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
                     ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
                     ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
                     ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
                     ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
-                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
                     ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
                     ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
                     ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
                     ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
                     ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
-                    "ldr q24, [%[b_ptr0], #0x60]\n"
                     ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
                     ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
                     ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
                     ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
                     ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
                     ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
-                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
                     ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
                     ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
                     ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
                     ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
                     ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
-                    "ldr q18, [%[b_ptr0]]\n"
                     ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
                     ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
                     ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
                     ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
                     ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
                     ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
-                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
                     ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
                     ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
                     ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
                     ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
                     ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
-                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6fa2eaba // udot v26.4s, v21.16b, v2.4b[3]\n"
                     ".inst 0x6fa5eabb // udot v27.4s, v21.16b, v5.4b[3]\n"
                     ".inst 0x6fa8eabc // udot v28.4s, v21.16b, v8.4b[3]\n"
                     ".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
                     ".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
                     ".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
-                    "ldr q21, [%[b_ptr0], #0x30]\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q26, [%[c_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                    "movi v26.4s, #0\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
+                    "ldr q21, [%[b_ptr0], #0x30]\n"
+                    "ldr q22, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+                    "ldr q24, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "ldr q25, [%[b_ptr0], #0x70]\n"
                     ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
+                    ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
+                    "ldr q18, [%[b_ptr0]]\n"
+                    ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
+                    ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
+                    ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
+                    "ldr q19, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
+                    ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+                    ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+                    ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
+                    ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
+                    ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
+                    "ldr q20, [%[b_ptr0], #0x20]\n"
+                    ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
+                    ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
+                    ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
+                    ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
+                    ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
+                    ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
+                    "ldr q21, [%[b_ptr0], #0x30]\n"
+                    ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
+                    ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
+                    ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
+                    ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
+                    ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
+                    ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
+                    ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
+                    ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
+                    ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
+                    ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
+                    ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
+                    ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+                    ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
+                    ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
+                    ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
+                    ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
+                    ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
+                    ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
+                    ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
+                    ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
+                    ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
+                    ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
+                    ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
+                    ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
+                    ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
+                    ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
+                    ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
+                    ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
+                    ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
+                    ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
+                    ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
+                    ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
+                    ".inst 0x6fa2eaba // udot v26.4s, v21.16b, v2.4b[3]\n"
+                    ".inst 0x6fa5eabb // udot v27.4s, v21.16b, v5.4b[3]\n"
+                    ".inst 0x6fa8eabc // udot v28.4s, v21.16b, v8.4b[3]\n"
+                    ".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
+                    ".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
+                    ".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
+                    "b 9f\n"
+                    "6:\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+                    ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
+                    ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+                    ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
                     ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
-                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
                     "ldr q18, [%[b_ptr0]]\n"
+                    ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
                     ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
                     ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
                     ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
@@ -1672,19 +1991,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
                     ".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
                     ".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
-                    "6:\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -1743,6 +2057,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q4, [a_ptr1], #0x10\n"
                     "ldr q8, [a_ptr2], #0x10\n"
@@ -1759,18 +2074,35 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     "ldr q6, [a_ptr1], #0x10\n"
                     "ldr q10, [a_ptr2], #0x10\n"
                     "ldr q14, [a_ptr3], #0x10\n"
-                    "ldr q18, [a_ptr4], #0x10\n"
-                    "ldr q22, [a_ptr5], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr s3, [%[a_ptr0]]\n"
+                    "ldr q18, [a_ptr4], #0x10\n"
                     "ldr s7, [a_ptr1]\n"
+                    "ldr q22, [a_ptr5], #0x10\n"
                     "ldr s11, [a_ptr2]\n"
                     "ldr s15, [a_ptr3]\n"
                     "ldr s19, [a_ptr4]\n"
                     "ldr s23, [a_ptr5]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
                     "subs %[odds], %[odds], #0x1\n"
+                    "ldr q4, [a_ptr1], #0x10\n"
+                    "ldr q8, [a_ptr2], #0x10\n"
+                    "ldr q12, [a_ptr3], #0x10\n"
+                    "ldr q16, [a_ptr4], #0x10\n"
+                    "ldr q20, [a_ptr5], #0x10\n"
+                    "ldr q1, [%[a_ptr0]], #0x10\n"
+                    "ldr q5, [a_ptr1], #0x10\n"
+                    "ldr q9, [a_ptr2], #0x10\n"
+                    "ldr q13, [a_ptr3], #0x10\n"
+                    "ldr q17, [a_ptr4], #0x10\n"
+                    "ldr q21, [a_ptr5], #0x10\n"
+                    "ldr q2, [%[a_ptr0]], #0x10\n"
+                    "ldr q6, [a_ptr1], #0x10\n"
+                    "ldr q10, [a_ptr2], #0x10\n"
+                    "ldr q14, [a_ptr3], #0x10\n"
+                    "ldr q18, [a_ptr4], #0x10\n"
+                    "ldr q22, [a_ptr5], #0x10\n"
                     "b.ne 4f\n"
                     "ldr b3, [%[a_ptr0]]\n"
                     "ldr b7, [a_ptr1]\n"
@@ -1797,24 +2129,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     "ld1 {v19.b}[2], [a_ptr4]\n"
                     "ld1 {v23.b}[2], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q24, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q25, [%[b_ptr0], #0x10]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
                     "movi v28.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
                     ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
                     ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
@@ -1907,38 +2241,36 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
                     ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
                     ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q24, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr q25, [%[b_ptr0], #0x10]\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q26, [%[c_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    "movi v26.4s, #0\n"
                     "subs %[loops], %[loops], #0x1\n"
-                    "str q27, [c_ptr1]\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v27.4s, #0\n"
+                    "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v27.4s, #0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
                     ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
                     "ldr q24, [%[b_ptr0]]\n"
                     ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
@@ -2028,20 +2360,20 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
                     ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
                     ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
-                    "ldr q25, [%[b_ptr0], #0x10]\n"
                     ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
                     ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
                     ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
                     ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
                     ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
                     ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
-                    "ldr q24, [%[b_ptr0]]\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q26, [%[c_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    "movi v26.4s, #0\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
@@ -2152,19 +2484,117 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
                     ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
                     ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+                    ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+                    ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+                    ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+                    ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+                    ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
+                    ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
+                    ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
+                    ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
+                    ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+                    ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
+                    ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
+                    ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
+                    ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
+                    ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
+                    ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
+                    ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
+                    ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
+                    ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
+                    ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
+                    ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
+                    ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
+                    ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
+                    ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
+                    ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
+                    ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
+                    ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
+                    ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
+                    ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
+                    ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
+                    ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
+                    ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
+                    ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
+                    ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
+                    ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
+                    ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
+                    ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
+                    ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
+                    ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
+                    ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
+                    ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
+                    ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
+                    ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
+                    ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
+                    ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
+                    ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
+                    ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
+                    ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
+                    ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
+                    ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
+                    ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
+                    ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
+                    ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -2223,6 +2653,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q4, [a_ptr1], #0x10\n"
                     "ldr q8, [a_ptr2], #0x10\n"
@@ -2239,24 +2670,41 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     "ldr q6, [a_ptr1], #0x10\n"
                     "ldr q10, [a_ptr2], #0x10\n"
                     "ldr q14, [a_ptr3], #0x10\n"
-                    "ldr q18, [a_ptr4], #0x10\n"
-                    "ldr q22, [a_ptr5], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr d3, [%[a_ptr0]]\n"
+                    "ldr q18, [a_ptr4], #0x10\n"
                     "ldr d7, [a_ptr1]\n"
+                    "ldr q22, [a_ptr5], #0x10\n"
                     "ldr d11, [a_ptr2]\n"
                     "ldr d15, [a_ptr3]\n"
                     "ldr d19, [a_ptr4]\n"
                     "ldr d23, [a_ptr5]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "ldr q4, [a_ptr1], #0x10\n"
+                    "ldr q8, [a_ptr2], #0x10\n"
+                    "ldr q12, [a_ptr3], #0x10\n"
+                    "ldr q16, [a_ptr4], #0x10\n"
+                    "ldr q20, [a_ptr5], #0x10\n"
+                    "ldr q1, [%[a_ptr0]], #0x10\n"
+                    "ldr q5, [a_ptr1], #0x10\n"
+                    "ldr q9, [a_ptr2], #0x10\n"
+                    "ldr q13, [a_ptr3], #0x10\n"
+                    "ldr q17, [a_ptr4], #0x10\n"
+                    "ldr q21, [a_ptr5], #0x10\n"
+                    "ldr q2, [%[a_ptr0]], #0x10\n"
+                    "ldr q6, [a_ptr1], #0x10\n"
+                    "ldr q10, [a_ptr2], #0x10\n"
+                    "ldr q14, [a_ptr3], #0x10\n"
                     "ldr s3, [%[a_ptr0]], #0x4\n"
+                    "ldr q18, [a_ptr4], #0x10\n"
                     "ldr s7, [a_ptr1], #0x4\n"
+                    "ldr q22, [a_ptr5], #0x10\n"
                     "ldr s11, [a_ptr2], #0x4\n"
                     "ldr s15, [a_ptr3], #0x4\n"
                     "ldr s19, [a_ptr4], #0x4\n"
                     "ldr s23, [a_ptr5], #0x4\n"
-                    "subs %[odds], %[odds], #0x1\n"
                     "b.ne 4f\n"
                     "ld1 {v3.b}[4], [%[a_ptr0]]\n"
                     "ld1 {v7.b}[4], [a_ptr1]\n"
@@ -2283,24 +2731,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     "ld1 {v19.b}[6], [a_ptr4]\n"
                     "ld1 {v23.b}[6], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q24, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q25, [%[b_ptr0], #0x10]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
                     "movi v28.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
                     ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
                     ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
@@ -2400,38 +2850,36 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
                     ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
                     ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q24, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v26.4s, #0\n"
+                    "ldr q24, [%[b_ptr0]]\n"
                     "ldr q25, [%[b_ptr0], #0x10]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    "str q27, [c_ptr1]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v27.4s, #0\n"
+                    "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v27.4s, #0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
                     ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
                     "ldr q24, [%[b_ptr0]]\n"
                     ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
@@ -2528,7 +2976,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
                     ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
                     ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
-                    "ldr q24, [%[b_ptr0]]\n"
                     ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
                     ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
                     ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
@@ -2540,6 +2987,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v26.4s, #0\n"
+                    "ldr q24, [%[b_ptr0]]\n"
                     "ldr q25, [%[b_ptr0], #0x10]\n"
                     "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q27, [c_ptr1]\n"
@@ -2659,19 +3107,124 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
                     ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
                     ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+                    ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+                    ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+                    ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+                    ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+                    ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
+                    ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
+                    ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
+                    ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
+                    ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+                    ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
+                    ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
+                    ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
+                    ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
+                    ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
+                    ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
+                    ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
+                    ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
+                    ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
+                    ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
+                    ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
+                    ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
+                    ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
+                    ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
+                    ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
+                    ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
+                    ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
+                    ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
+                    ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
+                    ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
+                    ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
+                    ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
+                    ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
+                    ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
+                    ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
+                    ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
+                    ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
+                    ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
+                    ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
+                    ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
+                    ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
+                    ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
+                    ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
+                    ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
+                    ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
+                    ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
+                    ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
+                    ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
+                    ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
+                    ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
+                    ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+                    ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
+                    ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
+                    ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
+                    ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
+                    ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
+                    ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -2790,24 +3343,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     "ld1 {v19.b}[10], [a_ptr4]\n"
                     "ld1 {v23.b}[10], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q24, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q25, [%[b_ptr0], #0x10]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
                     "movi v28.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
                     ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
                     ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
@@ -2915,38 +3470,36 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
                     ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
                     ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q24, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr q25, [%[b_ptr0], #0x10]\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q26, [%[c_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    "movi v26.4s, #0\n"
                     "subs %[loops], %[loops], #0x1\n"
-                    "str q27, [c_ptr1]\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v27.4s, #0\n"
+                    "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v27.4s, #0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
                     ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
                     "ldr q24, [%[b_ptr0]]\n"
                     ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
@@ -3051,20 +3604,20 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
                     ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
                     ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
-                    "ldr q25, [%[b_ptr0], #0x10]\n"
                     ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
                     ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
                     ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
                     ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
                     ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
                     ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
-                    "ldr q24, [%[b_ptr0]]\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q26, [%[c_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    "movi v26.4s, #0\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v27.4s, #0\n"
@@ -3190,19 +3743,132 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
                     ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
                     ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+                    ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+                    ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+                    ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+                    ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+                    ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
+                    ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
+                    ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
+                    ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
+                    ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+                    ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
+                    ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
+                    ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
+                    ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
+                    ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
+                    ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
+                    ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
+                    ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
+                    ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
+                    ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
+                    ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
+                    ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
+                    ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
+                    ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
+                    ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
+                    ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
+                    ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
+                    ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
+                    ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
+                    ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
+                    ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
+                    ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
+                    ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
+                    ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
+                    ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
+                    ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
+                    ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
+                    ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
+                    ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
+                    ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
+                    ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
+                    ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
+                    ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
+                    ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
+                    ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
+                    ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
+                    ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
+                    ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
+                    ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
+                    ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
+                    ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
+                    ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
+                    ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
+                    ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
+                    ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
+                    ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
+                    ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
+                    ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
+                    ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
+                    ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
+                    ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
+                    ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -3262,6 +3928,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q4, [a_ptr1], #0x10\n"
                     "ldr q8, [a_ptr2], #0x10\n"
@@ -3280,7 +3947,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     "ldr q14, [a_ptr3], #0x10\n"
                     "ldr q18, [a_ptr4], #0x10\n"
                     "ldr q22, [a_ptr5], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr q3, [%[a_ptr0]]\n"
                     "ldr q7, [a_ptr1]\n"
                     "ldr q11, [a_ptr2]\n"
@@ -3289,8 +3955,27 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     "ldr q23, [a_ptr5]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "ldr q4, [a_ptr1], #0x10\n"
+                    "ldr q8, [a_ptr2], #0x10\n"
+                    "ldr q12, [a_ptr3], #0x10\n"
+                    "ldr q16, [a_ptr4], #0x10\n"
+                    "ldr q20, [a_ptr5], #0x10\n"
+                    "ldr q1, [%[a_ptr0]], #0x10\n"
+                    "ldr q5, [a_ptr1], #0x10\n"
+                    "ldr q9, [a_ptr2], #0x10\n"
+                    "ldr q13, [a_ptr3], #0x10\n"
+                    "ldr q17, [a_ptr4], #0x10\n"
+                    "ldr q21, [a_ptr5], #0x10\n"
+                    "ldr q2, [%[a_ptr0]], #0x10\n"
+                    "ldr q6, [a_ptr1], #0x10\n"
+                    "ldr q10, [a_ptr2], #0x10\n"
+                    "ldr q14, [a_ptr3], #0x10\n"
                     "ldr d3, [%[a_ptr0]], #0x8\n"
+                    "ldr q18, [a_ptr4], #0x10\n"
                     "ldr d7, [a_ptr1], #0x8\n"
+                    "ldr q22, [a_ptr5], #0x10\n"
                     "ldr d11, [a_ptr2], #0x8\n"
                     "ldr d15, [a_ptr3], #0x8\n"
                     "ldr d19, [a_ptr4], #0x8\n"
@@ -3301,7 +3986,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     "ld1 {v15.s}[2], [a_ptr3], #4\n"
                     "ld1 {v19.s}[2], [a_ptr4], #4\n"
                     "ld1 {v23.s}[2], [a_ptr5], #4\n"
-                    "subs %[odds], %[odds], #0x1\n"
                     "b.ne 4f\n"
                     "ld1 {v3.b}[12], [%[a_ptr0]]\n"
                     "ld1 {v7.b}[12], [a_ptr1]\n"
@@ -3328,24 +4012,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     "ld1 {v19.b}[14], [a_ptr4]\n"
                     "ld1 {v23.b}[14], [a_ptr5]\n"
                     "3:\n"
-                    "movi v26.4s, #0\n"
                     "ldr q24, [%[b_ptr0]]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "ldr q25, [%[b_ptr0], #0x10]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v26.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v27.4s, #0\n"
                     "movi v28.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                     ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                     ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                     ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
                     ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
                     ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
@@ -3460,38 +4146,36 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
                     ".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
                     ".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q24, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v26.4s, #0\n"
+                    "ldr q24, [%[b_ptr0]]\n"
                     "ldr q25, [%[b_ptr0], #0x10]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    "str q27, [c_ptr1]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v27.4s, #0\n"
+                    "str q27, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v27.4s, #0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
                     "str q28, [c_ptr2]\n"
                     "movi v28.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
                     "str q29, [c_ptr3]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
                     ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
                     "str q30, [c_ptr4]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
                     "str q31, [c_ptr5]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
                     "ldr q24, [%[b_ptr0]]\n"
                     ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
@@ -3603,7 +4287,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
                     ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
                     ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
-                    "ldr q24, [%[b_ptr0]]\n"
                     ".inst 0x6fa3eb3a // udot v26.4s, v25.16b, v3.4b[3]\n"
                     ".inst 0x6fa7eb3b // udot v27.4s, v25.16b, v7.4b[3]\n"
                     ".inst 0x6fabeb3c // udot v28.4s, v25.16b, v11.4b[3]\n"
@@ -3615,6 +4298,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v26.4s, #0\n"
+                    "ldr q24, [%[b_ptr0]]\n"
                     "ldr q25, [%[b_ptr0], #0x10]\n"
                     "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q27, [c_ptr1]\n"
@@ -3749,19 +4433,139 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
                     ".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
                     ".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+                    ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+                    ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+                    ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+                    ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+                    ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
+                    ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
+                    ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
+                    ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
+                    ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+                    ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+                    ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
+                    ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
+                    ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+                    ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
+                    ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+                    ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
+                    ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
+                    ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
+                    ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
+                    ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
+                    ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
+                    ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
+                    ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
+                    ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
+                    ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
+                    ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
+                    ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
+                    ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
+                    ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
+                    ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
+                    ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
+                    ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+                    ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
+                    ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
+                    ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
+                    ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
+                    ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
+                    ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
+                    ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
+                    ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
+                    ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
+                    ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
+                    ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
+                    ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
+                    ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
+                    ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
+                    ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
+                    ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
+                    ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
+                    ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
+                    ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
+                    ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
+                    ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
+                    ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
+                    ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
+                    ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
+                    ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
+                    ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
+                    ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+                    "ldr q24, [%[b_ptr0]]\n"
+                    ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
+                    ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
+                    ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
+                    ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
+                    ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
+                    ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
+                    "ldr q25, [%[b_ptr0], #0x10]\n"
+                    ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
+                    ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
+                    ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
+                    ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
+                    ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
+                    ".inst 0x6fa3eb3a // udot v26.4s, v25.16b, v3.4b[3]\n"
+                    ".inst 0x6fa7eb3b // udot v27.4s, v25.16b, v7.4b[3]\n"
+                    ".inst 0x6fabeb3c // udot v28.4s, v25.16b, v11.4b[3]\n"
+                    ".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
+                    ".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
+                    ".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
+                    "9:\n"
                     "str q26, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q27, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q28, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q29, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q30, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q31, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp
index d91416c3be..942f94b0bf 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp
@@ -31,10 +31,10 @@ namespace arm_gemm
 {
 
 // Actual kernel implementations
-void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
-void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+void a64_smallK_hybrid_u8u32_dot_8x4(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+void a64_smallK_hybrid_u8u32_dot_8x4_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
 
-class smallK_hybrid_u8u32_dot_4x8
+class cls_a64_smallK_hybrid_u8u32_dot_8x4
 {
 public:
     typedef uint8_t operand_type;
@@ -76,12 +76,12 @@ public:
     StdTransformsFixed<operand_type, result_type, 8, 4, 4> transforms = {};
 
     // Default to the generic kernel
-    kern_type kernel=a64_smallK_hybrid_u8u32_dot_4x8;
+    kern_type kernel=a64_smallK_hybrid_u8u32_dot_8x4;
 
-    smallK_hybrid_u8u32_dot_4x8(const CPUInfo *ci)
+    cls_a64_smallK_hybrid_u8u32_dot_8x4(const CPUInfo *ci)
     {
         if (ci->get_cpu_model() == CPUModel::A55r1) {
-            kernel = a64_smallK_hybrid_u8u32_dot_4x8_a55;
+            kernel = a64_smallK_hybrid_u8u32_dot_8x4_a55;
         }
     }
 };
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp
index e70fb6955e..fcb546f51e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,7 +33,7 @@
 
 namespace arm_gemm {
 
-void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) {
+void a64_smallK_hybrid_u8u32_dot_8x4_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) {
     const long loops_count = iceildiv(N, (int)4) - 1;
     const long ldab = lda * sizeof(uint8_t);
     const long ldcb = ldc * sizeof(uint32_t);
@@ -157,22 +157,24 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     "ld1 {v6.b}[2], [a_ptr6]\n"
                     "ld1 {v7.b}[2], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
                     "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
-                    "movi v26.4s, #0\n"
                     "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
-                    "movi v27.4s, #0\n"
                     "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
-                    "movi v28.4s, #0\n"
                     "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
-                    "movi v29.4s, #0\n"
                     "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
-                    "movi v30.4s, #0\n"
                     "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
-                    "movi v31.4s, #0\n"
                     "add %[b_ptr0], %[b_ptr0], #0x10\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
                     ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
@@ -181,55 +183,49 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
                     ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
                     ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
-                    "ins v16.d[1], temploadreg0\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
+                    "ldr d16, [%[b_ptr0]]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
-                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+                    "ins v16.d[1], temploadreg0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "movi v26.4s, #0\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+                    "movi v26.4s, #0\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q27, [c_ptr3]\n"
                     "movi v27.4s, #0\n"
                     "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+                    ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
                     "str q29, [c_ptr5]\n"
                     "movi v29.4s, #0\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
-                    ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
                     "add c_ptr6, c_ptr6, #0x10\n"
-                    ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+                    ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
-                    ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
                     "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
-                    ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
-                    "ldr d16, [%[b_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
+                    ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
                     "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
                     "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
-                    "ins v16.d[1], temploadreg0\n"
                     "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
@@ -239,6 +235,8 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
@@ -268,23 +266,34 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
                     ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+                    ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+                    ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+                    ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -423,24 +432,26 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     "ld1 {v6.b}[6], [a_ptr6]\n"
                     "ld1 {v7.b}[6], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
                     "movi v26.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "movi v27.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "movi v28.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
                     ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
                     ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
@@ -456,78 +467,72 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
                     ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
                     ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    "ldr d17, [%[b_ptr0], #0x10]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    "ins v16.d[1], temploadreg0\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
-                    "ins v17.d[1], temploadreg1\n"
+                    "ldr d16, [%[b_ptr0]]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr d17, [%[b_ptr0], #0x10]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q26, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     "movi v26.4s, #0\n"
+                    "ins v16.d[1], temploadreg0\n"
+                    "ins v17.d[1], temploadreg1\n"
                     "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
-                    ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
                     "str q27, [c_ptr3]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
                     "movi v27.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
+                    ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
                     "str q29, [c_ptr5]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
+                    ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
-                    ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+                    "add c_ptr6, c_ptr6, #0x10\n"
+                    ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
-                    ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
+                    ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
-                    "ldr d16, [%[b_ptr0]]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
-                    "ins v16.d[1], temploadreg0\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
-                    "ldr d17, [%[b_ptr0], #0x10]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
-                    "ins v17.d[1], temploadreg1\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
@@ -565,23 +570,42 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
                     ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
                     ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+                    ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+                    ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+                    ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+                    ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
+                    ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
+                    ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
+                    ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
+                    ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -720,26 +744,28 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     "ld1 {v6.b}[10], [a_ptr6]\n"
                     "ld1 {v7.b}[10], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "movi v26.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ldr q18, [%[b_ptr0], #0x20]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
                     "movi v27.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "movi v28.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
                     ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
                     ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
                     ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
@@ -762,95 +788,86 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
                     ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
                     ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    "ldr d17, [%[b_ptr0], #0x10]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    "ldr d18, [%[b_ptr0], #0x20]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
-                    "ins v16.d[1], temploadreg0\n"
-                    "ins v17.d[1], temploadreg1\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
-                    "ins v18.d[1], temploadreg2\n"
+                    "ldr d16, [%[b_ptr0]]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr d17, [%[b_ptr0], #0x10]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "str q26, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     "movi v26.4s, #0\n"
+                    "ldr d18, [%[b_ptr0], #0x20]\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
                     "str q27, [c_ptr3]\n"
-                    "movi v27.4s, #0\n"
                     "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+                    "movi v27.4s, #0\n"
+                    "ins v16.d[1], temploadreg0\n"
+                    "ins v17.d[1], temploadreg1\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "str q28, [c_ptr4]\n"
-                    "movi v28.4s, #0\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+                    "movi v28.4s, #0\n"
+                    "ins v18.d[1], temploadreg2\n"
+                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q29, [c_ptr5]\n"
                     "movi v29.4s, #0\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
-                    ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
                     "add c_ptr6, c_ptr6, #0x10\n"
-                    ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+                    ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
+                    ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
-                    "ldr d16, [%[b_ptr0]]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
-                    "ins v16.d[1], temploadreg0\n"
                     ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
-                    "ldr d17, [%[b_ptr0], #0x10]\n"
                     ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
-                    "ins v17.d[1], temploadreg1\n"
                     ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
                     ".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n"
                     ".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n"
                     ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
                     ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
                     ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
-                    "ldr d18, [%[b_ptr0], #0x20]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
-                    "ins v18.d[1], temploadreg2\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
@@ -876,8 +893,9 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     "movi v31.4s, #0\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
-                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
                     ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
                     ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
                     ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
                     ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
@@ -893,23 +911,50 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
                     ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
                     ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+                    ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+                    ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+                    ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+                    ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
+                    ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
+                    ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
+                    ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
+                    ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
+                    ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+                    ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
+                    ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
+                    ".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n"
+                    ".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n"
+                    ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
+                    ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
+                    ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -1056,28 +1101,30 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     "ld1 {v6.b}[14], [a_ptr6]\n"
                     "ld1 {v7.b}[14], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "movi v26.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
                     "movi v28.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
                     ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
                     ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
                     ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
@@ -1107,112 +1154,101 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n"
                     ".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n"
                     ".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    "ldr d17, [%[b_ptr0], #0x10]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    "ldr d18, [%[b_ptr0], #0x20]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                    "ldr d19, [%[b_ptr0], #0x30]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                    "ins v16.d[1], temploadreg0\n"
-                    "ins v17.d[1], temploadreg1\n"
-                    "ins v18.d[1], temploadreg2\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
-                    "ins v19.d[1], temploadreg3\n"
+                    "ldr d16, [%[b_ptr0]]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr d17, [%[b_ptr0], #0x10]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "str q26, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     "movi v26.4s, #0\n"
+                    "ldr d18, [%[b_ptr0], #0x20]\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "str q27, [c_ptr3]\n"
-                    "movi v27.4s, #0\n"
                     "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+                    "movi v27.4s, #0\n"
+                    "ldr d19, [%[b_ptr0], #0x30]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
                     "str q28, [c_ptr4]\n"
-                    "movi v28.4s, #0\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+                    "movi v28.4s, #0\n"
+                    "ins v16.d[1], temploadreg0\n"
+                    "ins v17.d[1], temploadreg1\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "str q29, [c_ptr5]\n"
-                    "movi v29.4s, #0\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
-                    ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+                    "movi v29.4s, #0\n"
+                    "ins v18.d[1], temploadreg2\n"
+                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
+                    "ins v19.d[1], temploadreg3\n"
+                    ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
                     "add c_ptr6, c_ptr6, #0x10\n"
-                    ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+                    ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
+                    ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
-                    "ldr d16, [%[b_ptr0]]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
-                    "ins v16.d[1], temploadreg0\n"
                     ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
-                    "ldr d17, [%[b_ptr0], #0x10]\n"
                     ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
-                    "ins v17.d[1], temploadreg1\n"
                     ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
                     ".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n"
                     ".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n"
                     ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
                     ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
                     ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
-                    "ldr d18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
                     ".inst 0x6fa1ea79 // udot v25.4s, v19.16b, v1.4b[3]\n"
                     ".inst 0x6fa2ea7a // udot v26.4s, v19.16b, v2.4b[3]\n"
-                    "ins v18.d[1], temploadreg2\n"
                     ".inst 0x6fa3ea7b // udot v27.4s, v19.16b, v3.4b[3]\n"
                     ".inst 0x6fa4ea7c // udot v28.4s, v19.16b, v4.4b[3]\n"
                     ".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n"
                     ".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n"
                     ".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n"
-                    "ldr d19, [%[b_ptr0], #0x30]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
-                    "ins v19.d[1], temploadreg3\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
                     "str q27, [c_ptr3]\n"
@@ -1235,8 +1271,9 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     "movi v31.4s, #0\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
-                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
                     ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
                     ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
                     ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
                     ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
@@ -1260,23 +1297,58 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n"
                     ".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n"
                     ".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+                    ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+                    ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+                    ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+                    ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
+                    ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
+                    ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
+                    ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
+                    ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
+                    ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+                    ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
+                    ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
+                    ".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n"
+                    ".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n"
+                    ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
+                    ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
+                    ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
+                    ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+                    ".inst 0x6fa1ea79 // udot v25.4s, v19.16b, v1.4b[3]\n"
+                    ".inst 0x6fa2ea7a // udot v26.4s, v19.16b, v2.4b[3]\n"
+                    ".inst 0x6fa3ea7b // udot v27.4s, v19.16b, v3.4b[3]\n"
+                    ".inst 0x6fa4ea7c // udot v28.4s, v19.16b, v4.4b[3]\n"
+                    ".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n"
+                    ".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n"
+                    ".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -1363,26 +1435,34 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q2, [a_ptr1], #0x10\n"
                     "ldr q4, [a_ptr2], #0x10\n"
                     "ldr q6, [a_ptr3], #0x10\n"
-                    "ldr q8, [a_ptr4], #0x10\n"
-                    "ldr q10, [a_ptr5], #0x10\n"
-                    "ldr q12, [a_ptr6], #0x10\n"
-                    "ldr q14, [a_ptr7], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr s1, [%[a_ptr0]]\n"
+                    "ldr q8, [a_ptr4], #0x10\n"
                     "ldr s3, [a_ptr1]\n"
+                    "ldr q10, [a_ptr5], #0x10\n"
                     "ldr s5, [a_ptr2]\n"
+                    "ldr q12, [a_ptr6], #0x10\n"
                     "ldr s7, [a_ptr3]\n"
+                    "ldr q14, [a_ptr7], #0x10\n"
                     "ldr s9, [a_ptr4]\n"
                     "ldr s11, [a_ptr5]\n"
                     "ldr s13, [a_ptr6]\n"
                     "ldr s15, [a_ptr7]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
                     "subs %[odds], %[odds], #0x1\n"
+                    "ldr q2, [a_ptr1], #0x10\n"
+                    "ldr q4, [a_ptr2], #0x10\n"
+                    "ldr q6, [a_ptr3], #0x10\n"
+                    "ldr q8, [a_ptr4], #0x10\n"
+                    "ldr q10, [a_ptr5], #0x10\n"
+                    "ldr q12, [a_ptr6], #0x10\n"
+                    "ldr q14, [a_ptr7], #0x10\n"
                     "b.ne 4f\n"
                     "ldr b1, [%[a_ptr0]]\n"
                     "ldr b3, [a_ptr1]\n"
@@ -1415,30 +1495,32 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     "ld1 {v13.b}[2], [a_ptr6]\n"
                     "ld1 {v15.b}[2], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "movi v26.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ldr q19, [%[b_ptr0], #0x30]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ldr q20, [%[b_ptr0], #0x40]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x50\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x50\n"
                     ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
                     ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
                     ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
@@ -1475,126 +1557,113 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
                     ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
                     ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    "ldr d17, [%[b_ptr0], #0x10]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    "ldr d18, [%[b_ptr0], #0x20]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                    "ldr d19, [%[b_ptr0], #0x30]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                    "ldr d20, [%[b_ptr0], #0x40]\n"
-                    "ins v16.d[1], temploadreg0\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x50\n"
-                    "ins v17.d[1], temploadreg1\n"
-                    "ins v18.d[1], temploadreg2\n"
-                    "ins v19.d[1], temploadreg3\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
-                    "ins v20.d[1], temploadreg0\n"
+                    "ldr d16, [%[b_ptr0]]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr d17, [%[b_ptr0], #0x10]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "str q26, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     "movi v26.4s, #0\n"
+                    "ldr d18, [%[b_ptr0], #0x20]\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "str q27, [c_ptr3]\n"
-                    "movi v27.4s, #0\n"
                     "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+                    "movi v27.4s, #0\n"
+                    "ldr d19, [%[b_ptr0], #0x30]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "str q28, [c_ptr4]\n"
-                    "movi v28.4s, #0\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+                    "movi v28.4s, #0\n"
+                    "ldr d20, [%[b_ptr0], #0x40]\n"
+                    "ins v16.d[1], temploadreg0\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "str q29, [c_ptr5]\n"
-                    "movi v29.4s, #0\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
-                    ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+                    "movi v29.4s, #0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
-                    ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+                    "ins v17.d[1], temploadreg1\n"
+                    ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+                    "ins v18.d[1], temploadreg2\n"
+                    ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
+                    "ins v19.d[1], temploadreg3\n"
+                    ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+                    "ins v20.d[1], temploadreg0\n"
+                    ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+                    "add c_ptr6, c_ptr6, #0x10\n"
+                    ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x50\n"
                     ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
-                    "ldr d16, [%[b_ptr0]]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
-                    "ins v16.d[1], temploadreg0\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
-                    "ldr d17, [%[b_ptr0], #0x10]\n"
                     ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
-                    "ins v17.d[1], temploadreg1\n"
                     ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
                     ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
                     ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
                     ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
                     ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
-                    "ldr d18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
                     ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
                     ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
-                    "ins v18.d[1], temploadreg2\n"
                     ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
                     ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
                     ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
                     ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
                     ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
-                    "ldr d19, [%[b_ptr0], #0x30]\n"
                     ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
                     ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
                     ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
-                    "ins v19.d[1], temploadreg3\n"
                     ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
                     ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
                     ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
                     ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
                     ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
-                    "ldr d20, [%[b_ptr0], #0x40]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x50\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
-                    "ins v20.d[1], temploadreg0\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "ldr q20, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
                     "str q27, [c_ptr3]\n"
@@ -1617,8 +1686,9 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     "movi v31.4s, #0\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
-                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x50\n"
                     ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
                     ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
                     ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
                     ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
@@ -1650,23 +1720,66 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
                     ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
                     ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+                    ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+                    ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
+                    ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+                    ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
+                    ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
+                    ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
+                    ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
+                    ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
+                    ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+                    ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
+                    ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
+                    ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
+                    ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
+                    ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
+                    ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
+                    ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
+                    ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+                    ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
+                    ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
+                    ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
+                    ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
+                    ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
+                    ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
+                    ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
+                    ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
+                    ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
+                    ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
+                    ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
+                    ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
+                    ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
+                    ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
+                    ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -1753,34 +1866,42 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q2, [a_ptr1], #0x10\n"
                     "ldr q4, [a_ptr2], #0x10\n"
                     "ldr q6, [a_ptr3], #0x10\n"
-                    "ldr q8, [a_ptr4], #0x10\n"
-                    "ldr q10, [a_ptr5], #0x10\n"
-                    "ldr q12, [a_ptr6], #0x10\n"
-                    "ldr q14, [a_ptr7], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr d1, [%[a_ptr0]]\n"
+                    "ldr q8, [a_ptr4], #0x10\n"
                     "ldr d3, [a_ptr1]\n"
+                    "ldr q10, [a_ptr5], #0x10\n"
                     "ldr d5, [a_ptr2]\n"
+                    "ldr q12, [a_ptr6], #0x10\n"
                     "ldr d7, [a_ptr3]\n"
+                    "ldr q14, [a_ptr7], #0x10\n"
                     "ldr d9, [a_ptr4]\n"
                     "ldr d11, [a_ptr5]\n"
                     "ldr d13, [a_ptr6]\n"
                     "ldr d15, [a_ptr7]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "ldr q2, [a_ptr1], #0x10\n"
+                    "ldr q4, [a_ptr2], #0x10\n"
                     "ldr s1, [%[a_ptr0]], #0x4\n"
+                    "ldr q6, [a_ptr3], #0x10\n"
                     "ldr s3, [a_ptr1], #0x4\n"
+                    "ldr q8, [a_ptr4], #0x10\n"
                     "ldr s5, [a_ptr2], #0x4\n"
+                    "ldr q10, [a_ptr5], #0x10\n"
                     "ldr s7, [a_ptr3], #0x4\n"
+                    "ldr q12, [a_ptr6], #0x10\n"
                     "ldr s9, [a_ptr4], #0x4\n"
+                    "ldr q14, [a_ptr7], #0x10\n"
                     "ldr s11, [a_ptr5], #0x4\n"
                     "ldr s13, [a_ptr6], #0x4\n"
                     "ldr s15, [a_ptr7], #0x4\n"
-                    "subs %[odds], %[odds], #0x1\n"
                     "b.ne 4f\n"
                     "ld1 {v1.b}[4], [%[a_ptr0]]\n"
                     "ld1 {v3.b}[4], [a_ptr1]\n"
@@ -1813,32 +1934,34 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     "ld1 {v13.b}[6], [a_ptr6]\n"
                     "ld1 {v15.b}[6], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "movi v26.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ldr q19, [%[b_ptr0], #0x30]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ldr q20, [%[b_ptr0], #0x40]\n"
-                    "movi v29.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ldr q21, [%[b_ptr0], #0x50]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x60\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x60\n"
                     ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
                     ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
                     ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
@@ -1882,146 +2005,132 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
                     ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
                     ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    "ldr d17, [%[b_ptr0], #0x10]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    "ldr d18, [%[b_ptr0], #0x20]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                    "ldr d19, [%[b_ptr0], #0x30]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                    "ldr d20, [%[b_ptr0], #0x40]\n"
-                    "ins v16.d[1], temploadreg0\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                    "ldr d21, [%[b_ptr0], #0x50]\n"
-                    "ins v17.d[1], temploadreg1\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x60\n"
-                    "ins v18.d[1], temploadreg2\n"
-                    "ins v19.d[1], temploadreg3\n"
-                    "ins v20.d[1], temploadreg0\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
-                    "ins v21.d[1], temploadreg1\n"
+                    "ldr d16, [%[b_ptr0]]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr d17, [%[b_ptr0], #0x10]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "str q26, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     "movi v26.4s, #0\n"
+                    "ldr d18, [%[b_ptr0], #0x20]\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "str q27, [c_ptr3]\n"
-                    "movi v27.4s, #0\n"
                     "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+                    "movi v27.4s, #0\n"
+                    "ldr d19, [%[b_ptr0], #0x30]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "str q28, [c_ptr4]\n"
-                    "movi v28.4s, #0\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+                    "movi v28.4s, #0\n"
+                    "ldr d20, [%[b_ptr0], #0x40]\n"
+                    "ins v16.d[1], temploadreg0\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "str q29, [c_ptr5]\n"
-                    "movi v29.4s, #0\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
-                    ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+                    "movi v29.4s, #0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
-                    ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+                    "ldr d21, [%[b_ptr0], #0x50]\n"
+                    ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+                    "ins v17.d[1], temploadreg1\n"
+                    ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+                    "ins v18.d[1], temploadreg2\n"
+                    ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+                    "ins v19.d[1], temploadreg3\n"
+                    ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+                    "ins v20.d[1], temploadreg0\n"
                     ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ins v21.d[1], temploadreg1\n"
                     ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
-                    "ldr d16, [%[b_ptr0]]\n"
+                    "add c_ptr6, c_ptr6, #0x10\n"
                     ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
-                    "ins v16.d[1], temploadreg0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x60\n"
                     ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
-                    "ldr d17, [%[b_ptr0], #0x10]\n"
                     ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
-                    "ins v17.d[1], temploadreg1\n"
                     ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
                     ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
                     ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
                     ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
                     ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
-                    "ldr d18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
                     ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
                     ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
-                    "ins v18.d[1], temploadreg2\n"
                     ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
                     ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
                     ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
                     ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
                     ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
-                    "ldr d19, [%[b_ptr0], #0x30]\n"
                     ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
                     ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
                     ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
-                    "ins v19.d[1], temploadreg3\n"
                     ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
                     ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
                     ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
                     ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
                     ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
-                    "ldr d20, [%[b_ptr0], #0x40]\n"
                     ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
                     ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
                     ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
-                    "ins v20.d[1], temploadreg0\n"
                     ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
                     ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
                     ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
                     ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
                     ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
-                    "ldr d21, [%[b_ptr0], #0x50]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x60\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
-                    "ins v21.d[1], temploadreg1\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "ldr q20, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
                     "str q27, [c_ptr3]\n"
                     "movi v27.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q21, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
@@ -2038,7 +2147,7 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     "movi v31.4s, #0\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
-                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x60\n"
                     ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
                     ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
                     ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
@@ -2079,23 +2188,74 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
                     ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
                     ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+                    ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+                    ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
+                    ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+                    ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
+                    ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
+                    ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
+                    ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
+                    ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
+                    ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+                    ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
+                    ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
+                    ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
+                    ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
+                    ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
+                    ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
+                    ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
+                    ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+                    ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
+                    ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
+                    ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
+                    ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
+                    ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
+                    ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
+                    ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
+                    ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
+                    ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
+                    ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
+                    ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
+                    ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
+                    ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
+                    ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
+                    ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+                    ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
+                    ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
+                    ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
+                    ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
+                    ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
+                    ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
+                    ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
+                    ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -2242,34 +2402,36 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     "ld1 {v13.b}[10], [a_ptr6]\n"
                     "ld1 {v15.b}[10], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "movi v26.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ldr q19, [%[b_ptr0], #0x30]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ldr q20, [%[b_ptr0], #0x40]\n"
-                    "movi v29.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ldr q21, [%[b_ptr0], #0x50]\n"
-                    "movi v30.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ldr q22, [%[b_ptr0], #0x60]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
                     ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
                     ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
                     ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
@@ -2320,178 +2482,162 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
                     ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
                     ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    "ldr d17, [%[b_ptr0], #0x10]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    "ldr d18, [%[b_ptr0], #0x20]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                    "ldr d19, [%[b_ptr0], #0x30]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                    "ldr d20, [%[b_ptr0], #0x40]\n"
-                    "ins v16.d[1], temploadreg0\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                    "ldr d21, [%[b_ptr0], #0x50]\n"
-                    "ins v17.d[1], temploadreg1\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                    "ldr d22, [%[b_ptr0], #0x60]\n"
-                    "ins v18.d[1], temploadreg2\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
-                    "ins v19.d[1], temploadreg3\n"
-                    "ins v20.d[1], temploadreg0\n"
-                    "ins v21.d[1], temploadreg1\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
-                    "ins v22.d[1], temploadreg2\n"
+                    "ldr d16, [%[b_ptr0]]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr d17, [%[b_ptr0], #0x10]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "str q26, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     "movi v26.4s, #0\n"
+                    "ldr d18, [%[b_ptr0], #0x20]\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "str q27, [c_ptr3]\n"
-                    "movi v27.4s, #0\n"
                     "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+                    "movi v27.4s, #0\n"
+                    "ldr d19, [%[b_ptr0], #0x30]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "str q28, [c_ptr4]\n"
-                    "movi v28.4s, #0\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+                    "movi v28.4s, #0\n"
+                    "ldr d20, [%[b_ptr0], #0x40]\n"
+                    "ins v16.d[1], temploadreg0\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "str q29, [c_ptr5]\n"
-                    "movi v29.4s, #0\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
-                    ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+                    "movi v29.4s, #0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
-                    ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+                    "ldr d21, [%[b_ptr0], #0x50]\n"
+                    ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+                    "ins v17.d[1], temploadreg1\n"
+                    ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+                    "ldr d22, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+                    "ins v18.d[1], temploadreg2\n"
+                    ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                     ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ins v19.d[1], temploadreg3\n"
                     ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
-                    "ldr d16, [%[b_ptr0]]\n"
+                    "ins v20.d[1], temploadreg0\n"
                     ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "ins v21.d[1], temploadreg1\n"
                     ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
-                    "ins v16.d[1], temploadreg0\n"
+                    "ins v22.d[1], temploadreg2\n"
                     ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "add c_ptr6, c_ptr6, #0x10\n"
                     ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
                     ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
-                    "ldr d17, [%[b_ptr0], #0x10]\n"
-                    ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
                     "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+                    ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
-                    "ins v17.d[1], temploadreg1\n"
                     ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
                     ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
                     ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
                     ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
                     ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
-                    "ldr d18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
                     ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
                     ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
-                    "ins v18.d[1], temploadreg2\n"
                     ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                     ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
                     ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
                     ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
                     ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
-                    "ldr d19, [%[b_ptr0], #0x30]\n"
                     ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
                     ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
                     ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
-                    "ins v19.d[1], temploadreg3\n"
                     ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
                     ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
                     ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
                     ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
                     ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
-                    "ldr d20, [%[b_ptr0], #0x40]\n"
                     ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
                     ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
                     ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
-                    "ins v20.d[1], temploadreg0\n"
                     ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
                     ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
                     ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
                     ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
                     ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
-                    "ldr d21, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
                     ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
                     ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
-                    "ins v21.d[1], temploadreg1\n"
                     ".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n"
                     ".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n"
                     ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
                     ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
                     ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
-                    "ldr d22, [%[b_ptr0], #0x60]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
-                    "ins v22.d[1], temploadreg2\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "ldr q20, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
                     "str q27, [c_ptr3]\n"
                     "movi v27.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q21, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+                    "ldr q22, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
                     ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
                     "str q29, [c_ptr5]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
+                    "add c_ptr6, c_ptr6, #0x10\n"
                     ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
-                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
                     ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
                     ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
                     ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
@@ -2539,23 +2685,82 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
                     ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
                     ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+                    ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+                    ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
+                    ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+                    ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
+                    ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
+                    ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
+                    ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
+                    ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
+                    ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+                    ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
+                    ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
+                    ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
+                    ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
+                    ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
+                    ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
+                    ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
+                    ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+                    ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
+                    ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
+                    ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
+                    ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
+                    ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
+                    ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
+                    ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
+                    ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
+                    ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
+                    ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
+                    ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
+                    ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
+                    ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
+                    ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
+                    ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+                    ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
+                    ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
+                    ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
+                    ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
+                    ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
+                    ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
+                    ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
+                    ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
+                    ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
+                    ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
+                    ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
+                    ".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n"
+                    ".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n"
+                    ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
+                    ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
+                    ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -2643,6 +2848,7 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q2, [a_ptr1], #0x10\n"
                     "ldr q4, [a_ptr2], #0x10\n"
@@ -2651,7 +2857,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     "ldr q10, [a_ptr5], #0x10\n"
                     "ldr q12, [a_ptr6], #0x10\n"
                     "ldr q14, [a_ptr7], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr q1, [%[a_ptr0]]\n"
                     "ldr q3, [a_ptr1]\n"
                     "ldr q5, [a_ptr2]\n"
@@ -2662,15 +2867,24 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     "ldr q15, [a_ptr7]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "ldr q2, [a_ptr1], #0x10\n"
+                    "ldr q4, [a_ptr2], #0x10\n"
                     "ldr d1, [%[a_ptr0]], #0x8\n"
+                    "ldr q6, [a_ptr3], #0x10\n"
                     "ldr d3, [a_ptr1], #0x8\n"
+                    "ldr q8, [a_ptr4], #0x10\n"
                     "ldr d5, [a_ptr2], #0x8\n"
+                    "ldr q10, [a_ptr5], #0x10\n"
                     "ldr d7, [a_ptr3], #0x8\n"
+                    "ldr q12, [a_ptr6], #0x10\n"
                     "ldr d9, [a_ptr4], #0x8\n"
+                    "ldr q14, [a_ptr7], #0x10\n"
                     "ldr d11, [a_ptr5], #0x8\n"
                     "ldr d13, [a_ptr6], #0x8\n"
-                    "ldr d15, [a_ptr7], #0x8\n"
                     "ld1 {v1.s}[2], [%[a_ptr0]], #4\n"
+                    "ldr d15, [a_ptr7], #0x8\n"
                     "ld1 {v3.s}[2], [a_ptr1], #4\n"
                     "ld1 {v5.s}[2], [a_ptr2], #4\n"
                     "ld1 {v7.s}[2], [a_ptr3], #4\n"
@@ -2678,7 +2892,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     "ld1 {v11.s}[2], [a_ptr5], #4\n"
                     "ld1 {v13.s}[2], [a_ptr6], #4\n"
                     "ld1 {v15.s}[2], [a_ptr7], #4\n"
-                    "subs %[odds], %[odds], #0x1\n"
                     "b.ne 4f\n"
                     "ld1 {v1.b}[12], [%[a_ptr0]]\n"
                     "ld1 {v3.b}[12], [a_ptr1]\n"
@@ -2711,36 +2924,38 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     "ld1 {v13.b}[14], [a_ptr6]\n"
                     "ld1 {v15.b}[14], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "movi v26.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ldr q19, [%[b_ptr0], #0x30]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ldr q20, [%[b_ptr0], #0x40]\n"
-                    "movi v29.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ldr q21, [%[b_ptr0], #0x50]\n"
-                    "movi v30.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ldr q22, [%[b_ptr0], #0x60]\n"
-                    "movi v31.4s, #0\n"
                     "ldr q23, [%[b_ptr0], #0x70]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
                     ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
                     ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
@@ -2798,192 +3013,248 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n"
                     ".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n"
                     ".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr d16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                    "ldr d17, [%[b_ptr0], #0x10]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    "ldr d18, [%[b_ptr0], #0x20]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                    "ldr d19, [%[b_ptr0], #0x30]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                    "ldr d20, [%[b_ptr0], #0x40]\n"
-                    "ins v16.d[1], temploadreg0\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                    "ldr d21, [%[b_ptr0], #0x50]\n"
-                    "ins v17.d[1], temploadreg1\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                    "ldr d22, [%[b_ptr0], #0x60]\n"
-                    "ins v18.d[1], temploadreg2\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                    "ldr d23, [%[b_ptr0], #0x70]\n"
-                    "ins v19.d[1], temploadreg3\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                    "ins v20.d[1], temploadreg0\n"
-                    "ins v21.d[1], temploadreg1\n"
-                    "ins v22.d[1], temploadreg2\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
-                    "ins v23.d[1], temploadreg3\n"
+                    "ldr d16, [%[b_ptr0]]\n"
                     "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr d17, [%[b_ptr0], #0x10]\n"
                     "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "str q26, [c_ptr2]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     "movi v26.4s, #0\n"
+                    "ldr d18, [%[b_ptr0], #0x20]\n"
                     "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
-                    ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "str q27, [c_ptr3]\n"
-                    "movi v27.4s, #0\n"
                     "add c_ptr3, c_ptr3, #0x10\n"
-                    ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+                    "movi v27.4s, #0\n"
+                    "ldr d19, [%[b_ptr0], #0x30]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "str q28, [c_ptr4]\n"
-                    "movi v28.4s, #0\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
-                    ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+                    "movi v28.4s, #0\n"
+                    "ldr d20, [%[b_ptr0], #0x40]\n"
+                    "ins v16.d[1], temploadreg0\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "str q29, [c_ptr5]\n"
-                    "movi v29.4s, #0\n"
                     "add c_ptr5, c_ptr5, #0x10\n"
-                    ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+                    "movi v29.4s, #0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
-                    ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+                    "ldr d21, [%[b_ptr0], #0x50]\n"
+                    ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+                    "ins v17.d[1], temploadreg1\n"
+                    ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+                    "ldr d22, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+                    "ins v18.d[1], temploadreg2\n"
+                    ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                     ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ldr d23, [%[b_ptr0], #0x70]\n"
                     ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
-                    "ldr d16, [%[b_ptr0]]\n"
+                    "ins v19.d[1], temploadreg3\n"
                     ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
                     ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
-                    "ins v16.d[1], temploadreg0\n"
+                    "ins v20.d[1], temploadreg0\n"
                     ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
-                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "ins v21.d[1], temploadreg1\n"
                     ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "ins v22.d[1], temploadreg2\n"
                     ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "ins v23.d[1], temploadreg3\n"
                     ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    "add c_ptr6, c_ptr6, #0x10\n"
                     ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
-                    "ldr d17, [%[b_ptr0], #0x10]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
-                    "ins v17.d[1], temploadreg1\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
-                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
                     "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
                     ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
                     ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
                     ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
-                    "ldr d18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
                     ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
                     ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
-                    "ins v18.d[1], temploadreg2\n"
                     ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
-                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                     ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
                     ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
                     ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
                     ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
-                    "ldr d19, [%[b_ptr0], #0x30]\n"
                     ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
                     ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
                     ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
-                    "ins v19.d[1], temploadreg3\n"
                     ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
-                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
                     ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
                     ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
                     ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
                     ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
-                    "ldr d20, [%[b_ptr0], #0x40]\n"
                     ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
                     ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
                     ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
-                    "ins v20.d[1], temploadreg0\n"
                     ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
                     ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
                     ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
                     ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
                     ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
-                    "ldr d21, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
                     ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
                     ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
-                    "ins v21.d[1], temploadreg1\n"
                     ".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n"
                     ".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n"
                     ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
                     ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
                     ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
-                    "ldr d22, [%[b_ptr0], #0x60]\n"
                     ".inst 0x6fa1eaf8 // udot v24.4s, v23.16b, v1.4b[3]\n"
                     ".inst 0x6fa3eaf9 // udot v25.4s, v23.16b, v3.4b[3]\n"
                     ".inst 0x6fa5eafa // udot v26.4s, v23.16b, v5.4b[3]\n"
-                    "ins v22.d[1], temploadreg2\n"
                     ".inst 0x6fa7eafb // udot v27.4s, v23.16b, v7.4b[3]\n"
                     ".inst 0x6fa9eafc // udot v28.4s, v23.16b, v9.4b[3]\n"
                     ".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n"
                     ".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n"
                     ".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n"
-                    "ldr d23, [%[b_ptr0], #0x70]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
-                    "ins v23.d[1], temploadreg3\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "ldr q20, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
                     "str q27, [c_ptr3]\n"
                     "movi v27.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q21, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+                    "ldr q22, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x70]\n"
                     ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
                     "str q29, [c_ptr5]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
+                    "add c_ptr6, c_ptr6, #0x10\n"
+                    ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
+                    ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
+                    ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
+                    ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
+                    ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
+                    ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+                    ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
+                    ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
+                    ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
+                    ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
+                    ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
+                    ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
+                    ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
+                    ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+                    ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
+                    ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
+                    ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
+                    ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
+                    ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
+                    ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
+                    ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
+                    ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
+                    ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
+                    ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
+                    ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
+                    ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
+                    ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
+                    ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
+                    ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+                    ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
+                    ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
+                    ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
+                    ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
+                    ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
+                    ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
+                    ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
+                    ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
+                    ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
+                    ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
+                    ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
+                    ".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n"
+                    ".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n"
+                    ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
+                    ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
+                    ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
+                    ".inst 0x6fa1eaf8 // udot v24.4s, v23.16b, v1.4b[3]\n"
+                    ".inst 0x6fa3eaf9 // udot v25.4s, v23.16b, v3.4b[3]\n"
+                    ".inst 0x6fa5eafa // udot v26.4s, v23.16b, v5.4b[3]\n"
+                    ".inst 0x6fa7eafb // udot v27.4s, v23.16b, v7.4b[3]\n"
+                    ".inst 0x6fa9eafc // udot v28.4s, v23.16b, v9.4b[3]\n"
+                    ".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n"
+                    ".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n"
+                    ".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n"
+                    "b 9f\n"
+                    "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+                    ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
                     ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
-                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
                     ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
                     ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
                     ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
                     ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
@@ -3039,23 +3310,16 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
                     ".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n"
                     ".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n"
                     ".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n"
-                    "6:\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp
index 2a7dd3d88d..aeea051662 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,7 +33,7 @@
 
 namespace arm_gemm {
 
-void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) {
+void a64_smallK_hybrid_u8u32_dot_8x4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) {
     const long loops_count = iceildiv(N, (int)4) - 1;
     const long ldab = lda * sizeof(uint8_t);
     const long ldcb = ldc * sizeof(uint32_t);
@@ -153,22 +153,24 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     "ld1 {v6.b}[2], [a_ptr6]\n"
                     "ld1 {v7.b}[2], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
                     "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
-                    "movi v26.4s, #0\n"
                     "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
-                    "movi v27.4s, #0\n"
                     "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
-                    "movi v28.4s, #0\n"
                     "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
-                    "movi v29.4s, #0\n"
                     "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
-                    "movi v30.4s, #0\n"
                     "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
-                    "movi v31.4s, #0\n"
                     "add %[b_ptr0], %[b_ptr0], #0x10\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
                     ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
@@ -177,20 +179,17 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
                     ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
                     ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
+                    "ldr q16, [%[b_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
@@ -216,10 +215,9 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     "movi v31.4s, #0\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
-                    "ldr q16, [%[b_ptr0]]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
@@ -231,6 +229,8 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
@@ -260,23 +260,34 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
                     ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+                    ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+                    ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+                    ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -407,24 +418,26 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     "ld1 {v6.b}[6], [a_ptr6]\n"
                     "ld1 {v7.b}[6], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
                     "movi v26.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "movi v27.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "movi v28.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
                     ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
                     ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
@@ -440,68 +453,66 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
                     ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
                     ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
+                    "ldr q16, [%[b_ptr0]]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
-                    "str q25, [c_ptr1]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v25.4s, #0\n"
+                    "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v25.4s, #0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
                     "str q27, [c_ptr3]\n"
                     "movi v27.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
                     ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
                     "str q29, [c_ptr5]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "add c_ptr6, c_ptr6, #0x10\n"
                     ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
-                    ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
+                    ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
-                    "ldr q16, [%[b_ptr0]]\n"
-                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
-                    ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
-                    ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
+                    ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
-                    ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
+                    ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
-                    ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
-                    ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
+                    ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
-                    ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+                    ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
                     ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
                     "b.ne 8b\n"
                     "7:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
+                    "ldr q16, [%[b_ptr0]]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
                     "add %[b_ptr0], %[b_ptr0], #0x20\n"
                     "str q25, [c_ptr1]\n"
@@ -541,23 +552,42 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
                     ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
                     ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+                    ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+                    ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+                    ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+                    ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
+                    ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
+                    ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
+                    ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
+                    ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -688,26 +718,28 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     "ld1 {v6.b}[10], [a_ptr6]\n"
                     "ld1 {v7.b}[10], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "movi v26.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ldr q18, [%[b_ptr0], #0x20]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
                     "movi v27.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "movi v28.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
                     ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
                     ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
                     ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
@@ -730,49 +762,46 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
                     ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
                     ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
-                    "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
-                    "str q25, [c_ptr1]\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v25.4s, #0\n"
+                    "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
                     "str q27, [c_ptr3]\n"
                     "movi v27.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
                     ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
                     "str q29, [c_ptr5]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "add c_ptr6, c_ptr6, #0x10\n"
                     ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
-                    ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
+                    ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
                     ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
-                    "ldr q16, [%[b_ptr0]]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
@@ -788,7 +817,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
-                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
                     ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
                     ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
@@ -802,11 +830,12 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
-                    "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
@@ -832,8 +861,9 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     "movi v31.4s, #0\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
-                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
                     ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
                     ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
                     ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
                     ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
@@ -849,23 +879,50 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
                     ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
                     ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+                    ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+                    ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+                    ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+                    ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
+                    ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
+                    ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
+                    ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
+                    ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
+                    ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+                    ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
+                    ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
+                    ".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n"
+                    ".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n"
+                    ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
+                    ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
+                    ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -1004,28 +1061,30 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     "ld1 {v6.b}[14], [a_ptr6]\n"
                     "ld1 {v7.b}[14], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "movi v26.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
                     "movi v28.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
                     ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
                     ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
                     ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
@@ -1055,50 +1114,47 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n"
                     ".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n"
                     ".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
-                    "ldr q19, [%[b_ptr0], #0x30]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
-                    "str q25, [c_ptr1]\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v25.4s, #0\n"
+                    "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
                     "str q27, [c_ptr3]\n"
                     "movi v27.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
                     ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
                     "str q29, [c_ptr5]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "add c_ptr6, c_ptr6, #0x10\n"
                     ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
-                    ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
+                    ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
                     ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
-                    "ldr q16, [%[b_ptr0]]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
@@ -1114,7 +1170,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
-                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
                     ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
                     ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
@@ -1123,7 +1178,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
                     ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
                     ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
-                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
                     ".inst 0x6fa1ea79 // udot v25.4s, v19.16b, v1.4b[3]\n"
                     ".inst 0x6fa2ea7a // udot v26.4s, v19.16b, v2.4b[3]\n"
@@ -1137,14 +1191,16 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
-                    "ldr q19, [%[b_ptr0], #0x30]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
                     "str q27, [c_ptr3]\n"
@@ -1167,8 +1223,9 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     "movi v31.4s, #0\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
-                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
                     ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
                     ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
                     ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
                     ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
@@ -1192,23 +1249,58 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n"
                     ".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n"
                     ".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+                    ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+                    ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+                    ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+                    ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
+                    ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
+                    ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
+                    ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
+                    ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
+                    ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+                    ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
+                    ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
+                    ".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n"
+                    ".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n"
+                    ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
+                    ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
+                    ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
+                    ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+                    ".inst 0x6fa1ea79 // udot v25.4s, v19.16b, v1.4b[3]\n"
+                    ".inst 0x6fa2ea7a // udot v26.4s, v19.16b, v2.4b[3]\n"
+                    ".inst 0x6fa3ea7b // udot v27.4s, v19.16b, v3.4b[3]\n"
+                    ".inst 0x6fa4ea7c // udot v28.4s, v19.16b, v4.4b[3]\n"
+                    ".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n"
+                    ".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n"
+                    ".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -1287,26 +1379,34 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q2, [a_ptr1], #0x10\n"
                     "ldr q4, [a_ptr2], #0x10\n"
                     "ldr q6, [a_ptr3], #0x10\n"
-                    "ldr q8, [a_ptr4], #0x10\n"
-                    "ldr q10, [a_ptr5], #0x10\n"
-                    "ldr q12, [a_ptr6], #0x10\n"
-                    "ldr q14, [a_ptr7], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr s1, [%[a_ptr0]]\n"
+                    "ldr q8, [a_ptr4], #0x10\n"
                     "ldr s3, [a_ptr1]\n"
+                    "ldr q10, [a_ptr5], #0x10\n"
                     "ldr s5, [a_ptr2]\n"
+                    "ldr q12, [a_ptr6], #0x10\n"
                     "ldr s7, [a_ptr3]\n"
+                    "ldr q14, [a_ptr7], #0x10\n"
                     "ldr s9, [a_ptr4]\n"
                     "ldr s11, [a_ptr5]\n"
                     "ldr s13, [a_ptr6]\n"
                     "ldr s15, [a_ptr7]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
                     "subs %[odds], %[odds], #0x1\n"
+                    "ldr q2, [a_ptr1], #0x10\n"
+                    "ldr q4, [a_ptr2], #0x10\n"
+                    "ldr q6, [a_ptr3], #0x10\n"
+                    "ldr q8, [a_ptr4], #0x10\n"
+                    "ldr q10, [a_ptr5], #0x10\n"
+                    "ldr q12, [a_ptr6], #0x10\n"
+                    "ldr q14, [a_ptr7], #0x10\n"
                     "b.ne 4f\n"
                     "ldr b1, [%[a_ptr0]]\n"
                     "ldr b3, [a_ptr1]\n"
@@ -1339,30 +1439,32 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     "ld1 {v13.b}[2], [a_ptr6]\n"
                     "ld1 {v15.b}[2], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "movi v26.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ldr q19, [%[b_ptr0], #0x30]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ldr q20, [%[b_ptr0], #0x40]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x50\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
                     "movi v29.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x50\n"
                     ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
                     ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
                     ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
@@ -1399,51 +1501,48 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
                     ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
                     ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "ldr q19, [%[b_ptr0], #0x30]\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
-                    "ldr q20, [%[b_ptr0], #0x40]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x50\n"
-                    "str q25, [c_ptr1]\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v25.4s, #0\n"
+                    "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "ldr q20, [%[b_ptr0], #0x40]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
                     "str q27, [c_ptr3]\n"
                     "movi v27.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
                     ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
                     "str q29, [c_ptr5]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "add c_ptr6, c_ptr6, #0x10\n"
                     ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
-                    ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
+                    ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x50\n"
                     ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
-                    "ldr q16, [%[b_ptr0]]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
@@ -1459,7 +1558,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
-                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
                     ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
                     ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
@@ -1468,7 +1566,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
                     ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
                     ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
-                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
                     ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
                     ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
@@ -1477,7 +1574,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
                     ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
                     ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
-                    "ldr q19, [%[b_ptr0], #0x30]\n"
                     ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
                     ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
                     ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
@@ -1491,14 +1587,17 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
-                    "ldr q20, [%[b_ptr0], #0x40]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x50\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "ldr q20, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
                     "str q27, [c_ptr3]\n"
@@ -1521,8 +1620,9 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     "movi v31.4s, #0\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
-                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x50\n"
                     ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
                     ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
                     ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
                     ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
@@ -1554,23 +1654,66 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
                     ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
                     ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+                    ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+                    ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
+                    ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+                    ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
+                    ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
+                    ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
+                    ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
+                    ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
+                    ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+                    ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
+                    ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
+                    ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
+                    ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
+                    ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
+                    ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
+                    ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
+                    ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+                    ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
+                    ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
+                    ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
+                    ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
+                    ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
+                    ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
+                    ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
+                    ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
+                    ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
+                    ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
+                    ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
+                    ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
+                    ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
+                    ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
+                    ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -1649,34 +1792,42 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q2, [a_ptr1], #0x10\n"
                     "ldr q4, [a_ptr2], #0x10\n"
                     "ldr q6, [a_ptr3], #0x10\n"
-                    "ldr q8, [a_ptr4], #0x10\n"
-                    "ldr q10, [a_ptr5], #0x10\n"
-                    "ldr q12, [a_ptr6], #0x10\n"
-                    "ldr q14, [a_ptr7], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr d1, [%[a_ptr0]]\n"
+                    "ldr q8, [a_ptr4], #0x10\n"
                     "ldr d3, [a_ptr1]\n"
+                    "ldr q10, [a_ptr5], #0x10\n"
                     "ldr d5, [a_ptr2]\n"
+                    "ldr q12, [a_ptr6], #0x10\n"
                     "ldr d7, [a_ptr3]\n"
+                    "ldr q14, [a_ptr7], #0x10\n"
                     "ldr d9, [a_ptr4]\n"
                     "ldr d11, [a_ptr5]\n"
                     "ldr d13, [a_ptr6]\n"
                     "ldr d15, [a_ptr7]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "ldr q2, [a_ptr1], #0x10\n"
+                    "ldr q4, [a_ptr2], #0x10\n"
                     "ldr s1, [%[a_ptr0]], #0x4\n"
+                    "ldr q6, [a_ptr3], #0x10\n"
                     "ldr s3, [a_ptr1], #0x4\n"
+                    "ldr q8, [a_ptr4], #0x10\n"
                     "ldr s5, [a_ptr2], #0x4\n"
+                    "ldr q10, [a_ptr5], #0x10\n"
                     "ldr s7, [a_ptr3], #0x4\n"
+                    "ldr q12, [a_ptr6], #0x10\n"
                     "ldr s9, [a_ptr4], #0x4\n"
+                    "ldr q14, [a_ptr7], #0x10\n"
                     "ldr s11, [a_ptr5], #0x4\n"
                     "ldr s13, [a_ptr6], #0x4\n"
                     "ldr s15, [a_ptr7], #0x4\n"
-                    "subs %[odds], %[odds], #0x1\n"
                     "b.ne 4f\n"
                     "ld1 {v1.b}[4], [%[a_ptr0]]\n"
                     "ld1 {v3.b}[4], [a_ptr1]\n"
@@ -1709,32 +1860,34 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     "ld1 {v13.b}[6], [a_ptr6]\n"
                     "ld1 {v15.b}[6], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "movi v26.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ldr q19, [%[b_ptr0], #0x30]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ldr q20, [%[b_ptr0], #0x40]\n"
-                    "movi v29.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ldr q21, [%[b_ptr0], #0x50]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x60\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
                     "movi v30.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x60\n"
                     ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
                     ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
                     ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
@@ -1778,68 +1931,64 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
                     ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
                     ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "ldr q19, [%[b_ptr0], #0x30]\n"
-                    "ldr q20, [%[b_ptr0], #0x40]\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
-                    "ldr q21, [%[b_ptr0], #0x50]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x60\n"
-                    "str q25, [c_ptr1]\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v25.4s, #0\n"
+                    "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "ldr q20, [%[b_ptr0], #0x40]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
                     "str q27, [c_ptr3]\n"
                     "movi v27.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "ldr q21, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
                     "str q29, [c_ptr5]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "add c_ptr6, c_ptr6, #0x10\n"
                     ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
-                    ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
+                    ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x60\n"
                     ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
-                    "ldr q16, [%[b_ptr0]]\n"
-                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
-                    "ldr q17, [%[b_ptr0], #0x10]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
                     ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
                     ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
@@ -1848,7 +1997,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
                     ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
                     ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
-                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
                     ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
                     ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
@@ -1857,7 +2005,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
                     ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
                     ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
-                    "ldr q19, [%[b_ptr0], #0x30]\n"
                     ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
                     ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
                     ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
@@ -1866,7 +2013,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
                     ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
                     ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
-                    "ldr q20, [%[b_ptr0], #0x40]\n"
                     ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
                     ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
                     ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
@@ -1880,20 +2026,25 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
-                    "ldr q21, [%[b_ptr0], #0x50]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x60\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "ldr q20, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
                     "str q27, [c_ptr3]\n"
                     "movi v27.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q21, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
@@ -1910,7 +2061,7 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     "movi v31.4s, #0\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
-                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x60\n"
                     ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
                     ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
                     ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
@@ -1951,23 +2102,74 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
                     ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
                     ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+                    ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+                    ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
+                    ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+                    ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
+                    ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
+                    ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
+                    ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
+                    ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
+                    ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+                    ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
+                    ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
+                    ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
+                    ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
+                    ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
+                    ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
+                    ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
+                    ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+                    ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
+                    ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
+                    ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
+                    ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
+                    ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
+                    ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
+                    ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
+                    ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
+                    ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
+                    ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
+                    ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
+                    ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
+                    ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
+                    ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
+                    ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+                    ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
+                    ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
+                    ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
+                    ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
+                    ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
+                    ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
+                    ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
+                    ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -2106,34 +2308,36 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     "ld1 {v13.b}[10], [a_ptr6]\n"
                     "ld1 {v15.b}[10], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "movi v26.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ldr q19, [%[b_ptr0], #0x30]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ldr q20, [%[b_ptr0], #0x40]\n"
-                    "movi v29.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ldr q21, [%[b_ptr0], #0x50]\n"
-                    "movi v30.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ldr q22, [%[b_ptr0], #0x60]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
                     "movi v31.4s, #0\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
                     ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
                     ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
                     ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
@@ -2184,34 +2388,31 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
                     ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
                     ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "ldr q19, [%[b_ptr0], #0x30]\n"
-                    "ldr q20, [%[b_ptr0], #0x40]\n"
-                    "ldr q21, [%[b_ptr0], #0x50]\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
-                    "ldr q22, [%[b_ptr0], #0x60]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
-                    "str q25, [c_ptr1]\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v25.4s, #0\n"
+                    "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "ldr q20, [%[b_ptr0], #0x40]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
                     "str q27, [c_ptr3]\n"
                     "movi v27.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "ldr q21, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+                    "ldr q22, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
                     "add c_ptr3, c_ptr3, #0x10\n"
@@ -2230,24 +2431,23 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
-                    "ldr q16, [%[b_ptr0]]\n"
-                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
                     ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
-                    "ldr q17, [%[b_ptr0], #0x10]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
                     ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
                     ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
@@ -2255,7 +2455,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
                     ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
                     ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
-                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
                     ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
                     ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
@@ -2264,7 +2463,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
                     ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
                     ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
-                    "ldr q19, [%[b_ptr0], #0x30]\n"
                     ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
                     ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
                     ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
@@ -2273,7 +2471,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
                     ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
                     ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
-                    "ldr q20, [%[b_ptr0], #0x40]\n"
                     ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
                     ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
                     ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
@@ -2282,7 +2479,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
                     ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
                     ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
-                    "ldr q21, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
                     ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
                     ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
@@ -2296,38 +2492,44 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
-                    "ldr q22, [%[b_ptr0], #0x60]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "ldr q20, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
                     "str q27, [c_ptr3]\n"
                     "movi v27.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q21, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+                    "ldr q22, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
                     ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
                     "str q29, [c_ptr5]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
+                    "add c_ptr6, c_ptr6, #0x10\n"
                     ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
-                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
                     ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
                     ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
                     ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
@@ -2375,23 +2577,82 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
                     ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
                     ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
+                    "b 9f\n"
                     "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+                    ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+                    ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
+                    ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+                    ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
+                    ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
+                    ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
+                    ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
+                    ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
+                    ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+                    ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
+                    ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
+                    ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
+                    ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
+                    ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
+                    ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
+                    ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
+                    ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+                    ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
+                    ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
+                    ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
+                    ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
+                    ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
+                    ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
+                    ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
+                    ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
+                    ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
+                    ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
+                    ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
+                    ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
+                    ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
+                    ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
+                    ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+                    ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
+                    ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
+                    ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
+                    ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
+                    ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
+                    ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
+                    ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
+                    ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
+                    ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
+                    ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
+                    ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
+                    ".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n"
+                    ".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n"
+                    ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
+                    ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
+                    ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -2471,6 +2732,7 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
+                    "cbnz %[odds], 2f\n"
                     "ldr q0, [%[a_ptr0]], #0x10\n"
                     "ldr q2, [a_ptr1], #0x10\n"
                     "ldr q4, [a_ptr2], #0x10\n"
@@ -2479,7 +2741,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     "ldr q10, [a_ptr5], #0x10\n"
                     "ldr q12, [a_ptr6], #0x10\n"
                     "ldr q14, [a_ptr7], #0x10\n"
-                    "cbnz %[odds], 2f\n"
                     "ldr q1, [%[a_ptr0]]\n"
                     "ldr q3, [a_ptr1]\n"
                     "ldr q5, [a_ptr2]\n"
@@ -2490,15 +2751,24 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     "ldr q15, [a_ptr7]\n"
                     "b 3f\n"
                     "2:\n"
+                    "ldr q0, [%[a_ptr0]], #0x10\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "ldr q2, [a_ptr1], #0x10\n"
+                    "ldr q4, [a_ptr2], #0x10\n"
                     "ldr d1, [%[a_ptr0]], #0x8\n"
+                    "ldr q6, [a_ptr3], #0x10\n"
                     "ldr d3, [a_ptr1], #0x8\n"
+                    "ldr q8, [a_ptr4], #0x10\n"
                     "ldr d5, [a_ptr2], #0x8\n"
+                    "ldr q10, [a_ptr5], #0x10\n"
                     "ldr d7, [a_ptr3], #0x8\n"
+                    "ldr q12, [a_ptr6], #0x10\n"
                     "ldr d9, [a_ptr4], #0x8\n"
+                    "ldr q14, [a_ptr7], #0x10\n"
                     "ldr d11, [a_ptr5], #0x8\n"
                     "ldr d13, [a_ptr6], #0x8\n"
-                    "ldr d15, [a_ptr7], #0x8\n"
                     "ld1 {v1.s}[2], [%[a_ptr0]], #4\n"
+                    "ldr d15, [a_ptr7], #0x8\n"
                     "ld1 {v3.s}[2], [a_ptr1], #4\n"
                     "ld1 {v5.s}[2], [a_ptr2], #4\n"
                     "ld1 {v7.s}[2], [a_ptr3], #4\n"
@@ -2506,7 +2776,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     "ld1 {v11.s}[2], [a_ptr5], #4\n"
                     "ld1 {v13.s}[2], [a_ptr6], #4\n"
                     "ld1 {v15.s}[2], [a_ptr7], #4\n"
-                    "subs %[odds], %[odds], #0x1\n"
                     "b.ne 4f\n"
                     "ld1 {v1.b}[12], [%[a_ptr0]]\n"
                     "ld1 {v3.b}[12], [a_ptr1]\n"
@@ -2539,36 +2808,38 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     "ld1 {v13.b}[14], [a_ptr6]\n"
                     "ld1 {v15.b}[14], [a_ptr7]\n"
                     "3:\n"
-                    "movi v24.4s, #0\n"
                     "ldr q16, [%[b_ptr0]]\n"
-                    "movi v25.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "movi v26.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "movi v27.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ldr q19, [%[b_ptr0], #0x30]\n"
-                    "movi v28.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ldr q20, [%[b_ptr0], #0x40]\n"
-                    "movi v29.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ldr q21, [%[b_ptr0], #0x50]\n"
-                    "movi v30.4s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ldr q22, [%[b_ptr0], #0x60]\n"
-                    "movi v31.4s, #0\n"
                     "ldr q23, [%[b_ptr0], #0x70]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 6f\n"
+                    "movi v24.4s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
                     ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
                     ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
@@ -2626,39 +2897,37 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n"
                     ".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n"
                     ".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n"
-                    "cbz %[loops], 6f\n"
-                    "ldr q16, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ldr q17, [%[b_ptr0], #0x10]\n"
-                    "ldr q18, [%[b_ptr0], #0x20]\n"
-                    "ldr q19, [%[b_ptr0], #0x30]\n"
-                    "ldr q20, [%[b_ptr0], #0x40]\n"
-                    "ldr q21, [%[b_ptr0], #0x50]\n"
-                    "ldr q22, [%[b_ptr0], #0x60]\n"
                     "b.eq 7f\n"
                     "8:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "movi v24.4s, #0\n"
-                    "ldr q23, [%[b_ptr0], #0x70]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                    "str q25, [c_ptr1]\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
-                    "movi v25.4s, #0\n"
+                    "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
+                    "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "ldr q20, [%[b_ptr0], #0x40]\n"
+                    "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
                     "str q27, [c_ptr3]\n"
                     "movi v27.4s, #0\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
+                    "ldr q21, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+                    "ldr q22, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x70]\n"
                     ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
                     "str q29, [c_ptr5]\n"
                     "movi v29.4s, #0\n"
                     "add c_ptr4, c_ptr4, #0x10\n"
@@ -2673,32 +2942,29 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
                     ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
-                    "ldr q16, [%[b_ptr0]]\n"
-                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
-                    ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                     ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
-                    "ldr q17, [%[b_ptr0], #0x10]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
                     ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
                     ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
                     ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
                     ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
                     ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
-                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
                     ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
                     ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
@@ -2707,7 +2973,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
                     ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
                     ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
-                    "ldr q19, [%[b_ptr0], #0x30]\n"
                     ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
                     ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
                     ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
@@ -2716,7 +2981,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
                     ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
                     ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
-                    "ldr q20, [%[b_ptr0], #0x40]\n"
                     ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
                     ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
                     ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
@@ -2725,7 +2989,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
                     ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
                     ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
-                    "ldr q21, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
                     ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
                     ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
@@ -2734,7 +2997,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
                     ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
                     ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
-                    "ldr q22, [%[b_ptr0], #0x60]\n"
                     ".inst 0x6fa1eaf8 // udot v24.4s, v23.16b, v1.4b[3]\n"
                     ".inst 0x6fa3eaf9 // udot v25.4s, v23.16b, v3.4b[3]\n"
                     ".inst 0x6fa5eafa // udot v26.4s, v23.16b, v5.4b[3]\n"
@@ -2748,38 +3010,119 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "movi v24.4s, #0\n"
-                    "ldr q23, [%[b_ptr0], #0x70]\n"
-                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "ldr q16, [%[b_ptr0]]\n"
+                    "ldr q17, [%[b_ptr0], #0x10]\n"
                     "str q25, [c_ptr1]\n"
                     "add c_ptr1, c_ptr1, #0x10\n"
                     "movi v25.4s, #0\n"
+                    "ldr q18, [%[b_ptr0], #0x20]\n"
                     ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
                     "str q26, [c_ptr2]\n"
                     "movi v26.4s, #0\n"
+                    "ldr q19, [%[b_ptr0], #0x30]\n"
+                    "ldr q20, [%[b_ptr0], #0x40]\n"
                     "add c_ptr2, c_ptr2, #0x10\n"
                     ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
                     "str q27, [c_ptr3]\n"
                     "movi v27.4s, #0\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
+                    "ldr q21, [%[b_ptr0], #0x50]\n"
                     ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+                    "ldr q22, [%[b_ptr0], #0x60]\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
                     "str q28, [c_ptr4]\n"
                     "movi v28.4s, #0\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
+                    "ldr q23, [%[b_ptr0], #0x70]\n"
                     ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+                    "add c_ptr3, c_ptr3, #0x10\n"
+                    ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
                     "str q29, [c_ptr5]\n"
                     "movi v29.4s, #0\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
+                    "add c_ptr4, c_ptr4, #0x10\n"
                     ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
                     "str q30, [c_ptr6]\n"
                     "movi v30.4s, #0\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
+                    "add c_ptr5, c_ptr5, #0x10\n"
                     ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
                     "str q31, [c_ptr7]\n"
                     "movi v31.4s, #0\n"
+                    "add c_ptr6, c_ptr6, #0x10\n"
+                    ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
                     "add c_ptr7, c_ptr7, #0x10\n"
+                    ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
+                    ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
+                    ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
+                    ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
+                    ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
+                    ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
+                    ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+                    ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
+                    ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
+                    ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
+                    ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
+                    ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
+                    ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
+                    ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
+                    ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+                    ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
+                    ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
+                    ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
+                    ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
+                    ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
+                    ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
+                    ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
+                    ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
+                    ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
+                    ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
+                    ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
+                    ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
+                    ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
+                    ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
+                    ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+                    ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
+                    ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
+                    ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
+                    ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
+                    ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
+                    ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
+                    ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
+                    ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
+                    ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
+                    ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
+                    ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
+                    ".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n"
+                    ".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n"
+                    ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
+                    ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
+                    ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
+                    ".inst 0x6fa1eaf8 // udot v24.4s, v23.16b, v1.4b[3]\n"
+                    ".inst 0x6fa3eaf9 // udot v25.4s, v23.16b, v3.4b[3]\n"
+                    ".inst 0x6fa5eafa // udot v26.4s, v23.16b, v5.4b[3]\n"
+                    ".inst 0x6fa7eafb // udot v27.4s, v23.16b, v7.4b[3]\n"
+                    ".inst 0x6fa9eafc // udot v28.4s, v23.16b, v9.4b[3]\n"
+                    ".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n"
+                    ".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n"
+                    ".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n"
+                    "b 9f\n"
+                    "6:\n"
+                    "movi v24.4s, #0\n"
+                    "movi v25.4s, #0\n"
+                    "movi v26.4s, #0\n"
+                    "movi v27.4s, #0\n"
+                    "movi v28.4s, #0\n"
+                    "movi v29.4s, #0\n"
+                    "movi v30.4s, #0\n"
+                    "movi v31.4s, #0\n"
+                    ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+                    ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+                    ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+                    ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+                    ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+                    ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
                     ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
-                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
                     ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+                    ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
                     ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
                     ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
                     ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
@@ -2835,23 +3178,16 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
                     ".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n"
                     ".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n"
                     ".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n"
-                    "6:\n"
+                    "9:\n"
                     "str q24, [%[c_ptr0]]\n"
                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
                     "str q25, [c_ptr1]\n"
-                    "add c_ptr1, c_ptr1, #0x10\n"
                     "str q26, [c_ptr2]\n"
-                    "add c_ptr2, c_ptr2, #0x10\n"
                     "str q27, [c_ptr3]\n"
-                    "add c_ptr3, c_ptr3, #0x10\n"
                     "str q28, [c_ptr4]\n"
-                    "add c_ptr4, c_ptr4, #0x10\n"
                     "str q29, [c_ptr5]\n"
-                    "add c_ptr5, c_ptr5, #0x10\n"
                     "str q30, [c_ptr6]\n"
-                    "add c_ptr6, c_ptr6, #0x10\n"
                     "str q31, [c_ptr7]\n"
-                    "add c_ptr7, c_ptr7, #0x10\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL.hpp
index 1bc8021e76..57fd9c909e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL.hpp
@@ -23,34 +23,28 @@
  */
 #pragma once
 
-#ifdef __ARM_FEATURE_SVE
-
+#ifdef __aarch64__
 
+#include "../performance_parameters.hpp"
 #include "../std_transforms_sve.hpp"
 
 namespace arm_gemm
 {
 
 // Actual kernel implementations
-void sve_hybrid_fp32_mla_4VLx4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+void sve_gemv_fp32_mla_8VL(const float *, const float *, float *, size_t, size_t, const float *, Activation, bool);
 
-class hybrid_fp32_mla_4VLx4
+class cls_sve_gemv_fp32_mla_8VL
 {
 public:
     typedef float operand_type;
     typedef float result_type;
 
-    typedef void (*kern_type)(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
-
-    /* Kernel blocking parameters */
-    static constexpr unsigned int out_height()
-    {
-        return 4;
-    }
+    typedef void (*kern_type)(const float *, const float *, float *, size_t, size_t, const float *, Activation, bool);
 
     static unsigned int out_width()
     {
-        return get_vector_length<float>() * 4;
+        return 8 * get_vector_length<float>();
     }
 
     static constexpr unsigned int k_unroll()
@@ -60,7 +54,7 @@ public:
 
     static constexpr bool supports_accumulate()
     {
-        return true;
+        return false;
     }
 
     static constexpr bool supports_bias()
@@ -73,17 +67,16 @@ public:
         return true;
     }
 
-    StdTransformsSVE<operand_type, result_type, 4, 4, 1> transforms = {};
+    StdTransformsSVE<operand_type, result_type, 1, 8, 1> transforms = {};
 
     // Default to the generic kernel
-    kern_type kernel=sve_hybrid_fp32_mla_4VLx4;
+    kern_type kernel=sve_gemv_fp32_mla_8VL;
 
-    hybrid_fp32_mla_4VLx4(const CPUInfo *)
+    cls_sve_gemv_fp32_mla_8VL(const CPUInfo *)
     {
-
     }
 };
 
 } // namespace arm_gemm
 
-#endif // __ARM_FEATURE_SVE
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp
new file mode 100644
index 0000000000..c62e31936c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp
@@ -0,0 +1,1372 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_gemv_fp32_mla_8VL (
+    const float *A_ptr, const float *B_ptr, float *output_ptr,
+    size_t N, size_t K,
+    const float *bias, Activation act, bool
+)
+{
+    struct KernelArgs {  // Per-call scalars for the asm block; presumably reached via args_ptr + offset_* (operand bindings lie past the visible part of this function, confirm there).
+        float maxval = static_cast<float>(std::numeric_limits<float>::infinity());  // Upper clamp bound: stays +inf unless BoundedReLU overwrites it with act.param1.
+        float minval = - static_cast<float>(std::numeric_limits<float>::infinity());  // Lower clamp bound: stays -inf unless ReLU/BoundedReLU set it to 0.
+        const float *B_ptr = {};  // Assigned from the B_ptr argument immediately after this declaration.
+        size_t output_offset = {};  // Value-initialised to 0; no write is visible in this part of the function.
+        unsigned int input_initial_col = {};  // Value-initialised to 0; no write is visible in this part of the function.
+    } ka;
+
+    unsigned long flags=0;
+    ka.B_ptr = B_ptr;
+    switch(act.type) {
+        default:
+        case Activation::Type::None:
+            break;
+        case Activation::Type::BoundedReLU:
+            ka.maxval = static_cast<float>(act.param1);
+            /* fall through */
+        case Activation::Type::ReLU:
+            ka.minval = 0;
+            flags |= 0x2;
+            break;
+    }
+    __asm__ __volatile__(
+      "ptrue p2.b\n"
+      "cntw x24\n"
+      "add x23, %x[N], x24\n"
+      "sub x23, x23, #0x1\n"
+      "udiv x23, x23, x24\n"
+      "mov x22, %x[bias]\n"
+      "1:"  // Column loop
+      "cmp x23, #0x8\n"
+      "bge 50f\n"
+      "cmp x23, #0x6\n"
+      "bgt 43f\n"
+      "beq 36f\n"
+      "cmp x23, #0x4\n"
+      "bgt 29f\n"
+      "beq 22f\n"
+      "cmp x23, #0x2\n"
+      "bgt 15f\n"
+      "beq 8f\n"
+      "mov x21, %x[K]\n"
+      "mov x20, %x[A_ptr]\n"
+      "whilelt p1.s, XZR, %x[N]\n"
+      "cbz x22, 2f\n"
+      "ld1w { z24.s }, p2/Z, [x22]\n"
+      "addvl x22, x22, #1\n"
+      "b 3f\n"
+      "2:"  // Width 1: no bias
+      "mov z24.b, #0x0\n"
+      "3:"  // Width 1: setup done
+      "cmp x21, #0x4\n"
+      "ble 5f\n"
+      "4:"  // Width 1: Multiply loop: Main loop head
+      "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
+      "whilelt p0.s, XZR, x21\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "ld1rqw { z0.s }, p0/Z, [x20]\n"
+      "fmla z24.s, z1.s, z0.s[0]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "add x20, x20, #0x10\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "sub x21, x21, #0x4\n"
+      "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z2.s, z0.s[1]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "cmp x21, #0x4\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z3.s, z0.s[2]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z4.s, z0.s[3]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "bgt 4b\n"
+      "5:"  // Width 1: Multiply loop: Single iteration only
+      "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n"
+      "whilelt p0.s, XZR, x21\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "ld1rqw { z0.s }, p0/Z, [x20]\n"
+      "fmla z24.s, z5.s, z0.s[0]\n"
+      "add x20, x20, #0x10\n"
+      "subs x21, x21, #0x1\n"
+      "ble 6f\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "subs x21, x21, #0x1\n"
+      "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z6.s, z0.s[1]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "ble 6f\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "subs x21, x21, #0x1\n"
+      "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z7.s, z0.s[2]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "ble 6f\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z8.s, z0.s[3]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "6:"  // Width 1: Multiply loop: multiply skip
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+      "tbz %x[flags], #1, 7f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z17.s }, p2/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rw { z16.s }, p2/Z, [x19]\n"
+      "fmin z24.s, p2/M, z24.s, z16.s\n"
+      "fmax z24.s, p2/M, z24.s, z17.s\n"
+      "7:"  // Width 1: No activation
+      "st1w { z24.s }, p1, [%x[output_ptr]]\n"
+      "addvl %x[output_ptr], %x[output_ptr], #1\n"
+      "b 57f\n"
+      "8:"  // Width 2
+      "mov x21, %x[K]\n"
+      "mov x20, %x[A_ptr]\n"
+      "sub x19, %x[N], x24\n"
+      "whilelt p1.s, XZR, x19\n"
+      "cbz x22, 9f\n"
+      "ld1w { z24.s }, p2/Z, [x22]\n"
+      "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
+      "addvl x22, x22, #2\n"
+      "b 10f\n"
+      "9:"  // Width 2: no bias
+      "mov z24.b, #0x0\n"
+      "mov z25.b, #0x0\n"
+      "10:"  // Width 2: setup done
+      "cmp x21, #0x4\n"
+      "ble 12f\n"
+      "11:"  // Width 2: Multiply loop: Main loop head
+      "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
+      "whilelt p0.s, XZR, x21\n"
+      "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "ld1rqw { z0.s }, p0/Z, [x20]\n"
+      "fmla z24.s, z1.s, z0.s[0]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "add x20, x20, #0x10\n"
+      "fmla z25.s, z2.s, z0.s[0]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "sub x21, x21, #0x4\n"
+      "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z3.s, z0.s[1]\n"
+      "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z25.s, z4.s, z0.s[1]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "cmp x21, #0x4\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z5.s, z0.s[2]\n"
+      "ld1w { z6.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z25.s, z6.s, z0.s[2]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z7.s, z0.s[3]\n"
+      "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z25.s, z8.s, z0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "bgt 11b\n"
+      "12:"  // Width 2: Multiply loop: Single iteration only
+      "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n"
+      "whilelt p0.s, XZR, x21\n"
+      "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "ld1rqw { z0.s }, p0/Z, [x20]\n"
+      "fmla z24.s, z9.s, z0.s[0]\n"
+      "add x20, x20, #0x10\n"
+      "fmla z25.s, z10.s, z0.s[0]\n"
+      "subs x21, x21, #0x1\n"
+      "ble 13f\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "subs x21, x21, #0x1\n"
+      "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z11.s, z0.s[1]\n"
+      "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z25.s, z12.s, z0.s[1]\n"
+      "ble 13f\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "subs x21, x21, #0x1\n"
+      "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z13.s, z0.s[2]\n"
+      "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z25.s, z14.s, z0.s[2]\n"
+      "ble 13f\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ld1w { z15.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z15.s, z0.s[3]\n"
+      "ld1w { z16.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z25.s, z16.s, z0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "13:"  // Width 2: Multiply loop: multiply skip
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+      "tbz %x[flags], #1, 14f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z17.s }, p2/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rw { z16.s }, p2/Z, [x19]\n"
+      "fmin z24.s, p2/M, z24.s, z16.s\n"
+      "fmin z25.s, p2/M, z25.s, z16.s\n"
+      "fmax z24.s, p2/M, z24.s, z17.s\n"
+      "fmax z25.s, p2/M, z25.s, z17.s\n"
+      "14:"  // Width 2: No activation
+      "st1w { z24.s }, p2, [%x[output_ptr]]\n"
+      "st1w { z25.s }, p1, [%x[output_ptr], #1, MUL VL]\n"
+      "addvl %x[output_ptr], %x[output_ptr], #2\n"
+      "b 57f\n"
+      "15:"  // Width 3
+      "mov x21, %x[K]\n"
+      "mov x20, %x[A_ptr]\n"
+      "mov x19, #0x2\n"
+      "msub x19, x24, x19, %x[N]\n"
+      "whilelt p1.s, XZR, x19\n"
+      "cbz x22, 16f\n"
+      "ld1w { z24.s }, p2/Z, [x22]\n"
+      "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "addvl x22, x22, #3\n"
+      "b 17f\n"
+      "16:"  // Width 3: no bias
+      "mov z24.b, #0x0\n"
+      "mov z25.b, #0x0\n"
+      "mov z26.b, #0x0\n"
+      "17:"  // Width 3: setup done
+      "cmp x21, #0x4\n"
+      "ble 19f\n"
+      "18:"  // Width 3: Multiply loop: Main loop head
+      "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
+      "whilelt p0.s, XZR, x21\n"
+      "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "sub x21, x21, #0x4\n"
+      "ld1rqw { z0.s }, p0/Z, [x20]\n"
+      "fmla z24.s, z1.s, z0.s[0]\n"
+      "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "add x20, x20, #0x10\n"
+      "fmla z25.s, z2.s, z0.s[0]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla z26.s, z3.s, z0.s[0]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "cmp x21, #0x4\n"
+      "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z4.s, z0.s[1]\n"
+      "ld1w { z5.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z6.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z5.s, z0.s[1]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla z26.s, z6.s, z0.s[1]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z7.s, z0.s[2]\n"
+      "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z9.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z8.s, z0.s[2]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla z26.s, z9.s, z0.s[2]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ld1w { z10.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z10.s, z0.s[3]\n"
+      "ld1w { z11.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z12.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z11.s, z0.s[3]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla z26.s, z12.s, z0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "bgt 18b\n"
+      "19:"  // Width 3: Multiply loop: Single iteration only
+      "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
+      "whilelt p0.s, XZR, x21\n"
+      "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "subs x21, x21, #0x1\n"
+      "ld1rqw { z0.s }, p0/Z, [x20]\n"
+      "fmla z24.s, z13.s, z0.s[0]\n"
+      "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "add x20, x20, #0x10\n"
+      "fmla z25.s, z14.s, z0.s[0]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z26.s, z15.s, z0.s[0]\n"
+      "ble 20f\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "subs x21, x21, #0x1\n"
+      "ld1w { z16.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z16.s, z0.s[1]\n"
+      "ld1w { z17.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z17.s, z0.s[1]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z26.s, z18.s, z0.s[1]\n"
+      "ble 20f\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "subs x21, x21, #0x1\n"
+      "ld1w { z19.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z19.s, z0.s[2]\n"
+      "ld1w { z20.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z21.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z20.s, z0.s[2]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z26.s, z21.s, z0.s[2]\n"
+      "ble 20f\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ld1w { z22.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z22.s, z0.s[3]\n"
+      "ld1w { z23.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z1.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z23.s, z0.s[3]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla z26.s, z1.s, z0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "20:"  // Width 3: Multiply loop: multiply skip
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+      "tbz %x[flags], #1, 21f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z17.s }, p2/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rw { z16.s }, p2/Z, [x19]\n"
+      "fmin z24.s, p2/M, z24.s, z16.s\n"
+      "fmin z25.s, p2/M, z25.s, z16.s\n"
+      "fmin z26.s, p2/M, z26.s, z16.s\n"
+      "fmax z24.s, p2/M, z24.s, z17.s\n"
+      "fmax z25.s, p2/M, z25.s, z17.s\n"
+      "fmax z26.s, p2/M, z26.s, z17.s\n"
+      "21:"  // Width 3: No activation
+      "st1w { z24.s }, p2, [%x[output_ptr]]\n"
+      "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
+      "st1w { z26.s }, p1, [%x[output_ptr], #2, MUL VL]\n"
+      "addvl %x[output_ptr], %x[output_ptr], #3\n"
+      "b 57f\n"
+      "22:"  // Width 4
+      "mov x21, %x[K]\n"
+      "mov x20, %x[A_ptr]\n"
+      "mov x19, #0x3\n"
+      "msub x19, x24, x19, %x[N]\n"
+      "whilelt p1.s, XZR, x19\n"
+      "cbz x22, 23f\n"
+      "ld1w { z24.s }, p2/Z, [x22]\n"
+      "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
+      "addvl x22, x22, #4\n"
+      "b 24f\n"
+      "23:"  // Width 4: no bias
+      "mov z24.b, #0x0\n"
+      "mov z25.b, #0x0\n"
+      "mov z26.b, #0x0\n"
+      "mov z27.b, #0x0\n"
+      "24:"  // Width 4: setup done
+      "cmp x21, #0x4\n"
+      "ble 26f\n"
+      "25:"  // Width 4: Multiply loop: Main loop head
+      "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
+      "whilelt p0.s, XZR, x21\n"
+      "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "sub x21, x21, #0x4\n"
+      "ld1rqw { z0.s }, p0/Z, [x20]\n"
+      "fmla z24.s, z1.s, z0.s[0]\n"
+      "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "add x20, x20, #0x10\n"
+      "fmla z25.s, z2.s, z0.s[0]\n"
+      "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z26.s, z3.s, z0.s[0]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "cmp x21, #0x4\n"
+      "fmla z27.s, z4.s, z0.s[0]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z5.s, z0.s[1]\n"
+      "ld1w { z6.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z7.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z6.s, z0.s[1]\n"
+      "ld1w { z8.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z26.s, z7.s, z0.s[1]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "fmla z27.s, z8.s, z0.s[1]\n"
+      "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n"
+      "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "fmla z24.s, z9.s, z0.s[2]\n"
+      "ld1w { z11.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z10.s, z0.s[2]\n"
+      "ld1w { z12.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z26.s, z11.s, z0.s[2]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "fmla z27.s, z12.s, z0.s[2]\n"
+      "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z13.s, z0.s[3]\n"
+      "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z14.s, z0.s[3]\n"
+      "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z26.s, z15.s, z0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "fmla z27.s, z16.s, z0.s[3]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "bgt 25b\n"
+      "26:"  // Width 4: Multiply loop: Single iteration only
+      "ld1w { z17.s }, p2/Z, [%x[B_ptr]]\n"
+      "whilelt p0.s, XZR, x21\n"
+      "ld1w { z18.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "subs x21, x21, #0x1\n"
+      "ld1rqw { z0.s }, p0/Z, [x20]\n"
+      "fmla z24.s, z17.s, z0.s[0]\n"
+      "ld1w { z19.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "add x20, x20, #0x10\n"
+      "fmla z25.s, z18.s, z0.s[0]\n"
+      "ld1w { z20.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z26.s, z19.s, z0.s[0]\n"
+      "fmla z27.s, z20.s, z0.s[0]\n"
+      "ble 27f\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "subs x21, x21, #0x1\n"
+      "ld1w { z21.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z21.s, z0.s[1]\n"
+      "ld1w { z22.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z23.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z22.s, z0.s[1]\n"
+      "ld1w { z1.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z26.s, z23.s, z0.s[1]\n"
+      "fmla z27.s, z1.s, z0.s[1]\n"
+      "ble 27f\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "subs x21, x21, #0x1\n"
+      "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z2.s, z0.s[2]\n"
+      "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z3.s, z0.s[2]\n"
+      "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z26.s, z4.s, z0.s[2]\n"
+      "fmla z27.s, z5.s, z0.s[2]\n"
+      "ble 27f\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z6.s, z0.s[3]\n"
+      "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z7.s, z0.s[3]\n"
+      "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z26.s, z8.s, z0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "fmla z27.s, z9.s, z0.s[3]\n"
+      "27:"  // Width 4: Multiply loop: multiply skip
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+      "tbz %x[flags], #1, 28f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z17.s }, p2/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rw { z16.s }, p2/Z, [x19]\n"
+      "fmin z24.s, p2/M, z24.s, z16.s\n"
+      "fmin z25.s, p2/M, z25.s, z16.s\n"
+      "fmin z26.s, p2/M, z26.s, z16.s\n"
+      "fmin z27.s, p2/M, z27.s, z16.s\n"
+      "fmax z24.s, p2/M, z24.s, z17.s\n"
+      "fmax z25.s, p2/M, z25.s, z17.s\n"
+      "fmax z26.s, p2/M, z26.s, z17.s\n"
+      "fmax z27.s, p2/M, z27.s, z17.s\n"
+      "28:"  // Width 4: No activation
+      "st1w { z24.s }, p2, [%x[output_ptr]]\n"
+      "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
+      "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
+      "st1w { z27.s }, p1, [%x[output_ptr], #3, MUL VL]\n"
+      "addvl %x[output_ptr], %x[output_ptr], #4\n"
+      "b 57f\n"
+      "29:"  // Width 5
+      "mov x21, %x[K]\n"
+      "mov x20, %x[A_ptr]\n"
+      "mov x19, #0x4\n"
+      "msub x19, x24, x19, %x[N]\n"
+      "whilelt p1.s, XZR, x19\n"
+      "cbz x22, 30f\n"
+      "ld1w { z24.s }, p2/Z, [x22]\n"
+      "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n"
+      "addvl x22, x22, #5\n"
+      "b 31f\n"
+      "30:"  // Width 5: no bias
+      "mov z24.b, #0x0\n"
+      "mov z25.b, #0x0\n"
+      "mov z26.b, #0x0\n"
+      "mov z27.b, #0x0\n"
+      "mov z28.b, #0x0\n"
+      "31:"  // Width 5: setup done
+      "cmp x21, #0x4\n"
+      "ble 33f\n"
+      "32:"  // Width 5: Multiply loop: Main loop head
+      "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
+      "whilelt p0.s, XZR, x21\n"
+      "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "sub x21, x21, #0x4\n"
+      "ld1rqw { z0.s }, p0/Z, [x20]\n"
+      "fmla z24.s, z1.s, z0.s[0]\n"
+      "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "add x20, x20, #0x10\n"
+      "fmla z25.s, z2.s, z0.s[0]\n"
+      "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "cmp x21, #0x4\n"
+      "fmla z26.s, z3.s, z0.s[0]\n"
+      "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z27.s, z4.s, z0.s[0]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "fmla z28.s, z5.s, z0.s[0]\n"
+      "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n"
+      "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "fmla z24.s, z6.s, z0.s[1]\n"
+      "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z7.s, z0.s[1]\n"
+      "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "fmla z26.s, z8.s, z0.s[1]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla z27.s, z9.s, z0.s[1]\n"
+      "fmla z28.s, z10.s, z0.s[1]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z11.s, z0.s[2]\n"
+      "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z13.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z12.s, z0.s[2]\n"
+      "ld1w { z14.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "fmla z26.s, z13.s, z0.s[2]\n"
+      "ld1w { z15.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z27.s, z14.s, z0.s[2]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "fmla z28.s, z15.s, z0.s[2]\n"
+      "ld1w { z16.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z16.s, z0.s[3]\n"
+      "ld1w { z17.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z17.s, z0.s[3]\n"
+      "ld1w { z19.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "fmla z26.s, z18.s, z0.s[3]\n"
+      "ld1w { z20.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z27.s, z19.s, z0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "fmla z28.s, z20.s, z0.s[3]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "bgt 32b\n"
+      "33:"  // Width 5: Multiply loop: Single iteration only
+      "ld1w { z21.s }, p2/Z, [%x[B_ptr]]\n"
+      "whilelt p0.s, XZR, x21\n"
+      "ld1w { z22.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "subs x21, x21, #0x1\n"
+      "ld1rqw { z0.s }, p0/Z, [x20]\n"
+      "fmla z24.s, z21.s, z0.s[0]\n"
+      "ld1w { z23.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "add x20, x20, #0x10\n"
+      "fmla z25.s, z22.s, z0.s[0]\n"
+      "ld1w { z1.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "ld1w { z2.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "fmla z26.s, z23.s, z0.s[0]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z27.s, z1.s, z0.s[0]\n"
+      "fmla z28.s, z2.s, z0.s[0]\n"
+      "ble 34f\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "subs x21, x21, #0x1\n"
+      "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z3.s, z0.s[1]\n"
+      "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z5.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z4.s, z0.s[1]\n"
+      "ld1w { z6.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "fmla z26.s, z5.s, z0.s[1]\n"
+      "ld1w { z7.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z27.s, z6.s, z0.s[1]\n"
+      "fmla z28.s, z7.s, z0.s[1]\n"
+      "ble 34f\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "subs x21, x21, #0x1\n"
+      "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z8.s, z0.s[2]\n"
+      "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z9.s, z0.s[2]\n"
+      "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "fmla z26.s, z10.s, z0.s[2]\n"
+      "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z27.s, z11.s, z0.s[2]\n"
+      "fmla z28.s, z12.s, z0.s[2]\n"
+      "ble 34f\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z13.s, z0.s[3]\n"
+      "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z14.s, z0.s[3]\n"
+      "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "fmla z26.s, z15.s, z0.s[3]\n"
+      "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z27.s, z16.s, z0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "fmla z28.s, z17.s, z0.s[3]\n"
+      "34:"  // Width 5: Multiply loop: multiply skip
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+      "tbz %x[flags], #1, 35f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z17.s }, p2/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rw { z16.s }, p2/Z, [x19]\n"
+      "fmin z24.s, p2/M, z24.s, z16.s\n"
+      "fmin z25.s, p2/M, z25.s, z16.s\n"
+      "fmin z26.s, p2/M, z26.s, z16.s\n"
+      "fmin z27.s, p2/M, z27.s, z16.s\n"
+      "fmin z28.s, p2/M, z28.s, z16.s\n"
+      "fmax z24.s, p2/M, z24.s, z17.s\n"
+      "fmax z25.s, p2/M, z25.s, z17.s\n"
+      "fmax z26.s, p2/M, z26.s, z17.s\n"
+      "fmax z27.s, p2/M, z27.s, z17.s\n"
+      "fmax z28.s, p2/M, z28.s, z17.s\n"
+      "35:"  // Width 5: No activation
+      "st1w { z24.s }, p2, [%x[output_ptr]]\n"
+      "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
+      "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
+      "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
+      "st1w { z28.s }, p1, [%x[output_ptr], #4, MUL VL]\n"
+      "addvl %x[output_ptr], %x[output_ptr], #5\n"
+      "b 57f\n"
+      "36:"  // Width 6
+      "mov x21, %x[K]\n"
+      "mov x20, %x[A_ptr]\n"
+      "mov x19, #0x5\n"
+      "msub x19, x24, x19, %x[N]\n"
+      "whilelt p1.s, XZR, x19\n"
+      "cbz x22, 37f\n"
+      "ld1w { z24.s }, p2/Z, [x22]\n"
+      "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n"
+      "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n"
+      "addvl x22, x22, #6\n"
+      "b 38f\n"
+      "37:"  // Width 6: no bias
+      "mov z24.b, #0x0\n"
+      "mov z25.b, #0x0\n"
+      "mov z26.b, #0x0\n"
+      "mov z27.b, #0x0\n"
+      "mov z28.b, #0x0\n"
+      "mov z29.b, #0x0\n"
+      "38:"  // Width 6: setup done
+      "cmp x21, #0x4\n"
+      "ble 40f\n"
+      "39:"  // Width 6: Multiply loop: Main loop head
+      "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
+      "whilelt p0.s, XZR, x21\n"
+      "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "sub x21, x21, #0x4\n"
+      "ld1rqw { z0.s }, p0/Z, [x20]\n"
+      "fmla z24.s, z1.s, z0.s[0]\n"
+      "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "add x20, x20, #0x10\n"
+      "fmla z25.s, z2.s, z0.s[0]\n"
+      "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "cmp x21, #0x4\n"
+      "fmla z26.s, z3.s, z0.s[0]\n"
+      "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+      "fmla z27.s, z4.s, z0.s[0]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z28.s, z5.s, z0.s[0]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla z29.s, z6.s, z0.s[0]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z7.s, z0.s[1]\n"
+      "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z9.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z8.s, z0.s[1]\n"
+      "ld1w { z10.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "fmla z26.s, z9.s, z0.s[1]\n"
+      "ld1w { z11.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "ld1w { z12.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+      "fmla z27.s, z10.s, z0.s[1]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla z28.s, z11.s, z0.s[1]\n"
+      "fmla z29.s, z12.s, z0.s[1]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z13.s, z0.s[2]\n"
+      "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z14.s, z0.s[2]\n"
+      "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "fmla z26.s, z15.s, z0.s[2]\n"
+      "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+      "fmla z27.s, z16.s, z0.s[2]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla z28.s, z17.s, z0.s[2]\n"
+      "fmla z29.s, z18.s, z0.s[2]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ld1w { z19.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z19.s, z0.s[3]\n"
+      "ld1w { z20.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z21.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z20.s, z0.s[3]\n"
+      "ld1w { z22.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "fmla z26.s, z21.s, z0.s[3]\n"
+      "ld1w { z23.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "ld1w { z1.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+      "fmla z27.s, z22.s, z0.s[3]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla z28.s, z23.s, z0.s[3]\n"
+      "fmla z29.s, z1.s, z0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "bgt 39b\n"
+      "40:"  // Width 6: Multiply loop: Single iteration only
+      "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n"
+      "whilelt p0.s, XZR, x21\n"
+      "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "subs x21, x21, #0x1\n"
+      "ld1rqw { z0.s }, p0/Z, [x20]\n"
+      "fmla z24.s, z2.s, z0.s[0]\n"
+      "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "add x20, x20, #0x10\n"
+      "fmla z25.s, z3.s, z0.s[0]\n"
+      "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "ld1w { z6.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "fmla z26.s, z4.s, z0.s[0]\n"
+      "ld1w { z7.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z27.s, z5.s, z0.s[0]\n"
+      "fmla z28.s, z6.s, z0.s[0]\n"
+      "fmla z29.s, z7.s, z0.s[0]\n"
+      "ble 41f\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "subs x21, x21, #0x1\n"
+      "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z8.s, z0.s[1]\n"
+      "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z9.s, z0.s[1]\n"
+      "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "fmla z26.s, z10.s, z0.s[1]\n"
+      "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "ld1w { z13.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+      "fmla z27.s, z11.s, z0.s[1]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z28.s, z12.s, z0.s[1]\n"
+      "fmla z29.s, z13.s, z0.s[1]\n"
+      "ble 41f\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "subs x21, x21, #0x1\n"
+      "ld1w { z14.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z14.s, z0.s[2]\n"
+      "ld1w { z15.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z16.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z15.s, z0.s[2]\n"
+      "ld1w { z17.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "fmla z26.s, z16.s, z0.s[2]\n"
+      "ld1w { z18.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "ld1w { z19.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+      "fmla z27.s, z17.s, z0.s[2]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z28.s, z18.s, z0.s[2]\n"
+      "fmla z29.s, z19.s, z0.s[2]\n"
+      "ble 41f\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ld1w { z20.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z20.s, z0.s[3]\n"
+      "ld1w { z21.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z21.s, z0.s[3]\n"
+      "ld1w { z23.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "fmla z26.s, z22.s, z0.s[3]\n"
+      "ld1w { z1.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "ld1w { z2.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+      "fmla z27.s, z23.s, z0.s[3]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla z28.s, z1.s, z0.s[3]\n"
+      "fmla z29.s, z2.s, z0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "41:"  // Width 6: Multiply loop: multiply skip
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+      "tbz %x[flags], #1, 42f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z17.s }, p2/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rw { z16.s }, p2/Z, [x19]\n"
+      "fmin z24.s, p2/M, z24.s, z16.s\n"
+      "fmin z25.s, p2/M, z25.s, z16.s\n"
+      "fmin z26.s, p2/M, z26.s, z16.s\n"
+      "fmin z27.s, p2/M, z27.s, z16.s\n"
+      "fmin z28.s, p2/M, z28.s, z16.s\n"
+      "fmax z24.s, p2/M, z24.s, z17.s\n"
+      "fmax z25.s, p2/M, z25.s, z17.s\n"
+      "fmax z26.s, p2/M, z26.s, z17.s\n"
+      "fmax z27.s, p2/M, z27.s, z17.s\n"
+      "fmax z28.s, p2/M, z28.s, z17.s\n"
+      "fmin z29.s, p2/M, z29.s, z16.s\n"
+      "fmax z29.s, p2/M, z29.s, z17.s\n"
+      "42:"  // Width 6: No activation
+      "st1w { z24.s }, p2, [%x[output_ptr]]\n"
+      "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
+      "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
+      "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
+      "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n"
+      "st1w { z29.s }, p1, [%x[output_ptr], #5, MUL VL]\n"
+      "addvl %x[output_ptr], %x[output_ptr], #6\n"
+      "b 57f\n"
+      "43:"  // Width 7
+      "mov x21, %x[K]\n"
+      "mov x20, %x[A_ptr]\n"
+      "mov x19, #0x6\n"
+      "msub x19, x24, x19, %x[N]\n"
+      "whilelt p1.s, XZR, x19\n"
+      "cbz x22, 44f\n"
+      "ld1w { z24.s }, p2/Z, [x22]\n"
+      "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n"
+      "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n"
+      "ld1w { z30.s }, p2/Z, [x22, #6, MUL VL]\n"
+      "addvl x22, x22, #7\n"
+      "b 45f\n"
+      "44:"  // Width 7: no bias
+      "mov z24.b, #0x0\n"
+      "mov z25.b, #0x0\n"
+      "mov z26.b, #0x0\n"
+      "mov z27.b, #0x0\n"
+      "mov z28.b, #0x0\n"
+      "mov z29.b, #0x0\n"
+      "mov z30.b, #0x0\n"
+      "45:"  // Width 7: setup done
+      "cmp x21, #0x4\n"
+      "ble 47f\n"
+      "46:"  // Width 7: Multiply loop: Main loop head
+      "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
+      "whilelt p0.s, XZR, x21\n"
+      "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "sub x21, x21, #0x4\n"
+      "ld1rqw { z0.s }, p0/Z, [x20]\n"
+      "fmla z24.s, z1.s, z0.s[0]\n"
+      "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "add x20, x20, #0x10\n"
+      "fmla z25.s, z2.s, z0.s[0]\n"
+      "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "cmp x21, #0x4\n"
+      "fmla z26.s, z3.s, z0.s[0]\n"
+      "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+      "fmla z27.s, z4.s, z0.s[0]\n"
+      "ld1w { z7.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+      "fmla z28.s, z5.s, z0.s[0]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla z29.s, z6.s, z0.s[0]\n"
+      "fmla z30.s, z7.s, z0.s[0]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z8.s, z0.s[1]\n"
+      "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z9.s, z0.s[1]\n"
+      "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "fmla z26.s, z10.s, z0.s[1]\n"
+      "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "ld1w { z13.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+      "fmla z27.s, z11.s, z0.s[1]\n"
+      "ld1w { z14.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z28.s, z12.s, z0.s[1]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla z29.s, z13.s, z0.s[1]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ld1w { z15.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z30.s, z14.s, z0.s[1]\n"
+      "ld1w { z16.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "fmla z24.s, z15.s, z0.s[2]\n"
+      "ld1w { z17.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "fmla z25.s, z16.s, z0.s[2]\n"
+      "ld1w { z19.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "fmla z26.s, z17.s, z0.s[2]\n"
+      "ld1w { z20.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+      "fmla z27.s, z18.s, z0.s[2]\n"
+      "ld1w { z21.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z28.s, z19.s, z0.s[2]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "fmla z29.s, z20.s, z0.s[2]\n"
+      "ld1w { z22.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z30.s, z21.s, z0.s[2]\n"
+      "ld1w { z23.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "fmla z24.s, z22.s, z0.s[3]\n"
+      "ld1w { z1.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "ld1w { z2.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "fmla z25.s, z23.s, z0.s[3]\n"
+      "ld1w { z3.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "fmla z26.s, z1.s, z0.s[3]\n"
+      "ld1w { z4.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+      "fmla z27.s, z2.s, z0.s[3]\n"
+      "ld1w { z5.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z28.s, z3.s, z0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "fmla z29.s, z4.s, z0.s[3]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "fmla z30.s, z5.s, z0.s[3]\n"
+      "bgt 46b\n"
+      "47:"  // Width 7: Multiply loop: Single iteration only
+      "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n"
+      "whilelt p0.s, XZR, x21\n"
+      "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "subs x21, x21, #0x1\n"
+      "ld1rqw { z0.s }, p0/Z, [x20]\n"
+      "fmla z24.s, z6.s, z0.s[0]\n"
+      "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "add x20, x20, #0x10\n"
+      "fmla z25.s, z7.s, z0.s[0]\n"
+      "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "fmla z26.s, z8.s, z0.s[0]\n"
+      "ld1w { z11.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+      "ld1w { z12.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+      "fmla z27.s, z9.s, z0.s[0]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z28.s, z10.s, z0.s[0]\n"
+      "fmla z29.s, z11.s, z0.s[0]\n"
+      "fmla z30.s, z12.s, z0.s[0]\n"
+      "ble 48f\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "subs x21, x21, #0x1\n"
+      "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z13.s, z0.s[1]\n"
+      "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z14.s, z0.s[1]\n"
+      "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "fmla z26.s, z15.s, z0.s[1]\n"
+      "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+      "fmla z27.s, z16.s, z0.s[1]\n"
+      "ld1w { z19.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z28.s, z17.s, z0.s[1]\n"
+      "fmla z29.s, z18.s, z0.s[1]\n"
+      "fmla z30.s, z19.s, z0.s[1]\n"
+      "ble 48f\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "subs x21, x21, #0x1\n"
+      "ld1w { z20.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z20.s, z0.s[2]\n"
+      "ld1w { z21.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z21.s, z0.s[2]\n"
+      "ld1w { z23.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "fmla z26.s, z22.s, z0.s[2]\n"
+      "ld1w { z1.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "ld1w { z2.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+      "fmla z27.s, z23.s, z0.s[2]\n"
+      "ld1w { z3.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z28.s, z1.s, z0.s[2]\n"
+      "fmla z29.s, z2.s, z0.s[2]\n"
+      "fmla z30.s, z3.s, z0.s[2]\n"
+      "ble 48f\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z4.s, z0.s[3]\n"
+      "ld1w { z5.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z6.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z5.s, z0.s[3]\n"
+      "ld1w { z7.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "fmla z26.s, z6.s, z0.s[3]\n"
+      "ld1w { z8.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "ld1w { z9.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+      "fmla z27.s, z7.s, z0.s[3]\n"
+      "ld1w { z10.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z28.s, z8.s, z0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla z29.s, z9.s, z0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "fmla z30.s, z10.s, z0.s[3]\n"
+      "48:"  // Width 7: Multiply loop: multiply skip
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+      "tbz %x[flags], #1, 49f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z17.s }, p2/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rw { z16.s }, p2/Z, [x19]\n"
+      "fmin z24.s, p2/M, z24.s, z16.s\n"
+      "fmin z25.s, p2/M, z25.s, z16.s\n"
+      "fmin z26.s, p2/M, z26.s, z16.s\n"
+      "fmin z27.s, p2/M, z27.s, z16.s\n"
+      "fmin z28.s, p2/M, z28.s, z16.s\n"
+      "fmax z24.s, p2/M, z24.s, z17.s\n"
+      "fmax z25.s, p2/M, z25.s, z17.s\n"
+      "fmax z26.s, p2/M, z26.s, z17.s\n"
+      "fmax z27.s, p2/M, z27.s, z17.s\n"
+      "fmax z28.s, p2/M, z28.s, z17.s\n"
+      "fmin z29.s, p2/M, z29.s, z16.s\n"
+      "fmin z30.s, p2/M, z30.s, z16.s\n"
+      "fmax z29.s, p2/M, z29.s, z17.s\n"
+      "fmax z30.s, p2/M, z30.s, z17.s\n"
+      "49:"  // Width 7: No activation
+      "st1w { z24.s }, p2, [%x[output_ptr]]\n"
+      "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
+      "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
+      "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
+      "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n"
+      "st1w { z29.s }, p2, [%x[output_ptr], #5, MUL VL]\n"
+      "st1w { z30.s }, p1, [%x[output_ptr], #6, MUL VL]\n"
+      "addvl %x[output_ptr], %x[output_ptr], #7\n"
+      "b 57f\n"
+      "50:"  // Width 8
+      "mov x21, %x[K]\n"
+      "mov x20, %x[A_ptr]\n"
+      "mov x19, #0x7\n"
+      "msub x19, x24, x19, %x[N]\n"
+      "whilelt p1.s, XZR, x19\n"
+      "cbz x22, 51f\n"
+      "ld1w { z24.s }, p2/Z, [x22]\n"
+      "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
+      "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
+      "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
+      "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n"
+      "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n"
+      "ld1w { z30.s }, p2/Z, [x22, #6, MUL VL]\n"
+      "ld1w { z31.s }, p2/Z, [x22, #7, MUL VL]\n"
+      "addvl x22, x22, #8\n"
+      "b 52f\n"
+      "51:"  // Width 8: no bias
+      "mov z24.b, #0x0\n"
+      "mov z25.b, #0x0\n"
+      "mov z26.b, #0x0\n"
+      "mov z27.b, #0x0\n"
+      "mov z28.b, #0x0\n"
+      "mov z29.b, #0x0\n"
+      "mov z30.b, #0x0\n"
+      "mov z31.b, #0x0\n"
+      "52:"  // Width 8: setup done
+      "cmp x21, #0x4\n"
+      "ble 54f\n"
+      "53:"  // Width 8: Multiply loop: Main loop head
+      "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
+      "whilelt p0.s, XZR, x21\n"
+      "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "sub x21, x21, #0x4\n"
+      "ld1rqw { z0.s }, p0/Z, [x20]\n"
+      "fmla z24.s, z1.s, z0.s[0]\n"
+      "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "add x20, x20, #0x10\n"
+      "fmla z25.s, z2.s, z0.s[0]\n"
+      "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "cmp x21, #0x4\n"
+      "fmla z26.s, z3.s, z0.s[0]\n"
+      "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+      "fmla z27.s, z4.s, z0.s[0]\n"
+      "ld1w { z7.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+      "fmla z28.s, z5.s, z0.s[0]\n"
+      "ld1w { z8.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z29.s, z6.s, z0.s[0]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "fmla z30.s, z7.s, z0.s[0]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z31.s, z8.s, z0.s[0]\n"
+      "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "fmla z24.s, z9.s, z0.s[1]\n"
+      "ld1w { z11.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "ld1w { z12.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "fmla z25.s, z10.s, z0.s[1]\n"
+      "ld1w { z13.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "fmla z26.s, z11.s, z0.s[1]\n"
+      "ld1w { z14.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+      "fmla z27.s, z12.s, z0.s[1]\n"
+      "ld1w { z15.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+      "fmla z28.s, z13.s, z0.s[1]\n"
+      "ld1w { z16.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z29.s, z14.s, z0.s[1]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "fmla z30.s, z15.s, z0.s[1]\n"
+      "ld1w { z17.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z31.s, z16.s, z0.s[1]\n"
+      "ld1w { z18.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "fmla z24.s, z17.s, z0.s[2]\n"
+      "ld1w { z19.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "ld1w { z20.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "fmla z25.s, z18.s, z0.s[2]\n"
+      "ld1w { z21.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "fmla z26.s, z19.s, z0.s[2]\n"
+      "ld1w { z22.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+      "fmla z27.s, z20.s, z0.s[2]\n"
+      "ld1w { z23.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+      "fmla z28.s, z21.s, z0.s[2]\n"
+      "ld1w { z1.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z29.s, z22.s, z0.s[2]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "fmla z30.s, z23.s, z0.s[2]\n"
+      "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z31.s, z1.s, z0.s[2]\n"
+      "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "fmla z24.s, z2.s, z0.s[3]\n"
+      "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "fmla z25.s, z3.s, z0.s[3]\n"
+      "ld1w { z6.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "fmla z26.s, z4.s, z0.s[3]\n"
+      "ld1w { z7.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+      "fmla z27.s, z5.s, z0.s[3]\n"
+      "ld1w { z8.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+      "fmla z28.s, z6.s, z0.s[3]\n"
+      "ld1w { z9.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z29.s, z7.s, z0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "fmla z30.s, z8.s, z0.s[3]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "fmla z31.s, z9.s, z0.s[3]\n"
+      "bgt 53b\n"
+      "54:"  // Width 8: Multiply loop: Single iteration only
+      "ld1w { z10.s }, p2/Z, [%x[B_ptr]]\n"
+      "whilelt p0.s, XZR, x21\n"
+      "ld1w { z11.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "subs x21, x21, #0x1\n"
+      "ld1rqw { z0.s }, p0/Z, [x20]\n"
+      "fmla z24.s, z10.s, z0.s[0]\n"
+      "ld1w { z12.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "add x20, x20, #0x10\n"
+      "fmla z25.s, z11.s, z0.s[0]\n"
+      "ld1w { z13.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "fmla z26.s, z12.s, z0.s[0]\n"
+      "ld1w { z15.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+      "ld1w { z16.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+      "fmla z27.s, z13.s, z0.s[0]\n"
+      "fmla z28.s, z14.s, z0.s[0]\n"
+      "ld1w { z17.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z29.s, z15.s, z0.s[0]\n"
+      "fmla z30.s, z16.s, z0.s[0]\n"
+      "fmla z31.s, z17.s, z0.s[0]\n"
+      "ble 55f\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "subs x21, x21, #0x1\n"
+      "ld1w { z18.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z18.s, z0.s[1]\n"
+      "ld1w { z19.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z20.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z19.s, z0.s[1]\n"
+      "ld1w { z21.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "fmla z26.s, z20.s, z0.s[1]\n"
+      "ld1w { z22.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "ld1w { z23.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+      "fmla z27.s, z21.s, z0.s[1]\n"
+      "ld1w { z1.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+      "ld1w { z2.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
+      "fmla z28.s, z22.s, z0.s[1]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z29.s, z23.s, z0.s[1]\n"
+      "fmla z30.s, z1.s, z0.s[1]\n"
+      "fmla z31.s, z2.s, z0.s[1]\n"
+      "ble 55f\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "subs x21, x21, #0x1\n"
+      "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z3.s, z0.s[2]\n"
+      "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z5.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z4.s, z0.s[2]\n"
+      "ld1w { z6.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "fmla z26.s, z5.s, z0.s[2]\n"
+      "ld1w { z7.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "ld1w { z8.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+      "fmla z27.s, z6.s, z0.s[2]\n"
+      "ld1w { z9.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
+      "fmla z28.s, z7.s, z0.s[2]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z29.s, z8.s, z0.s[2]\n"
+      "fmla z30.s, z9.s, z0.s[2]\n"
+      "fmla z31.s, z10.s, z0.s[2]\n"
+      "ble 55f\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n"
+      "fmla z24.s, z11.s, z0.s[3]\n"
+      "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+      "ld1w { z13.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+      "fmla z25.s, z12.s, z0.s[3]\n"
+      "ld1w { z14.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+      "fmla z26.s, z13.s, z0.s[3]\n"
+      "ld1w { z15.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+      "ld1w { z16.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+      "fmla z27.s, z14.s, z0.s[3]\n"
+      "ld1w { z17.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
+      "fmla z28.s, z15.s, z0.s[3]\n"
+      "addvl %x[B_ptr], %x[B_ptr], #8\n"
+      "fmla z29.s, z16.s, z0.s[3]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+      "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+      "fmla z30.s, z17.s, z0.s[3]\n"
+      "fmla z31.s, z18.s, z0.s[3]\n"
+      "55:"  // Width 8: Multiply loop: multiply skip
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+      "tbz %x[flags], #1, 56f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z17.s }, p2/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rw { z16.s }, p2/Z, [x19]\n"
+      "fmin z24.s, p2/M, z24.s, z16.s\n"
+      "fmin z25.s, p2/M, z25.s, z16.s\n"
+      "fmin z26.s, p2/M, z26.s, z16.s\n"
+      "fmin z27.s, p2/M, z27.s, z16.s\n"
+      "fmin z28.s, p2/M, z28.s, z16.s\n"
+      "fmax z24.s, p2/M, z24.s, z17.s\n"
+      "fmax z25.s, p2/M, z25.s, z17.s\n"
+      "fmax z26.s, p2/M, z26.s, z17.s\n"
+      "fmax z27.s, p2/M, z27.s, z17.s\n"
+      "fmax z28.s, p2/M, z28.s, z17.s\n"
+      "fmin z29.s, p2/M, z29.s, z16.s\n"
+      "fmin z30.s, p2/M, z30.s, z16.s\n"
+      "fmin z31.s, p2/M, z31.s, z16.s\n"
+      "fmax z29.s, p2/M, z29.s, z17.s\n"
+      "fmax z30.s, p2/M, z30.s, z17.s\n"
+      "fmax z31.s, p2/M, z31.s, z17.s\n"
+      "56:"  // Width 8: No activation
+      "st1w { z24.s }, p2, [%x[output_ptr]]\n"
+      "subs x23, x23, #0x8\n"
+      "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
+      "sub %x[N], %x[N], x24, LSL #3\n"
+      "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
+      "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
+      "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n"
+      "st1w { z29.s }, p2, [%x[output_ptr], #5, MUL VL]\n"
+      "st1w { z30.s }, p2, [%x[output_ptr], #6, MUL VL]\n"
+      "st1w { z31.s }, p1, [%x[output_ptr], #7, MUL VL]\n"
+      "addvl %x[output_ptr], %x[output_ptr], #8\n"
+      "bgt 1b\n"
+      "57:"  // Exit
+
+      : [B_ptr] "+r" (B_ptr), [N] "+r" (N), [output_ptr] "+r" (output_ptr)
+      : [A_ptr] "r" (A_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval))
+      : "cc", "memory", "p0", "p1", "p2", "x19", "x20", "x21", "x22", "x23", "x24", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+    );
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp
deleted file mode 100644
index 385a16fe10..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp
+++ /dev/null
@@ -1,2247 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include "../../bfloat.hpp"
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
-    const int K_stride = ((K + 1) / 2) * 2;
-    const long loops_count = ((K + 8) / 16) - 1;
-    K -= loops_count * 16;
-    const long regs_count = (K / 8) - 1;
-    K -= (regs_count + 1) * 8;
-    const long leftovers = K;
-    const long blocks_count = (K + 1) / 2;
-    float nullbias[256];
-    if (!accumulate && !bias) {
-        memset(nullbias, 0, (4 * get_vector_length<float>() * sizeof(float)));
-    }
-    float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
-    float maxval =   static_cast<float>(std::numeric_limits<float>::infinity());
-    const float * const minptr = &minval;
-    const float * const maxptr = &maxval;
-
-    switch(act.type)
-    {
-        default:
-        case Activation::Type::None:
-            break;
-        case Activation::Type::BoundedReLU:
-            maxval = static_cast<float>(act.param1);
-            /* fall through */
-        case Activation::Type::ReLU:
-            minval = 0.0f;
-            break;
-    }
-
-    int rows_to_compute;
-
-    for (int y=0; y<M; y+=rows_to_compute) {
-        const bfloat16 * const a_ptr0_base = A + (y * lda);
-        const unsigned long ldab = lda * sizeof(bfloat16);
-
-        float *c_ptr0 = C + (y * ldc);
-
-        rows_to_compute = M-y;
-        if (rows_to_compute > 4) {
-            if (rows_to_compute % 4) {
-                rows_to_compute = 4 - 1;
-            } else {
-                rows_to_compute = 4;
-            }
-        }
-
-        for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) {
-            const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>()));
-            long loops = loops_count;
-            long regs = regs_count;
-            long temp = 0;
-            long blocks = blocks_count;
-            const bfloat16 *a_ptr0 = a_ptr0_base;
-            const bfloat16 *b_ptr0 = B + (K_stride * x0);
-            const unsigned long ldcb = ldc * sizeof(float);
-            const float *biasptr = bias ? bias+x0 : nullbias;
-
-            switch(rows_to_compute) {
-                case 1:
-                    __asm __volatile (
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ld1w z16.s, p0/z, [%[biasptr]]\n"
-                        "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
-                        ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
-                        ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
-                        ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
-                        ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
-                        ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
-                        ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
-                        ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
-                        ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
-                        ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
-                        ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
-                        ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
-                        ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
-                    );
-                    break;
-                case 2:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ld1w z16.s, p0/z, [%[biasptr]]\n"
-                        "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "mov z20.d, z16.d\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "mov z21.d, z17.d\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "mov z22.d, z18.d\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "mov z23.d, z19.d\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1w z20.s, p0/z, [c_ptr1]\n"
-                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
-                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
-                        ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
-                        ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
-                        ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
-                        ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
-                        ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
-                        ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
-                        ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
-                        ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
-                        ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
-                        ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
-                        ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
-                        ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
-                        ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
-                        ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
-                        ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
-                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
-                        ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
-                        ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
-                        ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
-                        ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
-                        ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
-                        ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
-                        ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
-                        ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
-                        ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
-                        ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
-                        ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
-                        ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
-                        ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
-                        ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
-                        ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
-                        ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
-                        ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
-                        ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
-                        ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
-                        ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
-                        ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
-                        ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
-                        ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
-                        ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
-                        ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
-                        ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
-                        ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
-                        ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
-                        ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
-                        ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
-                        ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
-                        ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
-                        ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
-                        ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
-                        ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
-                        ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
-                        ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
-                        ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
-                        ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
-                        ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
-                        ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
-                        ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
-                        ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
-                        ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
-                        ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
-                        ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
-                        ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
-                        ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "st1w z20.s, p0, [c_ptr1]\n"
-                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
-                    );
-                    break;
-                case 3:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ld1w z16.s, p0/z, [%[biasptr]]\n"
-                        "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "mov z20.d, z16.d\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "mov z21.d, z17.d\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "mov z22.d, z18.d\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "mov z23.d, z19.d\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "mov z24.d, z16.d\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "mov z25.d, z17.d\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "mov z26.d, z18.d\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z27.d, z19.d\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1w z20.s, p0/z, [c_ptr1]\n"
-                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
-                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
-                        "ld1w z24.s, p0/z, [c_ptr2]\n"
-                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
-                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
-                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
-                        ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
-                        ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
-                        ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
-                        ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
-                        ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
-                        ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
-                        ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
-                        ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
-                        ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
-                        ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
-                        ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
-                        ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
-                        ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
-                        ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
-                        ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
-                        ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
-                        ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
-                        ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
-                        ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
-                        ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
-                        ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
-                        ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
-                        ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
-                        ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
-                        ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
-                        ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
-                        ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
-                        ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
-                        ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
-                        ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n"
-                        ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
-                        ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
-                        ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
-                        ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
-                        ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
-                        ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
-                        ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
-                        ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
-                        ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
-                        ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
-                        ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
-                        ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
-                        ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
-                        ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
-                        ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
-                        ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
-                        ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
-                        ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
-                        ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
-                        ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
-                        ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
-                        ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
-                        ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
-                        ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
-                        ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
-                        ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
-                        ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
-                        ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
-                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
-                        ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
-                        "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
-                        ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
-                        ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
-                        ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
-                        ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
-                        ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
-                        ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
-                        ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
-                        ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
-                        ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
-                        ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
-                        ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
-                        ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
-                        ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
-                        ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n"
-                        ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
-                        ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
-                        ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n"
-                        ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
-                        ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
-                        ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n"
-                        ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
-                        ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
-                        ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n"
-                        ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
-                        ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
-                        ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n"
-                        ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
-                        ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
-                        ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n"
-                        ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
-                        ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
-                        ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n"
-                        ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
-                        ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
-                        ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
-                        ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
-                        ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
-                        ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
-                        ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
-                        ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
-                        ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
-                        ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
-                        ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
-                        ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
-                        ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
-                        ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
-                        ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
-                        ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
-                        ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
-                        ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
-                        ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
-                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
-                        ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
-                        ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
-                        "ld1rqh z6.h, p6/z, [a_ptr2]\n"
-                        ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
-                        ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
-                        ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
-                        ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
-                        ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
-                        ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
-                        ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
-                        ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
-                        ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
-                        ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
-                        ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
-                        ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
-                        ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
-                        ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
-                        ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n"
-                        ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
-                        ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
-                        ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n"
-                        ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
-                        ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
-                        ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n"
-                        ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
-                        ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
-                        ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
-                        ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n"
-                        ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
-                        ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
-                        ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n"
-                        ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
-                        ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
-                        ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n"
-                        ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
-                        ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
-                        ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
-                        ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
-                        ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n"
-                        ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
-                        ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
-                        ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n"
-                        ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
-                        ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
-                        ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n"
-                        ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
-                        ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
-                        ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
-                        ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
-                        ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n"
-                        ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
-                        ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
-                        ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n"
-                        ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
-                        ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
-                        ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n"
-                        ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
-                        ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
-                        ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "fmax z24.s, p7/m, z24.s, z14.s\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "fmax z25.s, p7/m, z25.s, z14.s\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "fmax z26.s, p7/m, z26.s, z14.s\n"
-                        "st1w z20.s, p0, [c_ptr1]\n"
-                        "fmin z24.s, p7/m, z24.s, z15.s\n"
-                        "fmin z25.s, p7/m, z25.s, z15.s\n"
-                        "fmax z27.s, p7/m, z27.s, z14.s\n"
-                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "fmin z26.s, p7/m, z26.s, z15.s\n"
-                        "fmin z27.s, p7/m, z27.s, z15.s\n"
-                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
-                        "st1w z24.s, p0, [c_ptr2]\n"
-                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
-                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
-                    );
-                    break;
-                default:
-                case 4:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ld1w z16.s, p0/z, [%[biasptr]]\n"
-                        "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "mov z20.d, z16.d\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "mov z21.d, z17.d\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "mov z22.d, z18.d\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "mov z23.d, z19.d\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3]\n"
-                        "mov z24.d, z16.d\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "mov z25.d, z17.d\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "mov z26.d, z18.d\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "mov z27.d, z19.d\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z28.d, z16.d\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "mov z29.d, z17.d\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "mov z30.d, z18.d\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "mov z31.d, z19.d\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1w z20.s, p0/z, [c_ptr1]\n"
-                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
-                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
-                        "ld1w z24.s, p0/z, [c_ptr2]\n"
-                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
-                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
-                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
-                        "ld1w z28.s, p0/z, [c_ptr3]\n"
-                        "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
-                        "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
-                        "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        "ld1rqh z7.h, p7/z, [a_ptr3]\n"
-                        ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
-                        ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
-                        ".inst 0x6463417f // bfdot z31.s, z11.h, z3.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
-                        ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
-                        ".inst 0x646b419c // bfdot z28.s, z12.h, z3.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
-                        ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
-                        ".inst 0x646b41bd // bfdot z29.s, z13.h, z3.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
-                        ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
-                        ".inst 0x646b41de // bfdot z30.s, z14.h, z3.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
-                        ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
-                        ".inst 0x646b41ff // bfdot z31.s, z15.h, z3.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
-                        ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
-                        ".inst 0x6473411c // bfdot z28.s, z8.h, z3.h[2]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
-                        ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
-                        ".inst 0x6473413d // bfdot z29.s, z9.h, z3.h[2]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
-                        ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
-                        ".inst 0x6473415e // bfdot z30.s, z10.h, z3.h[2]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
-                        ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
-                        ".inst 0x6473417f // bfdot z31.s, z11.h, z3.h[2]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
-                        ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
-                        ".inst 0x647b419c // bfdot z28.s, z12.h, z3.h[3]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
-                        ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
-                        ".inst 0x647b41bd // bfdot z29.s, z13.h, z3.h[3]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
-                        ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
-                        ".inst 0x647b41de // bfdot z30.s, z14.h, z3.h[3]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
-                        ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
-                        ".inst 0x647b41ff // bfdot z31.s, z15.h, z3.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
-                        ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
-                        ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n"
-                        ".inst 0x6467411c // bfdot z28.s, z8.h, z7.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
-                        ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
-                        ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n"
-                        ".inst 0x6467413d // bfdot z29.s, z9.h, z7.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
-                        ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
-                        ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n"
-                        ".inst 0x6467415e // bfdot z30.s, z10.h, z7.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
-                        ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
-                        ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n"
-                        ".inst 0x6467417f // bfdot z31.s, z11.h, z7.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
-                        ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
-                        ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n"
-                        ".inst 0x646f419c // bfdot z28.s, z12.h, z7.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
-                        ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
-                        ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n"
-                        ".inst 0x646f41bd // bfdot z29.s, z13.h, z7.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
-                        ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
-                        ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n"
-                        ".inst 0x646f41de // bfdot z30.s, z14.h, z7.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
-                        ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
-                        ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n"
-                        ".inst 0x646f41ff // bfdot z31.s, z15.h, z7.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
-                        ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n"
-                        ".inst 0x6477411c // bfdot z28.s, z8.h, z7.h[2]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
-                        ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
-                        ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n"
-                        ".inst 0x6477413d // bfdot z29.s, z9.h, z7.h[2]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
-                        ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
-                        ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n"
-                        ".inst 0x6477415e // bfdot z30.s, z10.h, z7.h[2]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
-                        ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
-                        ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n"
-                        ".inst 0x6477417f // bfdot z31.s, z11.h, z7.h[2]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
-                        ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
-                        ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n"
-                        ".inst 0x647f419c // bfdot z28.s, z12.h, z7.h[3]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
-                        ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
-                        ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n"
-                        ".inst 0x647f41bd // bfdot z29.s, z13.h, z7.h[3]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
-                        ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
-                        ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n"
-                        ".inst 0x647f41de // bfdot z30.s, z14.h, z7.h[3]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
-                        ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
-                        ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n"
-                        ".inst 0x647f41ff // bfdot z31.s, z15.h, z7.h[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        "ld1rqh z7.h, p7/z, [a_ptr3]\n"
-                        ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
-                        ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
-                        ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
-                        ".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
-                        ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
-                        ".inst 0x6463417f // bfdot z31.s, z11.h, z3.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
-                        ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
-                        ".inst 0x646b419c // bfdot z28.s, z12.h, z3.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
-                        ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
-                        ".inst 0x646b41bd // bfdot z29.s, z13.h, z3.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
-                        ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
-                        ".inst 0x646b41de // bfdot z30.s, z14.h, z3.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
-                        ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
-                        ".inst 0x646b41ff // bfdot z31.s, z15.h, z3.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
-                        ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
-                        ".inst 0x6473411c // bfdot z28.s, z8.h, z3.h[2]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
-                        ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
-                        ".inst 0x6473413d // bfdot z29.s, z9.h, z3.h[2]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
-                        ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
-                        ".inst 0x6473415e // bfdot z30.s, z10.h, z3.h[2]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
-                        ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
-                        ".inst 0x6473417f // bfdot z31.s, z11.h, z3.h[2]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
-                        ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
-                        ".inst 0x647b419c // bfdot z28.s, z12.h, z3.h[3]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
-                        ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
-                        ".inst 0x647b41bd // bfdot z29.s, z13.h, z3.h[3]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
-                        ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
-                        ".inst 0x647b41de // bfdot z30.s, z14.h, z3.h[3]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
-                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
-                        ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
-                        "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
-                        ".inst 0x647b41ff // bfdot z31.s, z15.h, z3.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
-                        "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
-                        ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        ".inst 0x6467411c // bfdot z28.s, z8.h, z7.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
-                        "addvl a_ptr3, a_ptr3, #2\n"
-                        ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n"
-                        ".inst 0x6467413d // bfdot z29.s, z9.h, z7.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
-                        ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
-                        ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n"
-                        ".inst 0x6467415e // bfdot z30.s, z10.h, z7.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
-                        ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
-                        ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n"
-                        ".inst 0x6467417f // bfdot z31.s, z11.h, z7.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
-                        ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
-                        ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n"
-                        ".inst 0x646f419c // bfdot z28.s, z12.h, z7.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
-                        ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
-                        ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n"
-                        ".inst 0x646f41bd // bfdot z29.s, z13.h, z7.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
-                        ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
-                        ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n"
-                        ".inst 0x646f41de // bfdot z30.s, z14.h, z7.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
-                        ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
-                        ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n"
-                        ".inst 0x646f41ff // bfdot z31.s, z15.h, z7.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
-                        ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n"
-                        ".inst 0x6477411c // bfdot z28.s, z8.h, z7.h[2]\n"
-                        ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
-                        ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
-                        ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n"
-                        ".inst 0x6477413d // bfdot z29.s, z9.h, z7.h[2]\n"
-                        ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
-                        ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
-                        ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n"
-                        ".inst 0x6477415e // bfdot z30.s, z10.h, z7.h[2]\n"
-                        ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
-                        ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
-                        ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n"
-                        ".inst 0x6477417f // bfdot z31.s, z11.h, z7.h[2]\n"
-                        ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
-                        ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
-                        ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n"
-                        ".inst 0x647f419c // bfdot z28.s, z12.h, z7.h[3]\n"
-                        ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
-                        ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
-                        ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n"
-                        ".inst 0x647f41bd // bfdot z29.s, z13.h, z7.h[3]\n"
-                        ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
-                        ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
-                        ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n"
-                        ".inst 0x647f41de // bfdot z30.s, z14.h, z7.h[3]\n"
-                        ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
-                        ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
-                        ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n"
-                        ".inst 0x647f41ff // bfdot z31.s, z15.h, z7.h[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
-                        ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
-                        ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
-                        ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
-                        ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
-                        ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
-                        ".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
-                        ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
-                        ".inst 0x6463417f // bfdot z31.s, z11.h, z3.h[0]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
-                        ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
-                        ".inst 0x646b419c // bfdot z28.s, z12.h, z3.h[1]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
-                        ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
-                        ".inst 0x646b41bd // bfdot z29.s, z13.h, z3.h[1]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
-                        ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
-                        ".inst 0x646b41de // bfdot z30.s, z14.h, z3.h[1]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
-                        ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
-                        ".inst 0x646b41ff // bfdot z31.s, z15.h, z3.h[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
-                        ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
-                        ".inst 0x6473411c // bfdot z28.s, z8.h, z3.h[2]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
-                        ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
-                        ".inst 0x6473413d // bfdot z29.s, z9.h, z3.h[2]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
-                        ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
-                        ".inst 0x6473415e // bfdot z30.s, z10.h, z3.h[2]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
-                        ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
-                        ".inst 0x6473417f // bfdot z31.s, z11.h, z3.h[2]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
-                        ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
-                        ".inst 0x647b419c // bfdot z28.s, z12.h, z3.h[3]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
-                        ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
-                        ".inst 0x647b41bd // bfdot z29.s, z13.h, z3.h[3]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
-                        ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
-                        ".inst 0x647b41de // bfdot z30.s, z14.h, z3.h[3]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
-                        ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
-                        ".inst 0x647b41ff // bfdot z31.s, z15.h, z3.h[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
-                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
-                        ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
-                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
-                        ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        "ld1rqh z6.h, p6/z, [a_ptr2]\n"
-                        ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
-                        "ld1rqh z7.h, p6/z, [a_ptr3]\n"
-                        ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
-                        "addvl a_ptr3, a_ptr3, #1\n"
-                        ".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
-                        ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
-                        ".inst 0x6463417f // bfdot z31.s, z11.h, z3.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
-                        ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
-                        ".inst 0x646b419c // bfdot z28.s, z12.h, z3.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
-                        ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
-                        ".inst 0x646b41bd // bfdot z29.s, z13.h, z3.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
-                        ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
-                        ".inst 0x646b41de // bfdot z30.s, z14.h, z3.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
-                        ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
-                        ".inst 0x646b41ff // bfdot z31.s, z15.h, z3.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
-                        ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
-                        ".inst 0x6473411c // bfdot z28.s, z8.h, z3.h[2]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
-                        ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
-                        ".inst 0x6473413d // bfdot z29.s, z9.h, z3.h[2]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
-                        ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
-                        ".inst 0x6473415e // bfdot z30.s, z10.h, z3.h[2]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
-                        ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
-                        ".inst 0x6473417f // bfdot z31.s, z11.h, z3.h[2]\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
-                        ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
-                        ".inst 0x647b419c // bfdot z28.s, z12.h, z3.h[3]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
-                        ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
-                        ".inst 0x647b41bd // bfdot z29.s, z13.h, z3.h[3]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
-                        ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
-                        ".inst 0x647b41de // bfdot z30.s, z14.h, z3.h[3]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
-                        ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
-                        ".inst 0x647b41ff // bfdot z31.s, z15.h, z3.h[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
-                        ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n"
-                        ".inst 0x6467411c // bfdot z28.s, z8.h, z7.h[0]\n"
-                        ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
-                        ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
-                        ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n"
-                        ".inst 0x6467413d // bfdot z29.s, z9.h, z7.h[0]\n"
-                        ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
-                        ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
-                        ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n"
-                        ".inst 0x6467415e // bfdot z30.s, z10.h, z7.h[0]\n"
-                        ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
-                        ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
-                        ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n"
-                        ".inst 0x6467417f // bfdot z31.s, z11.h, z7.h[0]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
-                        ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n"
-                        ".inst 0x646f419c // bfdot z28.s, z12.h, z7.h[1]\n"
-                        ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
-                        ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
-                        ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n"
-                        ".inst 0x646f41bd // bfdot z29.s, z13.h, z7.h[1]\n"
-                        ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
-                        ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
-                        ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n"
-                        ".inst 0x646f41de // bfdot z30.s, z14.h, z7.h[1]\n"
-                        ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
-                        ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
-                        ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n"
-                        ".inst 0x646f41ff // bfdot z31.s, z15.h, z7.h[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
-                        ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
-                        ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n"
-                        ".inst 0x6477411c // bfdot z28.s, z8.h, z7.h[2]\n"
-                        ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
-                        ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
-                        ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n"
-                        ".inst 0x6477413d // bfdot z29.s, z9.h, z7.h[2]\n"
-                        ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
-                        ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
-                        ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n"
-                        ".inst 0x6477415e // bfdot z30.s, z10.h, z7.h[2]\n"
-                        ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
-                        ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
-                        ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n"
-                        ".inst 0x6477417f // bfdot z31.s, z11.h, z7.h[2]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
-                        ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
-                        ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n"
-                        ".inst 0x647f419c // bfdot z28.s, z12.h, z7.h[3]\n"
-                        ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
-                        ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
-                        ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n"
-                        ".inst 0x647f41bd // bfdot z29.s, z13.h, z7.h[3]\n"
-                        ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
-                        ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
-                        ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n"
-                        ".inst 0x647f41de // bfdot z30.s, z14.h, z7.h[3]\n"
-                        ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
-                        ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
-                        ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n"
-                        ".inst 0x647f41ff // bfdot z31.s, z15.h, z7.h[3]\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "fmax z24.s, p7/m, z24.s, z14.s\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "fmax z25.s, p7/m, z25.s, z14.s\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "fmax z26.s, p7/m, z26.s, z14.s\n"
-                        "st1w z20.s, p0, [c_ptr1]\n"
-                        "fmin z24.s, p7/m, z24.s, z15.s\n"
-                        "fmin z25.s, p7/m, z25.s, z15.s\n"
-                        "fmax z27.s, p7/m, z27.s, z14.s\n"
-                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "fmin z26.s, p7/m, z26.s, z15.s\n"
-                        "fmax z28.s, p7/m, z28.s, z14.s\n"
-                        "fmax z29.s, p7/m, z29.s, z14.s\n"
-                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "fmin z27.s, p7/m, z27.s, z15.s\n"
-                        "fmax z30.s, p7/m, z30.s, z14.s\n"
-                        "fmin z28.s, p7/m, z28.s, z15.s\n"
-                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
-                        "fmin z29.s, p7/m, z29.s, z15.s\n"
-                        "fmax z31.s, p7/m, z31.s, z14.s\n"
-                        "fmin z30.s, p7/m, z30.s, z15.s\n"
-                        "st1w z24.s, p0, [c_ptr2]\n"
-                        "fmin z31.s, p7/m, z31.s, z15.s\n"
-                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
-                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
-                        "st1w z28.s, p0, [c_ptr3]\n"
-                        "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
-                        "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
-                        "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
-                    );
-                    break;
-            }
-
-        }
-    }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp
index eba98bb74d..e344d82dc6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,42 +10,49 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
  */
 #pragma once
-
 #ifdef __ARM_FEATURE_SVE
 
-#include "../bfloat.hpp"
 #include "../std_transforms_sve.hpp"
+#include "../bfloat.hpp"
+
+#define ARGLIST  \
+   unsigned int, const unsigned int *, \
+   IndirectInputArg<bfloat16>, \
+   size_t, size_t, \
+   const bfloat16 *, \
+   IndirectOutputArg<float>, \
+   const float *, Activation, bool
 
 namespace arm_gemm
 {
 
 // Actual kernel implementations
-void sve_hybrid_bf16fp32_dot_4VLx4(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool);
+void sve_hybrid_bf16fp32_dot_6x4VL( ARGLIST );
 
-class hybrid_bf16fp32_dot_4VLx4
+class cls_sve_hybrid_bf16fp32_dot_6x4VL
 {
 public:
     typedef bfloat16 operand_type;
     typedef float result_type;
 
-    typedef void (*kern_type)(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool);
+    typedef void (*kern_type)( ARGLIST );
 
     /* Kernel blocking parameters */
     static constexpr unsigned int out_height()
     {
-        return 4;
+        return 6;
     }
 
     static unsigned int out_width()
@@ -63,27 +70,17 @@ public:
         return true;
     }
 
-    static constexpr bool supports_bias()
-    {
-        return true;
-    }
-
-    static constexpr bool supports_activation()
-    {
-        return true;
-    }
-
-    StdTransformsSVE<operand_type, result_type, 4, 4, 2> transforms = {};
+    StdTransformsSVE<operand_type, result_type, 6, 4, 2> transforms = {};
 
     // Default to the generic kernel
-    kern_type kernel=sve_hybrid_bf16fp32_dot_4VLx4;
+    kern_type kernel=sve_hybrid_bf16fp32_dot_6x4VL;
 
-    hybrid_bf16fp32_dot_4VLx4(const CPUInfo *)
+    cls_sve_hybrid_bf16fp32_dot_6x4VL(const CPUInfo *)
     {
-
     }
 };
 
 } // namespace arm_gemm
 
+#undef ARGLIST
 #endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp
new file mode 100644
index 0000000000..19385e56ea
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp
@@ -0,0 +1,2237 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_hybrid_bf16fp32_dot_6x4VL (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<bfloat16> A_arg,
+    size_t M, size_t N, const bfloat16 *B_ptr, IndirectOutputArg<float> output_arg,
+    const float *bias, Activation act, bool accumulate
+)
+{
+    struct KernelArgs {  // Scalar arguments the assembly fetches via [args_ptr + offsetof_*]; args_ptr is presumably bound to &ka in the operand list (outside this view - confirm).
+        float maxval = static_cast<float>(std::numeric_limits<float>::infinity());  // Upper clamp bound: +inf unless BoundedReLU replaces it with act.param1.
+        float minval = - static_cast<float>(std::numeric_limits<float>::infinity());  // Lower clamp bound: -inf unless ReLU/BoundedReLU sets it to 0.
+        unsigned int num_strings = {};  // Copy of the num_strings argument.
+        const unsigned int *string_lengths = {};  // Copy of the string_lengths argument; the string loop loads one 32-bit length per string index.
+        size_t N = {};  // Copy of N; in the Height 1 path it is loaded into x16 and used as the whilelt bound for the column predicates.
+        const bfloat16 *B_ptr = {};  // Copy of the B_ptr argument.
+        size_t output_offset = {};  // output_arg.indirect.offset for indirect output, otherwise output_arg.direct.stride.
+        size_t input_initial_col = {};  // A_arg.indirect.start_col; assigned only for indirect input, where the asm adds it (scaled by 2 bytes) to the first string's row pointer.
+        size_t input_offset = {};  // A_arg.indirect.start_row for indirect input, otherwise A_arg.direct.stride.
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    if (accumulate) {
+        flags |= 0x1;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    switch(act.type) {
+        default:
+        case Activation::Type::None:
+            break;
+        case Activation::Type::BoundedReLU:
+            ka.maxval = static_cast<float>(act.param1);
+            /* fall through */
+        case Activation::Type::ReLU:
+            ka.minval = 0;
+            flags |= 0x2;
+            break;
+    }
+    __asm__ __volatile__(
+      "ptrue p5.b\n"
+      "1:"  // Row loop
+      "cmp %x[M], #0x6\n"
+      "bge 71f\n"
+      "cmp %x[M], #0x4\n"
+      "bgt 57f\n"
+      "beq 43f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 29f\n"
+      "beq 15f\n"
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 2f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "b 3f\n"
+      "2:"  // Height 1: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "3:"  // Height 1: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p3.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p2.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p1.s, x19, x16\n"
+      "cbz x14, 4f\n"
+      "ld1w { z8.s }, p5/Z, [x14]\n"
+      "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+      "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+      "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "b 6f\n"
+      "4:"  // Height 1: no bias
+      "tbz %x[flags], #0, 5f\n"
+      "ld1w { z8.s }, p4/Z, [x13]\n"
+      "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+      "b 6f\n"
+      "5:"  // Height 1: no accumulate
+      "mov z8.b, #0x0\n"
+      "mov z9.b, #0x0\n"
+      "mov z10.b, #0x0\n"
+      "mov z11.b, #0x0\n"
+      "6:"  // Height 1: setup done
+      "mov x12, #0x0\n"
+      "7:"  // Height 1: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 8f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "cbnz x12, 9f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #1\n"
+      "b 9f\n"
+      "8:"  // Height 1: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "9:"  // Height 1: input setup done
+      "cmp x11, #0x8\n"
+      "ble 11f\n"
+      "10:"  // Height 1: Multiply loop: Main loop head
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "whilelt p0.h, XZR, x11\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "sub x11, x11, #0x8\n"
+      "ld1rqh { z0.h }, p0/Z, [x10]\n"
+      ".inst 0x646040c8  // bfdot z8.s, z6.h, z0.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x646040e9  // bfdot z9.s, z7.h, z0.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "cmp x11, #0x8\n"
+      ".inst 0x646040ca  // bfdot z10.s, z6.h, z0.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      ".inst 0x646040eb  // bfdot z11.s, z7.h, z0.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+      ".inst 0x646840c8  // bfdot z8.s, z6.h, z0.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+      ".inst 0x646840e9  // bfdot z9.s, z7.h, z0.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+      "addvl x15, x15, #16\n"
+      ".inst 0x646840ca  // bfdot z10.s, z6.h, z0.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+      ".inst 0x646840eb  // bfdot z11.s, z7.h, z0.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+      ".inst 0x647040c8  // bfdot z8.s, z6.h, z0.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+      ".inst 0x647040e9  // bfdot z9.s, z7.h, z0.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+      ".inst 0x647040ca  // bfdot z10.s, z6.h, z0.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+      ".inst 0x647040eb  // bfdot z11.s, z7.h, z0.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+      ".inst 0x647840c8  // bfdot z8.s, z6.h, z0.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+      ".inst 0x647840e9  // bfdot z9.s, z7.h, z0.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+      ".inst 0x647840ca  // bfdot z10.s, z6.h, z0.h[3]\n"
+      ".inst 0x647840eb  // bfdot z11.s, z7.h, z0.h[3]\n"
+      "bgt 10b\n"
+      "11:"  // Height 1: Multiply loop: Single iteration only
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "whilelt p0.h, XZR, x11\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x2\n"
+      "ld1rqh { z0.h }, p0/Z, [x10]\n"
+      ".inst 0x646040c8  // bfdot z8.s, z6.h, z0.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x646040e9  // bfdot z9.s, z7.h, z0.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      ".inst 0x646040ca  // bfdot z10.s, z6.h, z0.h[0]\n"
+      ".inst 0x646040eb  // bfdot z11.s, z7.h, z0.h[0]\n"
+      "ble 12f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      ".inst 0x646840c8  // bfdot z8.s, z6.h, z0.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x2\n"
+      ".inst 0x646840e9  // bfdot z9.s, z7.h, z0.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      ".inst 0x646840ca  // bfdot z10.s, z6.h, z0.h[1]\n"
+      "addvl x15, x15, #4\n"
+      ".inst 0x646840eb  // bfdot z11.s, z7.h, z0.h[1]\n"
+      "ble 12f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      ".inst 0x647040c8  // bfdot z8.s, z6.h, z0.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x2\n"
+      ".inst 0x647040e9  // bfdot z9.s, z7.h, z0.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      ".inst 0x647040ca  // bfdot z10.s, z6.h, z0.h[2]\n"
+      "addvl x15, x15, #4\n"
+      ".inst 0x647040eb  // bfdot z11.s, z7.h, z0.h[2]\n"
+      "ble 12f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      ".inst 0x647840c8  // bfdot z8.s, z6.h, z0.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      ".inst 0x647840e9  // bfdot z9.s, z7.h, z0.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      ".inst 0x647840ca  // bfdot z10.s, z6.h, z0.h[3]\n"
+      ".inst 0x647840eb  // bfdot z11.s, z7.h, z0.h[3]\n"
+      "12:"  // Height 1: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 7b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "tbz %x[flags], #1, 13f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z1.s }, p5/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rw { z0.s }, p5/Z, [x19]\n"
+      "fmin z8.s, p5/M, z8.s, z0.s\n"
+      "fmin z9.s, p5/M, z9.s, z0.s\n"
+      "fmin z10.s, p5/M, z10.s, z0.s\n"
+      "fmin z11.s, p5/M, z11.s, z0.s\n"
+      "fmax z8.s, p5/M, z8.s, z1.s\n"
+      "fmax z9.s, p5/M, z9.s, z1.s\n"
+      "fmax z10.s, p5/M, z10.s, z1.s\n"
+      "fmax z11.s, p5/M, z11.s, z1.s\n"
+      "13:"  // Height 1: No activation
+      "st1w { z8.s }, p4, [x13]\n"
+      "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+      "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+      "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "14:"  // Height 1: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x16, x16, x19\n"
+      "bgt 3b\n"
+      "b 86f\n"
+      "15:"  // Height 2
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 16f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "b 17f\n"
+      "16:"  // Height 2: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "17:"  // Height 2: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p3.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p2.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p1.s, x19, x16\n"
+      "cbz x14, 18f\n"
+      "ld1w { z8.s }, p5/Z, [x14]\n"
+      "mov z12.d, z8.d\n"
+      "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+      "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+      "mov z13.d, z9.d\n"
+      "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "mov z14.d, z10.d\n"
+      "mov z15.d, z11.d\n"
+      "b 20f\n"
+      "18:"  // Height 2: no bias
+      "tbz %x[flags], #0, 19f\n"
+      "ld1w { z8.s }, p4/Z, [x13]\n"
+      "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x9]\n"
+      "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+      "b 20f\n"
+      "19:"  // Height 2: no accumulate
+      "mov z8.b, #0x0\n"
+      "mov z9.b, #0x0\n"
+      "mov z10.b, #0x0\n"
+      "mov z11.b, #0x0\n"
+      "mov z12.b, #0x0\n"
+      "mov z13.b, #0x0\n"
+      "mov z14.b, #0x0\n"
+      "mov z15.b, #0x0\n"
+      "20:"  // Height 2: setup done
+      "mov x12, #0x0\n"
+      "21:"  // Height 2: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 22f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "cbnz x12, 23f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #1\n"
+      "add x28, x28, x19, LSL #1\n"
+      "b 23f\n"
+      "22:"  // Height 2: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #1\n"
+      "23:"  // Height 2: input setup done
+      "cmp x11, #0x8\n"
+      "ble 25f\n"
+      "24:"  // Height 2: Multiply loop: Main loop head
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "whilelt p0.h, XZR, x11\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "sub x11, x11, #0x8\n"
+      "ld1rqh { z0.h }, p0/Z, [x10]\n"
+      ".inst 0x646040c8  // bfdot z8.s, z6.h, z0.h[0]\n"
+      "ld1rqh { z1.h }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x646040e9  // bfdot z9.s, z7.h, z0.h[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x646140cc  // bfdot z12.s, z6.h, z1.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "cmp x11, #0x8\n"
+      ".inst 0x646140ed  // bfdot z13.s, z7.h, z1.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      ".inst 0x646040ca  // bfdot z10.s, z6.h, z0.h[0]\n"
+      ".inst 0x646140ce  // bfdot z14.s, z6.h, z1.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+      ".inst 0x646040eb  // bfdot z11.s, z7.h, z0.h[0]\n"
+      ".inst 0x646140ef  // bfdot z15.s, z7.h, z1.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+      ".inst 0x646840c8  // bfdot z8.s, z6.h, z0.h[1]\n"
+      ".inst 0x646940cc  // bfdot z12.s, z6.h, z1.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+      ".inst 0x646840e9  // bfdot z9.s, z7.h, z0.h[1]\n"
+      ".inst 0x646940ed  // bfdot z13.s, z7.h, z1.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+      "addvl x15, x15, #16\n"
+      ".inst 0x646840ca  // bfdot z10.s, z6.h, z0.h[1]\n"
+      ".inst 0x646940ce  // bfdot z14.s, z6.h, z1.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+      ".inst 0x646840eb  // bfdot z11.s, z7.h, z0.h[1]\n"
+      ".inst 0x646940ef  // bfdot z15.s, z7.h, z1.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+      ".inst 0x647040c8  // bfdot z8.s, z6.h, z0.h[2]\n"
+      ".inst 0x647140cc  // bfdot z12.s, z6.h, z1.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+      ".inst 0x647040e9  // bfdot z9.s, z7.h, z0.h[2]\n"
+      ".inst 0x647140ed  // bfdot z13.s, z7.h, z1.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+      ".inst 0x647040ca  // bfdot z10.s, z6.h, z0.h[2]\n"
+      ".inst 0x647140ce  // bfdot z14.s, z6.h, z1.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+      ".inst 0x647040eb  // bfdot z11.s, z7.h, z0.h[2]\n"
+      ".inst 0x647140ef  // bfdot z15.s, z7.h, z1.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+      ".inst 0x647840c8  // bfdot z8.s, z6.h, z0.h[3]\n"
+      ".inst 0x647940cc  // bfdot z12.s, z6.h, z1.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+      ".inst 0x647840e9  // bfdot z9.s, z7.h, z0.h[3]\n"
+      ".inst 0x647940ed  // bfdot z13.s, z7.h, z1.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+      ".inst 0x647840ca  // bfdot z10.s, z6.h, z0.h[3]\n"
+      ".inst 0x647940ce  // bfdot z14.s, z6.h, z1.h[3]\n"
+      ".inst 0x647840eb  // bfdot z11.s, z7.h, z0.h[3]\n"
+      ".inst 0x647940ef  // bfdot z15.s, z7.h, z1.h[3]\n"
+      "bgt 24b\n"
+      "25:"  // Height 2: Multiply loop: Single iteration only
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "whilelt p0.h, XZR, x11\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x2\n"
+      "ld1rqh { z0.h }, p0/Z, [x10]\n"
+      ".inst 0x646040c8  // bfdot z8.s, z6.h, z0.h[0]\n"
+      "ld1rqh { z1.h }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x646040e9  // bfdot z9.s, z7.h, z0.h[0]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x646140cc  // bfdot z12.s, z6.h, z1.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      ".inst 0x646140ed  // bfdot z13.s, z7.h, z1.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      ".inst 0x646040ca  // bfdot z10.s, z6.h, z0.h[0]\n"
+      ".inst 0x646140ce  // bfdot z14.s, z6.h, z1.h[0]\n"
+      ".inst 0x646040eb  // bfdot z11.s, z7.h, z0.h[0]\n"
+      ".inst 0x646140ef  // bfdot z15.s, z7.h, z1.h[0]\n"
+      "ble 26f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      ".inst 0x646840c8  // bfdot z8.s, z6.h, z0.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x2\n"
+      ".inst 0x646940cc  // bfdot z12.s, z6.h, z1.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      ".inst 0x646840e9  // bfdot z9.s, z7.h, z0.h[1]\n"
+      ".inst 0x646940ed  // bfdot z13.s, z7.h, z1.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      ".inst 0x646840ca  // bfdot z10.s, z6.h, z0.h[1]\n"
+      ".inst 0x646940ce  // bfdot z14.s, z6.h, z1.h[1]\n"
+      ".inst 0x646840eb  // bfdot z11.s, z7.h, z0.h[1]\n"
+      ".inst 0x646940ef  // bfdot z15.s, z7.h, z1.h[1]\n"
+      "ble 26f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      ".inst 0x647040c8  // bfdot z8.s, z6.h, z0.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x2\n"
+      ".inst 0x647140cc  // bfdot z12.s, z6.h, z1.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      ".inst 0x647040e9  // bfdot z9.s, z7.h, z0.h[2]\n"
+      ".inst 0x647140ed  // bfdot z13.s, z7.h, z1.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      ".inst 0x647040ca  // bfdot z10.s, z6.h, z0.h[2]\n"
+      ".inst 0x647140ce  // bfdot z14.s, z6.h, z1.h[2]\n"
+      ".inst 0x647040eb  // bfdot z11.s, z7.h, z0.h[2]\n"
+      ".inst 0x647140ef  // bfdot z15.s, z7.h, z1.h[2]\n"
+      "ble 26f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      ".inst 0x647840c8  // bfdot z8.s, z6.h, z0.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      ".inst 0x647940cc  // bfdot z12.s, z6.h, z1.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      ".inst 0x647840e9  // bfdot z9.s, z7.h, z0.h[3]\n"
+      ".inst 0x647940ed  // bfdot z13.s, z7.h, z1.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      ".inst 0x647840ca  // bfdot z10.s, z6.h, z0.h[3]\n"
+      ".inst 0x647940ce  // bfdot z14.s, z6.h, z1.h[3]\n"
+      ".inst 0x647840eb  // bfdot z11.s, z7.h, z0.h[3]\n"
+      ".inst 0x647940ef  // bfdot z15.s, z7.h, z1.h[3]\n"
+      "26:"  // Height 2: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 21b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "tbz %x[flags], #1, 27f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z1.s }, p5/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rw { z0.s }, p5/Z, [x19]\n"
+      "fmin z8.s, p5/M, z8.s, z0.s\n"
+      "fmin z9.s, p5/M, z9.s, z0.s\n"
+      "fmin z10.s, p5/M, z10.s, z0.s\n"
+      "fmin z11.s, p5/M, z11.s, z0.s\n"
+      "fmin z12.s, p5/M, z12.s, z0.s\n"
+      "fmax z8.s, p5/M, z8.s, z1.s\n"
+      "fmax z9.s, p5/M, z9.s, z1.s\n"
+      "fmax z10.s, p5/M, z10.s, z1.s\n"
+      "fmax z11.s, p5/M, z11.s, z1.s\n"
+      "fmax z12.s, p5/M, z12.s, z1.s\n"
+      "fmin z13.s, p5/M, z13.s, z0.s\n"
+      "fmin z14.s, p5/M, z14.s, z0.s\n"
+      "fmin z15.s, p5/M, z15.s, z0.s\n"
+      "fmax z13.s, p5/M, z13.s, z1.s\n"
+      "fmax z14.s, p5/M, z14.s, z1.s\n"
+      "fmax z15.s, p5/M, z15.s, z1.s\n"
+      "27:"  // Height 2: No activation
+      "st1w { z8.s }, p4, [x13]\n"
+      "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+      "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+      "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "st1w { z12.s }, p4, [x9]\n"
+      "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+      "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+      "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+      "addvl x9, x9, #4\n"
+      "28:"  // Height 2: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x16, x16, x19\n"
+      "bgt 17b\n"
+      "b 86f\n"
+      "29:"  // Height 3
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 30f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "add x27, x27, x19, LSL #2\n"
+      "b 31f\n"
+      "30:"  // Height 3: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "31:"  // Height 3: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p3.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p2.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p1.s, x19, x16\n"
+      "cbz x14, 32f\n"
+      "ld1w { z8.s }, p5/Z, [x14]\n"
+      "mov z12.d, z8.d\n"
+      "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+      "mov z16.d, z8.d\n"
+      "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+      "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+      "mov z13.d, z9.d\n"
+      "addvl x14, x14, #4\n"
+      "mov z17.d, z9.d\n"
+      "mov z14.d, z10.d\n"
+      "mov z15.d, z11.d\n"
+      "mov z18.d, z10.d\n"
+      "mov z19.d, z11.d\n"
+      "b 34f\n"
+      "32:"  // Height 3: no bias
+      "tbz %x[flags], #0, 33f\n"
+      "ld1w { z8.s }, p4/Z, [x13]\n"
+      "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x9]\n"
+      "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x27]\n"
+      "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+      "b 34f\n"
+      "33:"  // Height 3: no accumulate
+      "mov z8.b, #0x0\n"
+      "mov z9.b, #0x0\n"
+      "mov z10.b, #0x0\n"
+      "mov z11.b, #0x0\n"
+      "mov z12.b, #0x0\n"
+      "mov z13.b, #0x0\n"
+      "mov z14.b, #0x0\n"
+      "mov z15.b, #0x0\n"
+      "mov z16.b, #0x0\n"
+      "mov z17.b, #0x0\n"
+      "mov z18.b, #0x0\n"
+      "mov z19.b, #0x0\n"
+      "34:"  // Height 3: setup done
+      "mov x12, #0x0\n"
+      "35:"  // Height 3: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 36f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "cbnz x12, 37f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #1\n"
+      "add x28, x28, x19, LSL #1\n"
+      "add x26, x26, x19, LSL #1\n"
+      "b 37f\n"
+      "36:"  // Height 3: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #1\n"
+      "add x26, x28, x19, LSL #1\n"
+      "37:"  // Height 3: input setup done
+      "cmp x11, #0x8\n"
+      "ble 39f\n"
+      "38:"  // Height 3: Multiply loop: Main loop head
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "whilelt p0.h, XZR, x11\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "sub x11, x11, #0x8\n"
+      "ld1rqh { z0.h }, p0/Z, [x10]\n"
+      ".inst 0x646040c8  // bfdot z8.s, z6.h, z0.h[0]\n"
+      "ld1rqh { z1.h }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x646040e9  // bfdot z9.s, z7.h, z0.h[0]\n"
+      "ld1rqh { z2.h }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x646140cc  // bfdot z12.s, z6.h, z1.h[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x646240d0  // bfdot z16.s, z6.h, z2.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "cmp x11, #0x8\n"
+      ".inst 0x646140ed  // bfdot z13.s, z7.h, z1.h[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      ".inst 0x646240f1  // bfdot z17.s, z7.h, z2.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      ".inst 0x646040ca  // bfdot z10.s, z6.h, z0.h[0]\n"
+      ".inst 0x646140ce  // bfdot z14.s, z6.h, z1.h[0]\n"
+      ".inst 0x646240d2  // bfdot z18.s, z6.h, z2.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+      ".inst 0x646040eb  // bfdot z11.s, z7.h, z0.h[0]\n"
+      ".inst 0x646140ef  // bfdot z15.s, z7.h, z1.h[0]\n"
+      ".inst 0x646240f3  // bfdot z19.s, z7.h, z2.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+      ".inst 0x646840c8  // bfdot z8.s, z6.h, z0.h[1]\n"
+      ".inst 0x646940cc  // bfdot z12.s, z6.h, z1.h[1]\n"
+      ".inst 0x646a40d0  // bfdot z16.s, z6.h, z2.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+      ".inst 0x646840e9  // bfdot z9.s, z7.h, z0.h[1]\n"
+      ".inst 0x646940ed  // bfdot z13.s, z7.h, z1.h[1]\n"
+      ".inst 0x646a40f1  // bfdot z17.s, z7.h, z2.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+      "addvl x15, x15, #16\n"
+      ".inst 0x646840ca  // bfdot z10.s, z6.h, z0.h[1]\n"
+      ".inst 0x646940ce  // bfdot z14.s, z6.h, z1.h[1]\n"
+      ".inst 0x646a40d2  // bfdot z18.s, z6.h, z2.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+      ".inst 0x646840eb  // bfdot z11.s, z7.h, z0.h[1]\n"
+      ".inst 0x646940ef  // bfdot z15.s, z7.h, z1.h[1]\n"
+      ".inst 0x646a40f3  // bfdot z19.s, z7.h, z2.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+      ".inst 0x647040c8  // bfdot z8.s, z6.h, z0.h[2]\n"
+      ".inst 0x647140cc  // bfdot z12.s, z6.h, z1.h[2]\n"
+      ".inst 0x647240d0  // bfdot z16.s, z6.h, z2.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+      ".inst 0x647040e9  // bfdot z9.s, z7.h, z0.h[2]\n"
+      ".inst 0x647140ed  // bfdot z13.s, z7.h, z1.h[2]\n"
+      ".inst 0x647240f1  // bfdot z17.s, z7.h, z2.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+      ".inst 0x647040ca  // bfdot z10.s, z6.h, z0.h[2]\n"
+      ".inst 0x647140ce  // bfdot z14.s, z6.h, z1.h[2]\n"
+      ".inst 0x647240d2  // bfdot z18.s, z6.h, z2.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+      ".inst 0x647040eb  // bfdot z11.s, z7.h, z0.h[2]\n"
+      ".inst 0x647140ef  // bfdot z15.s, z7.h, z1.h[2]\n"
+      ".inst 0x647240f3  // bfdot z19.s, z7.h, z2.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+      ".inst 0x647840c8  // bfdot z8.s, z6.h, z0.h[3]\n"
+      ".inst 0x647940cc  // bfdot z12.s, z6.h, z1.h[3]\n"
+      ".inst 0x647a40d0  // bfdot z16.s, z6.h, z2.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+      ".inst 0x647840e9  // bfdot z9.s, z7.h, z0.h[3]\n"
+      ".inst 0x647940ed  // bfdot z13.s, z7.h, z1.h[3]\n"
+      ".inst 0x647a40f1  // bfdot z17.s, z7.h, z2.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+      ".inst 0x647840ca  // bfdot z10.s, z6.h, z0.h[3]\n"
+      ".inst 0x647940ce  // bfdot z14.s, z6.h, z1.h[3]\n"
+      ".inst 0x647a40d2  // bfdot z18.s, z6.h, z2.h[3]\n"
+      ".inst 0x647840eb  // bfdot z11.s, z7.h, z0.h[3]\n"
+      ".inst 0x647940ef  // bfdot z15.s, z7.h, z1.h[3]\n"
+      ".inst 0x647a40f3  // bfdot z19.s, z7.h, z2.h[3]\n"
+      "bgt 38b\n"
+      "39:"  // Height 3: Multiply loop: Single iteration only
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "whilelt p0.h, XZR, x11\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x2\n"
+      "ld1rqh { z0.h }, p0/Z, [x10]\n"
+      ".inst 0x646040c8  // bfdot z8.s, z6.h, z0.h[0]\n"
+      "ld1rqh { z1.h }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x646040e9  // bfdot z9.s, z7.h, z0.h[0]\n"
+      "ld1rqh { z2.h }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x646140cc  // bfdot z12.s, z6.h, z1.h[0]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x646140ed  // bfdot z13.s, z7.h, z1.h[0]\n"
+      ".inst 0x646240d0  // bfdot z16.s, z6.h, z2.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      ".inst 0x646240f1  // bfdot z17.s, z7.h, z2.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      ".inst 0x646040ca  // bfdot z10.s, z6.h, z0.h[0]\n"
+      ".inst 0x646140ce  // bfdot z14.s, z6.h, z1.h[0]\n"
+      ".inst 0x646240d2  // bfdot z18.s, z6.h, z2.h[0]\n"
+      ".inst 0x646040eb  // bfdot z11.s, z7.h, z0.h[0]\n"
+      ".inst 0x646140ef  // bfdot z15.s, z7.h, z1.h[0]\n"
+      ".inst 0x646240f3  // bfdot z19.s, z7.h, z2.h[0]\n"
+      "ble 40f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      ".inst 0x646840c8  // bfdot z8.s, z6.h, z0.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x2\n"
+      ".inst 0x646940cc  // bfdot z12.s, z6.h, z1.h[1]\n"
+      ".inst 0x646a40d0  // bfdot z16.s, z6.h, z2.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      ".inst 0x646840e9  // bfdot z9.s, z7.h, z0.h[1]\n"
+      ".inst 0x646940ed  // bfdot z13.s, z7.h, z1.h[1]\n"
+      ".inst 0x646a40f1  // bfdot z17.s, z7.h, z2.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      ".inst 0x646840ca  // bfdot z10.s, z6.h, z0.h[1]\n"
+      ".inst 0x646940ce  // bfdot z14.s, z6.h, z1.h[1]\n"
+      ".inst 0x646a40d2  // bfdot z18.s, z6.h, z2.h[1]\n"
+      ".inst 0x646840eb  // bfdot z11.s, z7.h, z0.h[1]\n"
+      ".inst 0x646940ef  // bfdot z15.s, z7.h, z1.h[1]\n"
+      ".inst 0x646a40f3  // bfdot z19.s, z7.h, z2.h[1]\n"
+      "ble 40f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      ".inst 0x647040c8  // bfdot z8.s, z6.h, z0.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x2\n"
+      ".inst 0x647140cc  // bfdot z12.s, z6.h, z1.h[2]\n"
+      ".inst 0x647240d0  // bfdot z16.s, z6.h, z2.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      ".inst 0x647040e9  // bfdot z9.s, z7.h, z0.h[2]\n"
+      ".inst 0x647140ed  // bfdot z13.s, z7.h, z1.h[2]\n"
+      ".inst 0x647240f1  // bfdot z17.s, z7.h, z2.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      ".inst 0x647040ca  // bfdot z10.s, z6.h, z0.h[2]\n"
+      ".inst 0x647140ce  // bfdot z14.s, z6.h, z1.h[2]\n"
+      ".inst 0x647240d2  // bfdot z18.s, z6.h, z2.h[2]\n"
+      ".inst 0x647040eb  // bfdot z11.s, z7.h, z0.h[2]\n"
+      ".inst 0x647140ef  // bfdot z15.s, z7.h, z1.h[2]\n"
+      ".inst 0x647240f3  // bfdot z19.s, z7.h, z2.h[2]\n"
+      "ble 40f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      ".inst 0x647840c8  // bfdot z8.s, z6.h, z0.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      ".inst 0x647940cc  // bfdot z12.s, z6.h, z1.h[3]\n"
+      ".inst 0x647a40d0  // bfdot z16.s, z6.h, z2.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      ".inst 0x647840e9  // bfdot z9.s, z7.h, z0.h[3]\n"
+      ".inst 0x647940ed  // bfdot z13.s, z7.h, z1.h[3]\n"
+      ".inst 0x647a40f1  // bfdot z17.s, z7.h, z2.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      ".inst 0x647840ca  // bfdot z10.s, z6.h, z0.h[3]\n"
+      ".inst 0x647940ce  // bfdot z14.s, z6.h, z1.h[3]\n"
+      ".inst 0x647a40d2  // bfdot z18.s, z6.h, z2.h[3]\n"
+      ".inst 0x647840eb  // bfdot z11.s, z7.h, z0.h[3]\n"
+      ".inst 0x647940ef  // bfdot z15.s, z7.h, z1.h[3]\n"
+      ".inst 0x647a40f3  // bfdot z19.s, z7.h, z2.h[3]\n"
+      "40:"  // Height 3: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 35b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "tbz %x[flags], #1, 41f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z1.s }, p5/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rw { z0.s }, p5/Z, [x19]\n"
+      "fmin z8.s, p5/M, z8.s, z0.s\n"
+      "fmin z9.s, p5/M, z9.s, z0.s\n"
+      "fmin z10.s, p5/M, z10.s, z0.s\n"
+      "fmin z11.s, p5/M, z11.s, z0.s\n"
+      "fmin z12.s, p5/M, z12.s, z0.s\n"
+      "fmax z8.s, p5/M, z8.s, z1.s\n"
+      "fmax z9.s, p5/M, z9.s, z1.s\n"
+      "fmax z10.s, p5/M, z10.s, z1.s\n"
+      "fmax z11.s, p5/M, z11.s, z1.s\n"
+      "fmax z12.s, p5/M, z12.s, z1.s\n"
+      "fmin z13.s, p5/M, z13.s, z0.s\n"
+      "fmin z14.s, p5/M, z14.s, z0.s\n"
+      "fmin z15.s, p5/M, z15.s, z0.s\n"
+      "fmin z16.s, p5/M, z16.s, z0.s\n"
+      "fmax z13.s, p5/M, z13.s, z1.s\n"
+      "fmax z14.s, p5/M, z14.s, z1.s\n"
+      "fmax z15.s, p5/M, z15.s, z1.s\n"
+      "fmax z16.s, p5/M, z16.s, z1.s\n"
+      "fmin z17.s, p5/M, z17.s, z0.s\n"
+      "fmin z18.s, p5/M, z18.s, z0.s\n"
+      "fmin z19.s, p5/M, z19.s, z0.s\n"
+      "fmax z17.s, p5/M, z17.s, z1.s\n"
+      "fmax z18.s, p5/M, z18.s, z1.s\n"
+      "fmax z19.s, p5/M, z19.s, z1.s\n"
+      "41:"  // Height 3: No activation
+      "st1w { z8.s }, p4, [x13]\n"
+      "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+      "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+      "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "st1w { z12.s }, p4, [x9]\n"
+      "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+      "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+      "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+      "addvl x9, x9, #4\n"
+      "st1w { z16.s }, p4, [x27]\n"
+      "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+      "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+      "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+      "addvl x27, x27, #4\n"
+      "42:"  // Height 3: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x16, x16, x19\n"
+      "bgt 31b\n"
+      "b 86f\n"
+      "43:"  // Height 4
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 44f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "add x27, x27, x19, LSL #2\n"
+      "add x25, x25, x19, LSL #2\n"
+      "b 45f\n"
+      "44:"  // Height 4: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "45:"  // Height 4: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p3.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p2.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p1.s, x19, x16\n"
+      "cbz x14, 46f\n"
+      "ld1w { z8.s }, p5/Z, [x14]\n"
+      "mov z12.d, z8.d\n"
+      "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+      "mov z16.d, z8.d\n"
+      "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+      "mov z20.d, z8.d\n"
+      "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "mov z13.d, z9.d\n"
+      "mov z17.d, z9.d\n"
+      "mov z14.d, z10.d\n"
+      "mov z15.d, z11.d\n"
+      "mov z18.d, z10.d\n"
+      "mov z19.d, z11.d\n"
+      "mov z21.d, z9.d\n"
+      "mov z22.d, z10.d\n"
+      "mov z23.d, z11.d\n"
+      "b 48f\n"
+      "46:"  // Height 4: no bias
+      "tbz %x[flags], #0, 47f\n"
+      "ld1w { z8.s }, p4/Z, [x13]\n"
+      "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x9]\n"
+      "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x27]\n"
+      "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x25]\n"
+      "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+      "b 48f\n"
+      "47:"  // Height 4: no accumulate
+      "mov z8.b, #0x0\n"
+      "mov z9.b, #0x0\n"
+      "mov z10.b, #0x0\n"
+      "mov z11.b, #0x0\n"
+      "mov z12.b, #0x0\n"
+      "mov z13.b, #0x0\n"
+      "mov z14.b, #0x0\n"
+      "mov z15.b, #0x0\n"
+      "mov z16.b, #0x0\n"
+      "mov z17.b, #0x0\n"
+      "mov z18.b, #0x0\n"
+      "mov z19.b, #0x0\n"
+      "mov z20.b, #0x0\n"
+      "mov z21.b, #0x0\n"
+      "mov z22.b, #0x0\n"
+      "mov z23.b, #0x0\n"
+      "48:"  // Height 4: setup done
+      "mov x12, #0x0\n"
+      "49:"  // Height 4: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 50f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "cbnz x12, 51f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #1\n"
+      "add x28, x28, x19, LSL #1\n"
+      "add x26, x26, x19, LSL #1\n"
+      "add x24, x24, x19, LSL #1\n"
+      "b 51f\n"
+      "50:"  // Height 4: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #1\n"
+      "add x26, x28, x19, LSL #1\n"
+      "add x24, x26, x19, LSL #1\n"
+      "51:"  // Height 4: input setup done
+      "cmp x11, #0x8\n"
+      "ble 53f\n"
+      "52:"  // Height 4: Multiply loop: Main loop head
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "whilelt p0.h, XZR, x11\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "sub x11, x11, #0x8\n"
+      "ld1rqh { z0.h }, p0/Z, [x10]\n"
+      ".inst 0x646040c8  // bfdot z8.s, z6.h, z0.h[0]\n"
+      "ld1rqh { z1.h }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x646040e9  // bfdot z9.s, z7.h, z0.h[0]\n"
+      "ld1rqh { z2.h }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x646140cc  // bfdot z12.s, z6.h, z1.h[0]\n"
+      "ld1rqh { z3.h }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x646240d0  // bfdot z16.s, z6.h, z2.h[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x646140ed  // bfdot z13.s, z7.h, z1.h[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "cmp x11, #0x8\n"
+      ".inst 0x646340d4  // bfdot z20.s, z6.h, z3.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      ".inst 0x646240f1  // bfdot z17.s, z7.h, z2.h[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      ".inst 0x646340f5  // bfdot z21.s, z7.h, z3.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      ".inst 0x646040ca  // bfdot z10.s, z6.h, z0.h[0]\n"
+      ".inst 0x646140ce  // bfdot z14.s, z6.h, z1.h[0]\n"
+      ".inst 0x646240d2  // bfdot z18.s, z6.h, z2.h[0]\n"
+      ".inst 0x646340d6  // bfdot z22.s, z6.h, z3.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+      ".inst 0x646040eb  // bfdot z11.s, z7.h, z0.h[0]\n"
+      ".inst 0x646140ef  // bfdot z15.s, z7.h, z1.h[0]\n"
+      ".inst 0x646240f3  // bfdot z19.s, z7.h, z2.h[0]\n"
+      ".inst 0x646340f7  // bfdot z23.s, z7.h, z3.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+      ".inst 0x646840c8  // bfdot z8.s, z6.h, z0.h[1]\n"
+      ".inst 0x646940cc  // bfdot z12.s, z6.h, z1.h[1]\n"
+      ".inst 0x646a40d0  // bfdot z16.s, z6.h, z2.h[1]\n"
+      ".inst 0x646b40d4  // bfdot z20.s, z6.h, z3.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+      ".inst 0x646840e9  // bfdot z9.s, z7.h, z0.h[1]\n"
+      ".inst 0x646940ed  // bfdot z13.s, z7.h, z1.h[1]\n"
+      ".inst 0x646a40f1  // bfdot z17.s, z7.h, z2.h[1]\n"
+      ".inst 0x646b40f5  // bfdot z21.s, z7.h, z3.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+      "addvl x15, x15, #16\n"
+      ".inst 0x646840ca  // bfdot z10.s, z6.h, z0.h[1]\n"
+      ".inst 0x646940ce  // bfdot z14.s, z6.h, z1.h[1]\n"
+      ".inst 0x646a40d2  // bfdot z18.s, z6.h, z2.h[1]\n"
+      ".inst 0x646b40d6  // bfdot z22.s, z6.h, z3.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+      ".inst 0x646840eb  // bfdot z11.s, z7.h, z0.h[1]\n"
+      ".inst 0x646940ef  // bfdot z15.s, z7.h, z1.h[1]\n"
+      ".inst 0x646a40f3  // bfdot z19.s, z7.h, z2.h[1]\n"
+      ".inst 0x646b40f7  // bfdot z23.s, z7.h, z3.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+      ".inst 0x647040c8  // bfdot z8.s, z6.h, z0.h[2]\n"
+      ".inst 0x647140cc  // bfdot z12.s, z6.h, z1.h[2]\n"
+      ".inst 0x647240d0  // bfdot z16.s, z6.h, z2.h[2]\n"
+      ".inst 0x647340d4  // bfdot z20.s, z6.h, z3.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+      ".inst 0x647040e9  // bfdot z9.s, z7.h, z0.h[2]\n"
+      ".inst 0x647140ed  // bfdot z13.s, z7.h, z1.h[2]\n"
+      ".inst 0x647240f1  // bfdot z17.s, z7.h, z2.h[2]\n"
+      ".inst 0x647340f5  // bfdot z21.s, z7.h, z3.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+      ".inst 0x647040ca  // bfdot z10.s, z6.h, z0.h[2]\n"
+      ".inst 0x647140ce  // bfdot z14.s, z6.h, z1.h[2]\n"
+      ".inst 0x647240d2  // bfdot z18.s, z6.h, z2.h[2]\n"
+      ".inst 0x647340d6  // bfdot z22.s, z6.h, z3.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+      ".inst 0x647040eb  // bfdot z11.s, z7.h, z0.h[2]\n"
+      ".inst 0x647140ef  // bfdot z15.s, z7.h, z1.h[2]\n"
+      ".inst 0x647240f3  // bfdot z19.s, z7.h, z2.h[2]\n"
+      ".inst 0x647340f7  // bfdot z23.s, z7.h, z3.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+      ".inst 0x647840c8  // bfdot z8.s, z6.h, z0.h[3]\n"
+      ".inst 0x647940cc  // bfdot z12.s, z6.h, z1.h[3]\n"
+      ".inst 0x647a40d0  // bfdot z16.s, z6.h, z2.h[3]\n"
+      ".inst 0x647b40d4  // bfdot z20.s, z6.h, z3.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+      ".inst 0x647840e9  // bfdot z9.s, z7.h, z0.h[3]\n"
+      ".inst 0x647940ed  // bfdot z13.s, z7.h, z1.h[3]\n"
+      ".inst 0x647a40f1  // bfdot z17.s, z7.h, z2.h[3]\n"
+      ".inst 0x647b40f5  // bfdot z21.s, z7.h, z3.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+      ".inst 0x647840ca  // bfdot z10.s, z6.h, z0.h[3]\n"
+      ".inst 0x647940ce  // bfdot z14.s, z6.h, z1.h[3]\n"
+      ".inst 0x647a40d2  // bfdot z18.s, z6.h, z2.h[3]\n"
+      ".inst 0x647b40d6  // bfdot z22.s, z6.h, z3.h[3]\n"
+      ".inst 0x647840eb  // bfdot z11.s, z7.h, z0.h[3]\n"
+      ".inst 0x647940ef  // bfdot z15.s, z7.h, z1.h[3]\n"
+      ".inst 0x647a40f3  // bfdot z19.s, z7.h, z2.h[3]\n"
+      ".inst 0x647b40f7  // bfdot z23.s, z7.h, z3.h[3]\n"
+      "bgt 52b\n"
+      "53:"  // Height 4: Multiply loop: Single iteration only
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "whilelt p0.h, XZR, x11\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x2\n"
+      "ld1rqh { z0.h }, p0/Z, [x10]\n"
+      ".inst 0x646040c8  // bfdot z8.s, z6.h, z0.h[0]\n"
+      "ld1rqh { z1.h }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x646040e9  // bfdot z9.s, z7.h, z0.h[0]\n"
+      "ld1rqh { z2.h }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x646140cc  // bfdot z12.s, z6.h, z1.h[0]\n"
+      "ld1rqh { z3.h }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x646240d0  // bfdot z16.s, z6.h, z2.h[0]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x646140ed  // bfdot z13.s, z7.h, z1.h[0]\n"
+      ".inst 0x646240f1  // bfdot z17.s, z7.h, z2.h[0]\n"
+      ".inst 0x646340d4  // bfdot z20.s, z6.h, z3.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      ".inst 0x646340f5  // bfdot z21.s, z7.h, z3.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      ".inst 0x646040ca  // bfdot z10.s, z6.h, z0.h[0]\n"
+      ".inst 0x646140ce  // bfdot z14.s, z6.h, z1.h[0]\n"
+      ".inst 0x646240d2  // bfdot z18.s, z6.h, z2.h[0]\n"
+      ".inst 0x646340d6  // bfdot z22.s, z6.h, z3.h[0]\n"
+      ".inst 0x646040eb  // bfdot z11.s, z7.h, z0.h[0]\n"
+      ".inst 0x646140ef  // bfdot z15.s, z7.h, z1.h[0]\n"
+      ".inst 0x646240f3  // bfdot z19.s, z7.h, z2.h[0]\n"
+      ".inst 0x646340f7  // bfdot z23.s, z7.h, z3.h[0]\n"
+      "ble 54f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      ".inst 0x646840c8  // bfdot z8.s, z6.h, z0.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x2\n"
+      ".inst 0x646940cc  // bfdot z12.s, z6.h, z1.h[1]\n"
+      ".inst 0x646a40d0  // bfdot z16.s, z6.h, z2.h[1]\n"
+      ".inst 0x646b40d4  // bfdot z20.s, z6.h, z3.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      ".inst 0x646840e9  // bfdot z9.s, z7.h, z0.h[1]\n"
+      ".inst 0x646940ed  // bfdot z13.s, z7.h, z1.h[1]\n"
+      ".inst 0x646a40f1  // bfdot z17.s, z7.h, z2.h[1]\n"
+      ".inst 0x646b40f5  // bfdot z21.s, z7.h, z3.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      ".inst 0x646840ca  // bfdot z10.s, z6.h, z0.h[1]\n"
+      ".inst 0x646940ce  // bfdot z14.s, z6.h, z1.h[1]\n"
+      ".inst 0x646a40d2  // bfdot z18.s, z6.h, z2.h[1]\n"
+      ".inst 0x646b40d6  // bfdot z22.s, z6.h, z3.h[1]\n"
+      ".inst 0x646840eb  // bfdot z11.s, z7.h, z0.h[1]\n"
+      ".inst 0x646940ef  // bfdot z15.s, z7.h, z1.h[1]\n"
+      ".inst 0x646a40f3  // bfdot z19.s, z7.h, z2.h[1]\n"
+      ".inst 0x646b40f7  // bfdot z23.s, z7.h, z3.h[1]\n"
+      "ble 54f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      ".inst 0x647040c8  // bfdot z8.s, z6.h, z0.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x2\n"
+      ".inst 0x647140cc  // bfdot z12.s, z6.h, z1.h[2]\n"
+      ".inst 0x647240d0  // bfdot z16.s, z6.h, z2.h[2]\n"
+      ".inst 0x647340d4  // bfdot z20.s, z6.h, z3.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      ".inst 0x647040e9  // bfdot z9.s, z7.h, z0.h[2]\n"
+      ".inst 0x647140ed  // bfdot z13.s, z7.h, z1.h[2]\n"
+      ".inst 0x647240f1  // bfdot z17.s, z7.h, z2.h[2]\n"
+      ".inst 0x647340f5  // bfdot z21.s, z7.h, z3.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      ".inst 0x647040ca  // bfdot z10.s, z6.h, z0.h[2]\n"
+      ".inst 0x647140ce  // bfdot z14.s, z6.h, z1.h[2]\n"
+      ".inst 0x647240d2  // bfdot z18.s, z6.h, z2.h[2]\n"
+      ".inst 0x647340d6  // bfdot z22.s, z6.h, z3.h[2]\n"
+      ".inst 0x647040eb  // bfdot z11.s, z7.h, z0.h[2]\n"
+      ".inst 0x647140ef  // bfdot z15.s, z7.h, z1.h[2]\n"
+      ".inst 0x647240f3  // bfdot z19.s, z7.h, z2.h[2]\n"
+      ".inst 0x647340f7  // bfdot z23.s, z7.h, z3.h[2]\n"
+      "ble 54f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      ".inst 0x647840c8  // bfdot z8.s, z6.h, z0.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      ".inst 0x647940cc  // bfdot z12.s, z6.h, z1.h[3]\n"
+      ".inst 0x647a40d0  // bfdot z16.s, z6.h, z2.h[3]\n"
+      ".inst 0x647b40d4  // bfdot z20.s, z6.h, z3.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      ".inst 0x647840e9  // bfdot z9.s, z7.h, z0.h[3]\n"
+      ".inst 0x647940ed  // bfdot z13.s, z7.h, z1.h[3]\n"
+      ".inst 0x647a40f1  // bfdot z17.s, z7.h, z2.h[3]\n"
+      ".inst 0x647b40f5  // bfdot z21.s, z7.h, z3.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      ".inst 0x647840ca  // bfdot z10.s, z6.h, z0.h[3]\n"
+      ".inst 0x647940ce  // bfdot z14.s, z6.h, z1.h[3]\n"
+      ".inst 0x647a40d2  // bfdot z18.s, z6.h, z2.h[3]\n"
+      ".inst 0x647b40d6  // bfdot z22.s, z6.h, z3.h[3]\n"
+      ".inst 0x647840eb  // bfdot z11.s, z7.h, z0.h[3]\n"
+      ".inst 0x647940ef  // bfdot z15.s, z7.h, z1.h[3]\n"
+      ".inst 0x647a40f3  // bfdot z19.s, z7.h, z2.h[3]\n"
+      ".inst 0x647b40f7  // bfdot z23.s, z7.h, z3.h[3]\n"
+      "54:"  // Height 4: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 49b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "tbz %x[flags], #1, 55f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z1.s }, p5/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rw { z0.s }, p5/Z, [x19]\n"
+      "fmin z8.s, p5/M, z8.s, z0.s\n"
+      "fmin z9.s, p5/M, z9.s, z0.s\n"
+      "fmin z10.s, p5/M, z10.s, z0.s\n"
+      "fmin z11.s, p5/M, z11.s, z0.s\n"
+      "fmin z12.s, p5/M, z12.s, z0.s\n"
+      "fmax z8.s, p5/M, z8.s, z1.s\n"
+      "fmax z9.s, p5/M, z9.s, z1.s\n"
+      "fmax z10.s, p5/M, z10.s, z1.s\n"
+      "fmax z11.s, p5/M, z11.s, z1.s\n"
+      "fmax z12.s, p5/M, z12.s, z1.s\n"
+      "fmin z13.s, p5/M, z13.s, z0.s\n"
+      "fmin z14.s, p5/M, z14.s, z0.s\n"
+      "fmin z15.s, p5/M, z15.s, z0.s\n"
+      "fmin z16.s, p5/M, z16.s, z0.s\n"
+      "fmax z13.s, p5/M, z13.s, z1.s\n"
+      "fmax z14.s, p5/M, z14.s, z1.s\n"
+      "fmax z15.s, p5/M, z15.s, z1.s\n"
+      "fmax z16.s, p5/M, z16.s, z1.s\n"
+      "fmin z17.s, p5/M, z17.s, z0.s\n"
+      "fmin z18.s, p5/M, z18.s, z0.s\n"
+      "fmin z19.s, p5/M, z19.s, z0.s\n"
+      "fmin z20.s, p5/M, z20.s, z0.s\n"
+      "fmax z17.s, p5/M, z17.s, z1.s\n"
+      "fmax z18.s, p5/M, z18.s, z1.s\n"
+      "fmax z19.s, p5/M, z19.s, z1.s\n"
+      "fmax z20.s, p5/M, z20.s, z1.s\n"
+      "fmin z21.s, p5/M, z21.s, z0.s\n"
+      "fmin z22.s, p5/M, z22.s, z0.s\n"
+      "fmin z23.s, p5/M, z23.s, z0.s\n"
+      "fmax z21.s, p5/M, z21.s, z1.s\n"
+      "fmax z22.s, p5/M, z22.s, z1.s\n"
+      "fmax z23.s, p5/M, z23.s, z1.s\n"
+      "55:"  // Height 4: No activation
+      "st1w { z8.s }, p4, [x13]\n"
+      "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+      "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+      "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "st1w { z12.s }, p4, [x9]\n"
+      "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+      "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+      "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+      "addvl x9, x9, #4\n"
+      "st1w { z16.s }, p4, [x27]\n"
+      "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+      "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+      "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+      "addvl x27, x27, #4\n"
+      "st1w { z20.s }, p4, [x25]\n"
+      "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+      "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+      "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+      "addvl x25, x25, #4\n"
+      "56:"  // Height 4: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x16, x16, x19\n"
+      "bgt 45b\n"
+      "b 86f\n"
+      "57:"  // Height 5
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 58f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "ldr x23, [%x[output_ptr], #0x20]\n"
+      "add x27, x27, x19, LSL #2\n"
+      "add x25, x25, x19, LSL #2\n"
+      "add x23, x23, x19, LSL #2\n"
+      "b 59f\n"
+      "58:"  // Height 5: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "add x23, x25, x19, LSL #2\n"
+      "59:"  // Height 5: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p3.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p2.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p1.s, x19, x16\n"
+      "cbz x14, 60f\n"
+      "ld1w { z8.s }, p5/Z, [x14]\n"
+      "mov z12.d, z8.d\n"
+      "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+      "mov z16.d, z8.d\n"
+      "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+      "mov z20.d, z8.d\n"
+      "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "mov z13.d, z9.d\n"
+      "mov z17.d, z9.d\n"
+      "mov z14.d, z10.d\n"
+      "mov z15.d, z11.d\n"
+      "mov z18.d, z10.d\n"
+      "mov z19.d, z11.d\n"
+      "mov z21.d, z9.d\n"
+      "mov z22.d, z10.d\n"
+      "mov z23.d, z11.d\n"
+      "mov z24.d, z8.d\n"
+      "mov z25.d, z9.d\n"
+      "mov z26.d, z10.d\n"
+      "mov z27.d, z11.d\n"
+      "b 62f\n"
+      "60:"  // Height 5: no bias
+      "tbz %x[flags], #0, 61f\n"
+      "ld1w { z8.s }, p4/Z, [x13]\n"
+      "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x9]\n"
+      "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x27]\n"
+      "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x25]\n"
+      "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+      "ld1w { z24.s }, p4/Z, [x23]\n"
+      "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "b 62f\n"
+      "61:"  // Height 5: no accumulate
+      "mov z8.b, #0x0\n"
+      "mov z9.b, #0x0\n"
+      "mov z10.b, #0x0\n"
+      "mov z11.b, #0x0\n"
+      "mov z12.b, #0x0\n"
+      "mov z13.b, #0x0\n"
+      "mov z14.b, #0x0\n"
+      "mov z15.b, #0x0\n"
+      "mov z16.b, #0x0\n"
+      "mov z17.b, #0x0\n"
+      "mov z18.b, #0x0\n"
+      "mov z19.b, #0x0\n"
+      "mov z20.b, #0x0\n"
+      "mov z21.b, #0x0\n"
+      "mov z22.b, #0x0\n"
+      "mov z23.b, #0x0\n"
+      "mov z24.b, #0x0\n"
+      "mov z25.b, #0x0\n"
+      "mov z26.b, #0x0\n"
+      "mov z27.b, #0x0\n"
+      "62:"  // Height 5: setup done
+      "mov x12, #0x0\n"
+      "63:"  // Height 5: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 64f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "cbnz x12, 65f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #1\n"
+      "add x28, x28, x19, LSL #1\n"
+      "add x26, x26, x19, LSL #1\n"
+      "add x24, x24, x19, LSL #1\n"
+      "add x22, x22, x19, LSL #1\n"
+      "b 65f\n"
+      "64:"  // Height 5: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #1\n"
+      "add x26, x28, x19, LSL #1\n"
+      "add x24, x26, x19, LSL #1\n"
+      "add x22, x24, x19, LSL #1\n"
+      "65:"  // Height 5: input setup done
+      "cmp x11, #0x8\n"
+      "ble 67f\n"
+      "66:"  // Height 5: Multiply loop: Main loop head
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "whilelt p0.h, XZR, x11\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "sub x11, x11, #0x8\n"
+      "ld1rqh { z0.h }, p0/Z, [x10]\n"
+      ".inst 0x646040c8  // bfdot z8.s, z6.h, z0.h[0]\n"
+      "ld1rqh { z1.h }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x646040e9  // bfdot z9.s, z7.h, z0.h[0]\n"
+      "ld1rqh { z2.h }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x646140cc  // bfdot z12.s, z6.h, z1.h[0]\n"
+      "ld1rqh { z3.h }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x646240d0  // bfdot z16.s, z6.h, z2.h[0]\n"
+      "ld1rqh { z4.h }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x646140ed  // bfdot z13.s, z7.h, z1.h[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x22, x22, #0x10\n"
+      ".inst 0x646340d4  // bfdot z20.s, z6.h, z3.h[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "cmp x11, #0x8\n"
+      ".inst 0x646440d8  // bfdot z24.s, z6.h, z4.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      ".inst 0x646240f1  // bfdot z17.s, z7.h, z2.h[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      ".inst 0x646340f5  // bfdot z21.s, z7.h, z3.h[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      ".inst 0x646440f9  // bfdot z25.s, z7.h, z4.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      ".inst 0x646040ca  // bfdot z10.s, z6.h, z0.h[0]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      ".inst 0x646140ce  // bfdot z14.s, z6.h, z1.h[0]\n"
+      ".inst 0x646240d2  // bfdot z18.s, z6.h, z2.h[0]\n"
+      ".inst 0x646340d6  // bfdot z22.s, z6.h, z3.h[0]\n"
+      ".inst 0x646440da  // bfdot z26.s, z6.h, z4.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+      ".inst 0x646040eb  // bfdot z11.s, z7.h, z0.h[0]\n"
+      ".inst 0x646140ef  // bfdot z15.s, z7.h, z1.h[0]\n"
+      ".inst 0x646240f3  // bfdot z19.s, z7.h, z2.h[0]\n"
+      ".inst 0x646340f7  // bfdot z23.s, z7.h, z3.h[0]\n"
+      ".inst 0x646440fb  // bfdot z27.s, z7.h, z4.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+      ".inst 0x646840c8  // bfdot z8.s, z6.h, z0.h[1]\n"
+      ".inst 0x646940cc  // bfdot z12.s, z6.h, z1.h[1]\n"
+      ".inst 0x646a40d0  // bfdot z16.s, z6.h, z2.h[1]\n"
+      ".inst 0x646b40d4  // bfdot z20.s, z6.h, z3.h[1]\n"
+      ".inst 0x646c40d8  // bfdot z24.s, z6.h, z4.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+      ".inst 0x646840e9  // bfdot z9.s, z7.h, z0.h[1]\n"
+      ".inst 0x646940ed  // bfdot z13.s, z7.h, z1.h[1]\n"
+      ".inst 0x646a40f1  // bfdot z17.s, z7.h, z2.h[1]\n"
+      ".inst 0x646b40f5  // bfdot z21.s, z7.h, z3.h[1]\n"
+      ".inst 0x646c40f9  // bfdot z25.s, z7.h, z4.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+      "addvl x15, x15, #16\n"
+      ".inst 0x646840ca  // bfdot z10.s, z6.h, z0.h[1]\n"
+      ".inst 0x646940ce  // bfdot z14.s, z6.h, z1.h[1]\n"
+      ".inst 0x646a40d2  // bfdot z18.s, z6.h, z2.h[1]\n"
+      ".inst 0x646b40d6  // bfdot z22.s, z6.h, z3.h[1]\n"
+      ".inst 0x646c40da  // bfdot z26.s, z6.h, z4.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+      ".inst 0x646840eb  // bfdot z11.s, z7.h, z0.h[1]\n"
+      ".inst 0x646940ef  // bfdot z15.s, z7.h, z1.h[1]\n"
+      ".inst 0x646a40f3  // bfdot z19.s, z7.h, z2.h[1]\n"
+      ".inst 0x646b40f7  // bfdot z23.s, z7.h, z3.h[1]\n"
+      ".inst 0x646c40fb  // bfdot z27.s, z7.h, z4.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+      ".inst 0x647040c8  // bfdot z8.s, z6.h, z0.h[2]\n"
+      ".inst 0x647140cc  // bfdot z12.s, z6.h, z1.h[2]\n"
+      ".inst 0x647240d0  // bfdot z16.s, z6.h, z2.h[2]\n"
+      ".inst 0x647340d4  // bfdot z20.s, z6.h, z3.h[2]\n"
+      ".inst 0x647440d8  // bfdot z24.s, z6.h, z4.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+      ".inst 0x647040e9  // bfdot z9.s, z7.h, z0.h[2]\n"
+      ".inst 0x647140ed  // bfdot z13.s, z7.h, z1.h[2]\n"
+      ".inst 0x647240f1  // bfdot z17.s, z7.h, z2.h[2]\n"
+      ".inst 0x647340f5  // bfdot z21.s, z7.h, z3.h[2]\n"
+      ".inst 0x647440f9  // bfdot z25.s, z7.h, z4.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+      ".inst 0x647040ca  // bfdot z10.s, z6.h, z0.h[2]\n"
+      ".inst 0x647140ce  // bfdot z14.s, z6.h, z1.h[2]\n"
+      ".inst 0x647240d2  // bfdot z18.s, z6.h, z2.h[2]\n"
+      ".inst 0x647340d6  // bfdot z22.s, z6.h, z3.h[2]\n"
+      ".inst 0x647440da  // bfdot z26.s, z6.h, z4.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+      ".inst 0x647040eb  // bfdot z11.s, z7.h, z0.h[2]\n"
+      ".inst 0x647140ef  // bfdot z15.s, z7.h, z1.h[2]\n"
+      ".inst 0x647240f3  // bfdot z19.s, z7.h, z2.h[2]\n"
+      ".inst 0x647340f7  // bfdot z23.s, z7.h, z3.h[2]\n"
+      ".inst 0x647440fb  // bfdot z27.s, z7.h, z4.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+      ".inst 0x647840c8  // bfdot z8.s, z6.h, z0.h[3]\n"
+      ".inst 0x647940cc  // bfdot z12.s, z6.h, z1.h[3]\n"
+      ".inst 0x647a40d0  // bfdot z16.s, z6.h, z2.h[3]\n"
+      ".inst 0x647b40d4  // bfdot z20.s, z6.h, z3.h[3]\n"
+      ".inst 0x647c40d8  // bfdot z24.s, z6.h, z4.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+      ".inst 0x647840e9  // bfdot z9.s, z7.h, z0.h[3]\n"
+      ".inst 0x647940ed  // bfdot z13.s, z7.h, z1.h[3]\n"
+      ".inst 0x647a40f1  // bfdot z17.s, z7.h, z2.h[3]\n"
+      ".inst 0x647b40f5  // bfdot z21.s, z7.h, z3.h[3]\n"
+      ".inst 0x647c40f9  // bfdot z25.s, z7.h, z4.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+      ".inst 0x647840ca  // bfdot z10.s, z6.h, z0.h[3]\n"
+      ".inst 0x647940ce  // bfdot z14.s, z6.h, z1.h[3]\n"
+      ".inst 0x647a40d2  // bfdot z18.s, z6.h, z2.h[3]\n"
+      ".inst 0x647b40d6  // bfdot z22.s, z6.h, z3.h[3]\n"
+      ".inst 0x647c40da  // bfdot z26.s, z6.h, z4.h[3]\n"
+      ".inst 0x647840eb  // bfdot z11.s, z7.h, z0.h[3]\n"
+      ".inst 0x647940ef  // bfdot z15.s, z7.h, z1.h[3]\n"
+      ".inst 0x647a40f3  // bfdot z19.s, z7.h, z2.h[3]\n"
+      ".inst 0x647b40f7  // bfdot z23.s, z7.h, z3.h[3]\n"
+      ".inst 0x647c40fb  // bfdot z27.s, z7.h, z4.h[3]\n"
+      "bgt 66b\n"
+      "67:"  // Height 5: Multiply loop: Single iteration only
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "whilelt p0.h, XZR, x11\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x2\n"
+      "ld1rqh { z0.h }, p0/Z, [x10]\n"
+      ".inst 0x646040c8  // bfdot z8.s, z6.h, z0.h[0]\n"
+      "ld1rqh { z1.h }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x646040e9  // bfdot z9.s, z7.h, z0.h[0]\n"
+      "ld1rqh { z2.h }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x646140cc  // bfdot z12.s, z6.h, z1.h[0]\n"
+      "ld1rqh { z3.h }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x646240d0  // bfdot z16.s, z6.h, z2.h[0]\n"
+      "ld1rqh { z4.h }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x646140ed  // bfdot z13.s, z7.h, z1.h[0]\n"
+      "add x22, x22, #0x10\n"
+      ".inst 0x646240f1  // bfdot z17.s, z7.h, z2.h[0]\n"
+      ".inst 0x646340d4  // bfdot z20.s, z6.h, z3.h[0]\n"
+      ".inst 0x646440d8  // bfdot z24.s, z6.h, z4.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      ".inst 0x646340f5  // bfdot z21.s, z7.h, z3.h[0]\n"
+      ".inst 0x646440f9  // bfdot z25.s, z7.h, z4.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      ".inst 0x646040ca  // bfdot z10.s, z6.h, z0.h[0]\n"
+      ".inst 0x646140ce  // bfdot z14.s, z6.h, z1.h[0]\n"
+      ".inst 0x646240d2  // bfdot z18.s, z6.h, z2.h[0]\n"
+      ".inst 0x646340d6  // bfdot z22.s, z6.h, z3.h[0]\n"
+      ".inst 0x646440da  // bfdot z26.s, z6.h, z4.h[0]\n"
+      ".inst 0x646040eb  // bfdot z11.s, z7.h, z0.h[0]\n"
+      ".inst 0x646140ef  // bfdot z15.s, z7.h, z1.h[0]\n"
+      ".inst 0x646240f3  // bfdot z19.s, z7.h, z2.h[0]\n"
+      ".inst 0x646340f7  // bfdot z23.s, z7.h, z3.h[0]\n"
+      ".inst 0x646440fb  // bfdot z27.s, z7.h, z4.h[0]\n"
+      "ble 68f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      ".inst 0x646840c8  // bfdot z8.s, z6.h, z0.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x2\n"
+      ".inst 0x646940cc  // bfdot z12.s, z6.h, z1.h[1]\n"
+      ".inst 0x646a40d0  // bfdot z16.s, z6.h, z2.h[1]\n"
+      ".inst 0x646b40d4  // bfdot z20.s, z6.h, z3.h[1]\n"
+      ".inst 0x646c40d8  // bfdot z24.s, z6.h, z4.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      ".inst 0x646840e9  // bfdot z9.s, z7.h, z0.h[1]\n"
+      ".inst 0x646940ed  // bfdot z13.s, z7.h, z1.h[1]\n"
+      ".inst 0x646a40f1  // bfdot z17.s, z7.h, z2.h[1]\n"
+      ".inst 0x646b40f5  // bfdot z21.s, z7.h, z3.h[1]\n"
+      ".inst 0x646c40f9  // bfdot z25.s, z7.h, z4.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      ".inst 0x646840ca  // bfdot z10.s, z6.h, z0.h[1]\n"
+      ".inst 0x646940ce  // bfdot z14.s, z6.h, z1.h[1]\n"
+      ".inst 0x646a40d2  // bfdot z18.s, z6.h, z2.h[1]\n"
+      ".inst 0x646b40d6  // bfdot z22.s, z6.h, z3.h[1]\n"
+      ".inst 0x646c40da  // bfdot z26.s, z6.h, z4.h[1]\n"
+      ".inst 0x646840eb  // bfdot z11.s, z7.h, z0.h[1]\n"
+      ".inst 0x646940ef  // bfdot z15.s, z7.h, z1.h[1]\n"
+      ".inst 0x646a40f3  // bfdot z19.s, z7.h, z2.h[1]\n"
+      ".inst 0x646b40f7  // bfdot z23.s, z7.h, z3.h[1]\n"
+      ".inst 0x646c40fb  // bfdot z27.s, z7.h, z4.h[1]\n"
+      "ble 68f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      ".inst 0x647040c8  // bfdot z8.s, z6.h, z0.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x2\n"
+      ".inst 0x647140cc  // bfdot z12.s, z6.h, z1.h[2]\n"
+      ".inst 0x647240d0  // bfdot z16.s, z6.h, z2.h[2]\n"
+      ".inst 0x647340d4  // bfdot z20.s, z6.h, z3.h[2]\n"
+      ".inst 0x647440d8  // bfdot z24.s, z6.h, z4.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      ".inst 0x647040e9  // bfdot z9.s, z7.h, z0.h[2]\n"
+      ".inst 0x647140ed  // bfdot z13.s, z7.h, z1.h[2]\n"
+      ".inst 0x647240f1  // bfdot z17.s, z7.h, z2.h[2]\n"
+      ".inst 0x647340f5  // bfdot z21.s, z7.h, z3.h[2]\n"
+      ".inst 0x647440f9  // bfdot z25.s, z7.h, z4.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      ".inst 0x647040ca  // bfdot z10.s, z6.h, z0.h[2]\n"
+      ".inst 0x647140ce  // bfdot z14.s, z6.h, z1.h[2]\n"
+      ".inst 0x647240d2  // bfdot z18.s, z6.h, z2.h[2]\n"
+      ".inst 0x647340d6  // bfdot z22.s, z6.h, z3.h[2]\n"
+      ".inst 0x647440da  // bfdot z26.s, z6.h, z4.h[2]\n"
+      ".inst 0x647040eb  // bfdot z11.s, z7.h, z0.h[2]\n"
+      ".inst 0x647140ef  // bfdot z15.s, z7.h, z1.h[2]\n"
+      ".inst 0x647240f3  // bfdot z19.s, z7.h, z2.h[2]\n"
+      ".inst 0x647340f7  // bfdot z23.s, z7.h, z3.h[2]\n"
+      ".inst 0x647440fb  // bfdot z27.s, z7.h, z4.h[2]\n"
+      "ble 68f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      ".inst 0x647840c8  // bfdot z8.s, z6.h, z0.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      ".inst 0x647940cc  // bfdot z12.s, z6.h, z1.h[3]\n"
+      ".inst 0x647a40d0  // bfdot z16.s, z6.h, z2.h[3]\n"
+      ".inst 0x647b40d4  // bfdot z20.s, z6.h, z3.h[3]\n"
+      ".inst 0x647c40d8  // bfdot z24.s, z6.h, z4.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      ".inst 0x647840e9  // bfdot z9.s, z7.h, z0.h[3]\n"
+      ".inst 0x647940ed  // bfdot z13.s, z7.h, z1.h[3]\n"
+      ".inst 0x647a40f1  // bfdot z17.s, z7.h, z2.h[3]\n"
+      ".inst 0x647b40f5  // bfdot z21.s, z7.h, z3.h[3]\n"
+      ".inst 0x647c40f9  // bfdot z25.s, z7.h, z4.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      ".inst 0x647840ca  // bfdot z10.s, z6.h, z0.h[3]\n"
+      ".inst 0x647940ce  // bfdot z14.s, z6.h, z1.h[3]\n"
+      ".inst 0x647a40d2  // bfdot z18.s, z6.h, z2.h[3]\n"
+      ".inst 0x647b40d6  // bfdot z22.s, z6.h, z3.h[3]\n"
+      ".inst 0x647c40da  // bfdot z26.s, z6.h, z4.h[3]\n"
+      ".inst 0x647840eb  // bfdot z11.s, z7.h, z0.h[3]\n"
+      ".inst 0x647940ef  // bfdot z15.s, z7.h, z1.h[3]\n"
+      ".inst 0x647a40f3  // bfdot z19.s, z7.h, z2.h[3]\n"
+      ".inst 0x647b40f7  // bfdot z23.s, z7.h, z3.h[3]\n"
+      ".inst 0x647c40fb  // bfdot z27.s, z7.h, z4.h[3]\n"
+      "68:"  // Height 5: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 63b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "tbz %x[flags], #1, 69f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z1.s }, p5/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rw { z0.s }, p5/Z, [x19]\n"
+      "fmin z8.s, p5/M, z8.s, z0.s\n"
+      "fmin z9.s, p5/M, z9.s, z0.s\n"
+      "fmin z10.s, p5/M, z10.s, z0.s\n"
+      "fmin z11.s, p5/M, z11.s, z0.s\n"
+      "fmin z12.s, p5/M, z12.s, z0.s\n"
+      "fmax z8.s, p5/M, z8.s, z1.s\n"
+      "fmax z9.s, p5/M, z9.s, z1.s\n"
+      "fmax z10.s, p5/M, z10.s, z1.s\n"
+      "fmax z11.s, p5/M, z11.s, z1.s\n"
+      "fmax z12.s, p5/M, z12.s, z1.s\n"
+      "fmin z13.s, p5/M, z13.s, z0.s\n"
+      "fmin z14.s, p5/M, z14.s, z0.s\n"
+      "fmin z15.s, p5/M, z15.s, z0.s\n"
+      "fmin z16.s, p5/M, z16.s, z0.s\n"
+      "fmax z13.s, p5/M, z13.s, z1.s\n"
+      "fmax z14.s, p5/M, z14.s, z1.s\n"
+      "fmax z15.s, p5/M, z15.s, z1.s\n"
+      "fmax z16.s, p5/M, z16.s, z1.s\n"
+      "fmin z17.s, p5/M, z17.s, z0.s\n"
+      "fmin z18.s, p5/M, z18.s, z0.s\n"
+      "fmin z19.s, p5/M, z19.s, z0.s\n"
+      "fmin z20.s, p5/M, z20.s, z0.s\n"
+      "fmax z17.s, p5/M, z17.s, z1.s\n"
+      "fmax z18.s, p5/M, z18.s, z1.s\n"
+      "fmax z19.s, p5/M, z19.s, z1.s\n"
+      "fmax z20.s, p5/M, z20.s, z1.s\n"
+      "fmin z21.s, p5/M, z21.s, z0.s\n"
+      "fmin z22.s, p5/M, z22.s, z0.s\n"
+      "fmin z23.s, p5/M, z23.s, z0.s\n"
+      "fmin z24.s, p5/M, z24.s, z0.s\n"
+      "fmax z21.s, p5/M, z21.s, z1.s\n"
+      "fmax z22.s, p5/M, z22.s, z1.s\n"
+      "fmax z23.s, p5/M, z23.s, z1.s\n"
+      "fmax z24.s, p5/M, z24.s, z1.s\n"
+      "fmin z25.s, p5/M, z25.s, z0.s\n"
+      "fmin z26.s, p5/M, z26.s, z0.s\n"
+      "fmin z27.s, p5/M, z27.s, z0.s\n"
+      "fmax z25.s, p5/M, z25.s, z1.s\n"
+      "fmax z26.s, p5/M, z26.s, z1.s\n"
+      "fmax z27.s, p5/M, z27.s, z1.s\n"
+      "69:"  // Height 5: No activation
+      "st1w { z8.s }, p4, [x13]\n"
+      "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+      "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+      "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "st1w { z12.s }, p4, [x9]\n"
+      "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+      "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+      "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+      "addvl x9, x9, #4\n"
+      "st1w { z16.s }, p4, [x27]\n"
+      "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+      "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+      "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+      "addvl x27, x27, #4\n"
+      "st1w { z20.s }, p4, [x25]\n"
+      "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+      "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+      "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+      "addvl x25, x25, #4\n"
+      "st1w { z24.s }, p4, [x23]\n"
+      "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
+      "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
+      "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+      "addvl x23, x23, #4\n"
+      "70:"  // Height 5: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x16, x16, x19\n"
+      "bgt 59b\n"
+      "b 86f\n"
+      "71:"  // Height 6
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 72f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "ldr x23, [%x[output_ptr], #0x20]\n"
+      "add x27, x27, x19, LSL #2\n"
+      "ldr x21, [%x[output_ptr], #0x28]\n"
+      "add %x[output_ptr], %x[output_ptr], #0x30\n"
+      "add x25, x25, x19, LSL #2\n"
+      "add x23, x23, x19, LSL #2\n"
+      "add x21, x21, x19, LSL #2\n"
+      "b 73f\n"
+      "72:"  // Height 6: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "add x23, x25, x19, LSL #2\n"
+      "add x21, x23, x19, LSL #2\n"
+      "add %x[output_ptr], x21, x19, LSL #2\n"
+      "73:"  // Height 6: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p3.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p2.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p1.s, x19, x16\n"
+      "cbz x14, 74f\n"
+      "ld1w { z8.s }, p5/Z, [x14]\n"
+      "mov z12.d, z8.d\n"
+      "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+      "mov z16.d, z8.d\n"
+      "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+      "mov z20.d, z8.d\n"
+      "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "mov z13.d, z9.d\n"
+      "mov z17.d, z9.d\n"
+      "mov z14.d, z10.d\n"
+      "mov z15.d, z11.d\n"
+      "mov z18.d, z10.d\n"
+      "mov z19.d, z11.d\n"
+      "mov z21.d, z9.d\n"
+      "mov z22.d, z10.d\n"
+      "mov z23.d, z11.d\n"
+      "mov z24.d, z8.d\n"
+      "mov z25.d, z9.d\n"
+      "mov z26.d, z10.d\n"
+      "mov z27.d, z11.d\n"
+      "mov z28.d, z8.d\n"
+      "mov z29.d, z9.d\n"
+      "mov z30.d, z10.d\n"
+      "mov z31.d, z11.d\n"
+      "b 76f\n"
+      "74:"  // Height 6: no bias
+      "tbz %x[flags], #0, 75f\n"
+      "ld1w { z8.s }, p4/Z, [x13]\n"
+      "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x9]\n"
+      "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x27]\n"
+      "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x25]\n"
+      "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+      "ld1w { z24.s }, p4/Z, [x23]\n"
+      "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z28.s }, p4/Z, [x21]\n"
+      "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "b 76f\n"
+      "75:"  // Height 6: no accumulate
+      "mov z8.b, #0x0\n"
+      "mov z9.b, #0x0\n"
+      "mov z10.b, #0x0\n"
+      "mov z11.b, #0x0\n"
+      "mov z12.b, #0x0\n"
+      "mov z13.b, #0x0\n"
+      "mov z14.b, #0x0\n"
+      "mov z15.b, #0x0\n"
+      "mov z16.b, #0x0\n"
+      "mov z17.b, #0x0\n"
+      "mov z18.b, #0x0\n"
+      "mov z19.b, #0x0\n"
+      "mov z20.b, #0x0\n"
+      "mov z21.b, #0x0\n"
+      "mov z22.b, #0x0\n"
+      "mov z23.b, #0x0\n"
+      "mov z24.b, #0x0\n"
+      "mov z25.b, #0x0\n"
+      "mov z26.b, #0x0\n"
+      "mov z27.b, #0x0\n"
+      "mov z28.b, #0x0\n"
+      "mov z29.b, #0x0\n"
+      "mov z30.b, #0x0\n"
+      "mov z31.b, #0x0\n"
+      "76:"  // Height 6: setup done
+      "mov x12, #0x0\n"
+      "77:"  // Height 6: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 78f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x20, [x20, #0x28]\n"
+      "cbnz x12, 79f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #1\n"
+      "add x28, x28, x19, LSL #1\n"
+      "add x26, x26, x19, LSL #1\n"
+      "add x24, x24, x19, LSL #1\n"
+      "add x22, x22, x19, LSL #1\n"
+      "add x20, x20, x19, LSL #1\n"
+      "b 79f\n"
+      "78:"  // Height 6: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #1\n"
+      "add x26, x28, x19, LSL #1\n"
+      "add x24, x26, x19, LSL #1\n"
+      "add x22, x24, x19, LSL #1\n"
+      "add x20, x22, x19, LSL #1\n"
+      "79:"  // Height 6: input setup done
+      "cmp x11, #0x8\n"
+      "ble 81f\n"
+      "80:"  // Height 6: Multiply loop: Main loop head
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "whilelt p0.h, XZR, x11\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "sub x11, x11, #0x8\n"
+      "ld1rqh { z0.h }, p0/Z, [x10]\n"
+      ".inst 0x646040c8  // bfdot z8.s, z6.h, z0.h[0]\n"
+      "ld1rqh { z1.h }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x646040e9  // bfdot z9.s, z7.h, z0.h[0]\n"
+      "ld1rqh { z2.h }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x646140cc  // bfdot z12.s, z6.h, z1.h[0]\n"
+      "ld1rqh { z3.h }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x646240d0  // bfdot z16.s, z6.h, z2.h[0]\n"
+      "ld1rqh { z4.h }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x646140ed  // bfdot z13.s, z7.h, z1.h[0]\n"
+      "ld1rqh { z5.h }, p0/Z, [x20]\n"
+      "add x22, x22, #0x10\n"
+      ".inst 0x646340d4  // bfdot z20.s, z6.h, z3.h[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x20, x20, #0x10\n"
+      ".inst 0x646440d8  // bfdot z24.s, z6.h, z4.h[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "cmp x11, #0x8\n"
+      ".inst 0x646540dc  // bfdot z28.s, z6.h, z5.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      ".inst 0x646240f1  // bfdot z17.s, z7.h, z2.h[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      ".inst 0x646340f5  // bfdot z21.s, z7.h, z3.h[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      ".inst 0x646440f9  // bfdot z25.s, z7.h, z4.h[0]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      ".inst 0x646540fd  // bfdot z29.s, z7.h, z5.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      ".inst 0x646040ca  // bfdot z10.s, z6.h, z0.h[0]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      ".inst 0x646140ce  // bfdot z14.s, z6.h, z1.h[0]\n"
+      ".inst 0x646240d2  // bfdot z18.s, z6.h, z2.h[0]\n"
+      ".inst 0x646340d6  // bfdot z22.s, z6.h, z3.h[0]\n"
+      ".inst 0x646440da  // bfdot z26.s, z6.h, z4.h[0]\n"
+      ".inst 0x646540de  // bfdot z30.s, z6.h, z5.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+      ".inst 0x646040eb  // bfdot z11.s, z7.h, z0.h[0]\n"
+      ".inst 0x646140ef  // bfdot z15.s, z7.h, z1.h[0]\n"
+      ".inst 0x646240f3  // bfdot z19.s, z7.h, z2.h[0]\n"
+      ".inst 0x646340f7  // bfdot z23.s, z7.h, z3.h[0]\n"
+      ".inst 0x646440fb  // bfdot z27.s, z7.h, z4.h[0]\n"
+      ".inst 0x646540ff  // bfdot z31.s, z7.h, z5.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+      ".inst 0x646840c8  // bfdot z8.s, z6.h, z0.h[1]\n"
+      ".inst 0x646940cc  // bfdot z12.s, z6.h, z1.h[1]\n"
+      ".inst 0x646a40d0  // bfdot z16.s, z6.h, z2.h[1]\n"
+      ".inst 0x646b40d4  // bfdot z20.s, z6.h, z3.h[1]\n"
+      ".inst 0x646c40d8  // bfdot z24.s, z6.h, z4.h[1]\n"
+      ".inst 0x646d40dc  // bfdot z28.s, z6.h, z5.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+      ".inst 0x646840e9  // bfdot z9.s, z7.h, z0.h[1]\n"
+      ".inst 0x646940ed  // bfdot z13.s, z7.h, z1.h[1]\n"
+      ".inst 0x646a40f1  // bfdot z17.s, z7.h, z2.h[1]\n"
+      ".inst 0x646b40f5  // bfdot z21.s, z7.h, z3.h[1]\n"
+      ".inst 0x646c40f9  // bfdot z25.s, z7.h, z4.h[1]\n"
+      ".inst 0x646d40fd  // bfdot z29.s, z7.h, z5.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+      "addvl x15, x15, #16\n"
+      ".inst 0x646840ca  // bfdot z10.s, z6.h, z0.h[1]\n"
+      ".inst 0x646940ce  // bfdot z14.s, z6.h, z1.h[1]\n"
+      ".inst 0x646a40d2  // bfdot z18.s, z6.h, z2.h[1]\n"
+      ".inst 0x646b40d6  // bfdot z22.s, z6.h, z3.h[1]\n"
+      ".inst 0x646c40da  // bfdot z26.s, z6.h, z4.h[1]\n"
+      ".inst 0x646d40de  // bfdot z30.s, z6.h, z5.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+      ".inst 0x646840eb  // bfdot z11.s, z7.h, z0.h[1]\n"
+      ".inst 0x646940ef  // bfdot z15.s, z7.h, z1.h[1]\n"
+      ".inst 0x646a40f3  // bfdot z19.s, z7.h, z2.h[1]\n"
+      ".inst 0x646b40f7  // bfdot z23.s, z7.h, z3.h[1]\n"
+      ".inst 0x646c40fb  // bfdot z27.s, z7.h, z4.h[1]\n"
+      ".inst 0x646d40ff  // bfdot z31.s, z7.h, z5.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+      ".inst 0x647040c8  // bfdot z8.s, z6.h, z0.h[2]\n"
+      ".inst 0x647140cc  // bfdot z12.s, z6.h, z1.h[2]\n"
+      ".inst 0x647240d0  // bfdot z16.s, z6.h, z2.h[2]\n"
+      ".inst 0x647340d4  // bfdot z20.s, z6.h, z3.h[2]\n"
+      ".inst 0x647440d8  // bfdot z24.s, z6.h, z4.h[2]\n"
+      ".inst 0x647540dc  // bfdot z28.s, z6.h, z5.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+      ".inst 0x647040e9  // bfdot z9.s, z7.h, z0.h[2]\n"
+      ".inst 0x647140ed  // bfdot z13.s, z7.h, z1.h[2]\n"
+      ".inst 0x647240f1  // bfdot z17.s, z7.h, z2.h[2]\n"
+      ".inst 0x647340f5  // bfdot z21.s, z7.h, z3.h[2]\n"
+      ".inst 0x647440f9  // bfdot z25.s, z7.h, z4.h[2]\n"
+      ".inst 0x647540fd  // bfdot z29.s, z7.h, z5.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+      ".inst 0x647040ca  // bfdot z10.s, z6.h, z0.h[2]\n"
+      ".inst 0x647140ce  // bfdot z14.s, z6.h, z1.h[2]\n"
+      ".inst 0x647240d2  // bfdot z18.s, z6.h, z2.h[2]\n"
+      ".inst 0x647340d6  // bfdot z22.s, z6.h, z3.h[2]\n"
+      ".inst 0x647440da  // bfdot z26.s, z6.h, z4.h[2]\n"
+      ".inst 0x647540de  // bfdot z30.s, z6.h, z5.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+      ".inst 0x647040eb  // bfdot z11.s, z7.h, z0.h[2]\n"
+      ".inst 0x647140ef  // bfdot z15.s, z7.h, z1.h[2]\n"
+      ".inst 0x647240f3  // bfdot z19.s, z7.h, z2.h[2]\n"
+      ".inst 0x647340f7  // bfdot z23.s, z7.h, z3.h[2]\n"
+      ".inst 0x647440fb  // bfdot z27.s, z7.h, z4.h[2]\n"
+      ".inst 0x647540ff  // bfdot z31.s, z7.h, z5.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+      ".inst 0x647840c8  // bfdot z8.s, z6.h, z0.h[3]\n"
+      ".inst 0x647940cc  // bfdot z12.s, z6.h, z1.h[3]\n"
+      ".inst 0x647a40d0  // bfdot z16.s, z6.h, z2.h[3]\n"
+      ".inst 0x647b40d4  // bfdot z20.s, z6.h, z3.h[3]\n"
+      ".inst 0x647c40d8  // bfdot z24.s, z6.h, z4.h[3]\n"
+      ".inst 0x647d40dc  // bfdot z28.s, z6.h, z5.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+      ".inst 0x647840e9  // bfdot z9.s, z7.h, z0.h[3]\n"
+      ".inst 0x647940ed  // bfdot z13.s, z7.h, z1.h[3]\n"
+      ".inst 0x647a40f1  // bfdot z17.s, z7.h, z2.h[3]\n"
+      ".inst 0x647b40f5  // bfdot z21.s, z7.h, z3.h[3]\n"
+      ".inst 0x647c40f9  // bfdot z25.s, z7.h, z4.h[3]\n"
+      ".inst 0x647d40fd  // bfdot z29.s, z7.h, z5.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+      ".inst 0x647840ca  // bfdot z10.s, z6.h, z0.h[3]\n"
+      ".inst 0x647940ce  // bfdot z14.s, z6.h, z1.h[3]\n"
+      ".inst 0x647a40d2  // bfdot z18.s, z6.h, z2.h[3]\n"
+      ".inst 0x647b40d6  // bfdot z22.s, z6.h, z3.h[3]\n"
+      ".inst 0x647c40da  // bfdot z26.s, z6.h, z4.h[3]\n"
+      ".inst 0x647d40de  // bfdot z30.s, z6.h, z5.h[3]\n"
+      ".inst 0x647840eb  // bfdot z11.s, z7.h, z0.h[3]\n"
+      ".inst 0x647940ef  // bfdot z15.s, z7.h, z1.h[3]\n"
+      ".inst 0x647a40f3  // bfdot z19.s, z7.h, z2.h[3]\n"
+      ".inst 0x647b40f7  // bfdot z23.s, z7.h, z3.h[3]\n"
+      ".inst 0x647c40fb  // bfdot z27.s, z7.h, z4.h[3]\n"
+      ".inst 0x647d40ff  // bfdot z31.s, z7.h, z5.h[3]\n"
+      "bgt 80b\n"
+      "81:"  // Height 6: Multiply loop: Single iteration only
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "whilelt p0.h, XZR, x11\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x2\n"
+      "ld1rqh { z0.h }, p0/Z, [x10]\n"
+      ".inst 0x646040c8  // bfdot z8.s, z6.h, z0.h[0]\n"
+      "ld1rqh { z1.h }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      ".inst 0x646040e9  // bfdot z9.s, z7.h, z0.h[0]\n"
+      "ld1rqh { z2.h }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      ".inst 0x646140cc  // bfdot z12.s, z6.h, z1.h[0]\n"
+      "ld1rqh { z3.h }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      ".inst 0x646240d0  // bfdot z16.s, z6.h, z2.h[0]\n"
+      "ld1rqh { z4.h }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      ".inst 0x646140ed  // bfdot z13.s, z7.h, z1.h[0]\n"
+      "ld1rqh { z5.h }, p0/Z, [x20]\n"
+      "add x22, x22, #0x10\n"
+      ".inst 0x646340d4  // bfdot z20.s, z6.h, z3.h[0]\n"
+      "add x20, x20, #0x10\n"
+      ".inst 0x646240f1  // bfdot z17.s, z7.h, z2.h[0]\n"
+      ".inst 0x646440d8  // bfdot z24.s, z6.h, z4.h[0]\n"
+      ".inst 0x646540dc  // bfdot z28.s, z6.h, z5.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      ".inst 0x646340f5  // bfdot z21.s, z7.h, z3.h[0]\n"
+      ".inst 0x646440f9  // bfdot z25.s, z7.h, z4.h[0]\n"
+      ".inst 0x646540fd  // bfdot z29.s, z7.h, z5.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      ".inst 0x646040ca  // bfdot z10.s, z6.h, z0.h[0]\n"
+      ".inst 0x646140ce  // bfdot z14.s, z6.h, z1.h[0]\n"
+      ".inst 0x646240d2  // bfdot z18.s, z6.h, z2.h[0]\n"
+      ".inst 0x646340d6  // bfdot z22.s, z6.h, z3.h[0]\n"
+      ".inst 0x646440da  // bfdot z26.s, z6.h, z4.h[0]\n"
+      ".inst 0x646540de  // bfdot z30.s, z6.h, z5.h[0]\n"
+      ".inst 0x646040eb  // bfdot z11.s, z7.h, z0.h[0]\n"
+      ".inst 0x646140ef  // bfdot z15.s, z7.h, z1.h[0]\n"
+      ".inst 0x646240f3  // bfdot z19.s, z7.h, z2.h[0]\n"
+      ".inst 0x646340f7  // bfdot z23.s, z7.h, z3.h[0]\n"
+      ".inst 0x646440fb  // bfdot z27.s, z7.h, z4.h[0]\n"
+      ".inst 0x646540ff  // bfdot z31.s, z7.h, z5.h[0]\n"
+      "ble 82f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      ".inst 0x646840c8  // bfdot z8.s, z6.h, z0.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x2\n"
+      ".inst 0x646940cc  // bfdot z12.s, z6.h, z1.h[1]\n"
+      ".inst 0x646a40d0  // bfdot z16.s, z6.h, z2.h[1]\n"
+      ".inst 0x646b40d4  // bfdot z20.s, z6.h, z3.h[1]\n"
+      ".inst 0x646c40d8  // bfdot z24.s, z6.h, z4.h[1]\n"
+      ".inst 0x646d40dc  // bfdot z28.s, z6.h, z5.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      ".inst 0x646840e9  // bfdot z9.s, z7.h, z0.h[1]\n"
+      ".inst 0x646940ed  // bfdot z13.s, z7.h, z1.h[1]\n"
+      ".inst 0x646a40f1  // bfdot z17.s, z7.h, z2.h[1]\n"
+      ".inst 0x646b40f5  // bfdot z21.s, z7.h, z3.h[1]\n"
+      ".inst 0x646c40f9  // bfdot z25.s, z7.h, z4.h[1]\n"
+      ".inst 0x646d40fd  // bfdot z29.s, z7.h, z5.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      ".inst 0x646840ca  // bfdot z10.s, z6.h, z0.h[1]\n"
+      ".inst 0x646940ce  // bfdot z14.s, z6.h, z1.h[1]\n"
+      ".inst 0x646a40d2  // bfdot z18.s, z6.h, z2.h[1]\n"
+      ".inst 0x646b40d6  // bfdot z22.s, z6.h, z3.h[1]\n"
+      ".inst 0x646c40da  // bfdot z26.s, z6.h, z4.h[1]\n"
+      ".inst 0x646d40de  // bfdot z30.s, z6.h, z5.h[1]\n"
+      ".inst 0x646840eb  // bfdot z11.s, z7.h, z0.h[1]\n"
+      ".inst 0x646940ef  // bfdot z15.s, z7.h, z1.h[1]\n"
+      ".inst 0x646a40f3  // bfdot z19.s, z7.h, z2.h[1]\n"
+      ".inst 0x646b40f7  // bfdot z23.s, z7.h, z3.h[1]\n"
+      ".inst 0x646c40fb  // bfdot z27.s, z7.h, z4.h[1]\n"
+      ".inst 0x646d40ff  // bfdot z31.s, z7.h, z5.h[1]\n"
+      "ble 82f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      ".inst 0x647040c8  // bfdot z8.s, z6.h, z0.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x2\n"
+      ".inst 0x647140cc  // bfdot z12.s, z6.h, z1.h[2]\n"
+      ".inst 0x647240d0  // bfdot z16.s, z6.h, z2.h[2]\n"
+      ".inst 0x647340d4  // bfdot z20.s, z6.h, z3.h[2]\n"
+      ".inst 0x647440d8  // bfdot z24.s, z6.h, z4.h[2]\n"
+      ".inst 0x647540dc  // bfdot z28.s, z6.h, z5.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      ".inst 0x647040e9  // bfdot z9.s, z7.h, z0.h[2]\n"
+      ".inst 0x647140ed  // bfdot z13.s, z7.h, z1.h[2]\n"
+      ".inst 0x647240f1  // bfdot z17.s, z7.h, z2.h[2]\n"
+      ".inst 0x647340f5  // bfdot z21.s, z7.h, z3.h[2]\n"
+      ".inst 0x647440f9  // bfdot z25.s, z7.h, z4.h[2]\n"
+      ".inst 0x647540fd  // bfdot z29.s, z7.h, z5.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      ".inst 0x647040ca  // bfdot z10.s, z6.h, z0.h[2]\n"
+      ".inst 0x647140ce  // bfdot z14.s, z6.h, z1.h[2]\n"
+      ".inst 0x647240d2  // bfdot z18.s, z6.h, z2.h[2]\n"
+      ".inst 0x647340d6  // bfdot z22.s, z6.h, z3.h[2]\n"
+      ".inst 0x647440da  // bfdot z26.s, z6.h, z4.h[2]\n"
+      ".inst 0x647540de  // bfdot z30.s, z6.h, z5.h[2]\n"
+      ".inst 0x647040eb  // bfdot z11.s, z7.h, z0.h[2]\n"
+      ".inst 0x647140ef  // bfdot z15.s, z7.h, z1.h[2]\n"
+      ".inst 0x647240f3  // bfdot z19.s, z7.h, z2.h[2]\n"
+      ".inst 0x647340f7  // bfdot z23.s, z7.h, z3.h[2]\n"
+      ".inst 0x647440fb  // bfdot z27.s, z7.h, z4.h[2]\n"
+      ".inst 0x647540ff  // bfdot z31.s, z7.h, z5.h[2]\n"
+      "ble 82f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      ".inst 0x647840c8  // bfdot z8.s, z6.h, z0.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      ".inst 0x647940cc  // bfdot z12.s, z6.h, z1.h[3]\n"
+      ".inst 0x647a40d0  // bfdot z16.s, z6.h, z2.h[3]\n"
+      ".inst 0x647b40d4  // bfdot z20.s, z6.h, z3.h[3]\n"
+      ".inst 0x647c40d8  // bfdot z24.s, z6.h, z4.h[3]\n"
+      ".inst 0x647d40dc  // bfdot z28.s, z6.h, z5.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      ".inst 0x647840e9  // bfdot z9.s, z7.h, z0.h[3]\n"
+      ".inst 0x647940ed  // bfdot z13.s, z7.h, z1.h[3]\n"
+      ".inst 0x647a40f1  // bfdot z17.s, z7.h, z2.h[3]\n"
+      ".inst 0x647b40f5  // bfdot z21.s, z7.h, z3.h[3]\n"
+      ".inst 0x647c40f9  // bfdot z25.s, z7.h, z4.h[3]\n"
+      ".inst 0x647d40fd  // bfdot z29.s, z7.h, z5.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      ".inst 0x647840ca  // bfdot z10.s, z6.h, z0.h[3]\n"
+      ".inst 0x647940ce  // bfdot z14.s, z6.h, z1.h[3]\n"
+      ".inst 0x647a40d2  // bfdot z18.s, z6.h, z2.h[3]\n"
+      ".inst 0x647b40d6  // bfdot z22.s, z6.h, z3.h[3]\n"
+      ".inst 0x647c40da  // bfdot z26.s, z6.h, z4.h[3]\n"
+      ".inst 0x647d40de  // bfdot z30.s, z6.h, z5.h[3]\n"
+      ".inst 0x647840eb  // bfdot z11.s, z7.h, z0.h[3]\n"
+      ".inst 0x647940ef  // bfdot z15.s, z7.h, z1.h[3]\n"
+      ".inst 0x647a40f3  // bfdot z19.s, z7.h, z2.h[3]\n"
+      ".inst 0x647b40f7  // bfdot z23.s, z7.h, z3.h[3]\n"
+      ".inst 0x647c40fb  // bfdot z27.s, z7.h, z4.h[3]\n"
+      ".inst 0x647d40ff  // bfdot z31.s, z7.h, z5.h[3]\n"
+      "82:"  // Height 6: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 77b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "prfm pstl1keep, [x21, #0x0]\n"
+      "tbz %x[flags], #1, 83f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z1.s }, p5/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rw { z0.s }, p5/Z, [x19]\n"
+      "fmin z8.s, p5/M, z8.s, z0.s\n"
+      "fmin z9.s, p5/M, z9.s, z0.s\n"
+      "fmin z10.s, p5/M, z10.s, z0.s\n"
+      "fmin z11.s, p5/M, z11.s, z0.s\n"
+      "fmin z12.s, p5/M, z12.s, z0.s\n"
+      "fmax z8.s, p5/M, z8.s, z1.s\n"
+      "fmax z9.s, p5/M, z9.s, z1.s\n"
+      "fmax z10.s, p5/M, z10.s, z1.s\n"
+      "fmax z11.s, p5/M, z11.s, z1.s\n"
+      "fmax z12.s, p5/M, z12.s, z1.s\n"
+      "fmin z13.s, p5/M, z13.s, z0.s\n"
+      "fmin z14.s, p5/M, z14.s, z0.s\n"
+      "fmin z15.s, p5/M, z15.s, z0.s\n"
+      "fmin z16.s, p5/M, z16.s, z0.s\n"
+      "fmax z13.s, p5/M, z13.s, z1.s\n"
+      "fmax z14.s, p5/M, z14.s, z1.s\n"
+      "fmax z15.s, p5/M, z15.s, z1.s\n"
+      "fmax z16.s, p5/M, z16.s, z1.s\n"
+      "fmin z17.s, p5/M, z17.s, z0.s\n"
+      "fmin z18.s, p5/M, z18.s, z0.s\n"
+      "fmin z19.s, p5/M, z19.s, z0.s\n"
+      "fmin z20.s, p5/M, z20.s, z0.s\n"
+      "fmax z17.s, p5/M, z17.s, z1.s\n"
+      "fmax z18.s, p5/M, z18.s, z1.s\n"
+      "fmax z19.s, p5/M, z19.s, z1.s\n"
+      "fmax z20.s, p5/M, z20.s, z1.s\n"
+      "fmin z21.s, p5/M, z21.s, z0.s\n"
+      "fmin z22.s, p5/M, z22.s, z0.s\n"
+      "fmin z23.s, p5/M, z23.s, z0.s\n"
+      "fmin z24.s, p5/M, z24.s, z0.s\n"
+      "fmax z21.s, p5/M, z21.s, z1.s\n"
+      "fmax z22.s, p5/M, z22.s, z1.s\n"
+      "fmax z23.s, p5/M, z23.s, z1.s\n"
+      "fmax z24.s, p5/M, z24.s, z1.s\n"
+      "fmin z25.s, p5/M, z25.s, z0.s\n"
+      "fmin z26.s, p5/M, z26.s, z0.s\n"
+      "fmin z27.s, p5/M, z27.s, z0.s\n"
+      "fmin z28.s, p5/M, z28.s, z0.s\n"
+      "fmax z25.s, p5/M, z25.s, z1.s\n"
+      "fmax z26.s, p5/M, z26.s, z1.s\n"
+      "fmax z27.s, p5/M, z27.s, z1.s\n"
+      "fmax z28.s, p5/M, z28.s, z1.s\n"
+      "fmin z29.s, p5/M, z29.s, z0.s\n"
+      "fmin z30.s, p5/M, z30.s, z0.s\n"
+      "fmin z31.s, p5/M, z31.s, z0.s\n"
+      "fmax z29.s, p5/M, z29.s, z1.s\n"
+      "fmax z30.s, p5/M, z30.s, z1.s\n"
+      "fmax z31.s, p5/M, z31.s, z1.s\n"
+      "83:"  // Height 6: No activation
+      "st1w { z8.s }, p4, [x13]\n"
+      "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+      "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+      "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "st1w { z12.s }, p4, [x9]\n"
+      "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+      "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+      "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+      "addvl x9, x9, #4\n"
+      "st1w { z16.s }, p4, [x27]\n"
+      "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+      "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+      "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+      "addvl x27, x27, #4\n"
+      "st1w { z20.s }, p4, [x25]\n"
+      "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+      "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+      "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+      "addvl x25, x25, #4\n"
+      "st1w { z24.s }, p4, [x23]\n"
+      "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
+      "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
+      "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+      "addvl x23, x23, #4\n"
+      "st1w { z28.s }, p4, [x21]\n"
+      "st1w { z29.s }, p3, [x21, #1, MUL VL]\n"
+      "st1w { z30.s }, p2, [x21, #2, MUL VL]\n"
+      "st1w { z31.s }, p1, [x21, #3, MUL VL]\n"
+      "addvl x21, x21, #4\n"
+      "84:"  // Height 6: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x16, x16, x19\n"
+      "bgt 73b\n"
+      "subs %x[M], %x[M], #0x6\n"
+      "beq 86f\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "tbz %x[flags], #3, 85f\n"
+      "add x20, x20, #0x6\n"
+      "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "b 1b\n"
+      "85:"  // Update direct input
+      "mov x19, #0xc\n"
+      "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+      "b 1b\n"
+      "86:"  // Exit
+
+      : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+      : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+      : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+    );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp
deleted file mode 100644
index 641e5c12fd..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-#include "../bfloat.hpp"
-#include "../std_transforms_sve.hpp"
-
-namespace arm_gemm
-{
-
-// Actual kernel implementation (defined in sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp).
-void sve_hybrid_bf16fp32_mmla_4VLx4(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool); // argument order per that definition: A, lda, B, C, ldc, M, N, K, bias, act, accumulate
-
-class hybrid_bf16fp32_mmla_4VLx4 // Descriptor for the kernel declared above: type aliases, blocking constants, capability flags and the entry-point pointer.
-{
-public:
-    typedef bfloat16 operand_type; // used as the first template argument of `transforms` below
-    typedef float result_type; // used as the second template argument of `transforms` below
-
-    typedef void (*kern_type)(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool); // same parameter list as sve_hybrid_bf16fp32_mmla_4VLx4
-
-    /* Kernel blocking parameters */
-    static constexpr unsigned int out_height()
-    {
-        return 8;
-    }
-
-    static unsigned int out_width() // not constexpr, unlike its siblings: the value depends on get_vector_length<float>()
-    {
-        return get_vector_length<float>() * 2;
-    }
-
-    static constexpr unsigned int k_unroll()
-    {
-        return 4;
-    }
-
-    static constexpr bool supports_accumulate() // this and the two capability queries below are compile-time constants, all true
-    {
-        return true;
-    }
-
-    static constexpr bool supports_bias()
-    {
-        return true;
-    }
-
-    static constexpr bool supports_activation()
-    {
-        return true;
-    }
-
-    StdTransformsSVE<operand_type, result_type, 4, 4, 4> transforms = {}; // NOTE(review): the meaning of the 4, 4, 4 arguments is defined by StdTransformsSVE, which is not visible here
-
-    // Default to the generic kernel
-    kern_type kernel=sve_hybrid_bf16fp32_mmla_4VLx4;
-
-    hybrid_bf16fp32_mmla_4VLx4(const CPUInfo *) // the CPUInfo argument is unnamed and unused; `kernel` keeps its default
-    {
-
-    }
-};
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp
deleted file mode 100644
index 76e3546c6f..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp
+++ /dev/null
@@ -1,3459 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include "../../bfloat.hpp"
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_bf16fp32_mmla_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
-    const int K_stride = ((K + 3) / 4) * 4;
-    const long loops_count = ((K + 8) / 16) - 1;
-    K -= loops_count * 16;
-    const long regs_count = (K / 8) - 1;
-    K -= (regs_count + 1) * 8;
-    const long leftovers = K;
-    const long blocks_count = (K + 3) / 4;
-    float nullbias[128];
-    if (!accumulate && !bias) {
-        memset(nullbias, 0, (2 * get_vector_length<float>() * sizeof(float)));
-    }
-    float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
-    float maxval =   static_cast<float>(std::numeric_limits<float>::infinity());
-    const float * const minptr = &minval;
-    const float * const maxptr = &maxval;
-
-    switch(act.type)
-    {
-        default:
-        case Activation::Type::None:
-            break;
-        case Activation::Type::BoundedReLU:
-            maxval = static_cast<float>(act.param1);
-            /* fall through */
-        case Activation::Type::ReLU:
-            minval = 0.0f;
-            break;
-    }
-
-    int rows_to_compute;
-
-    for (int y=0; y<M; y+=rows_to_compute) {
-        const bfloat16 * const a_ptr0_base = A + (y * lda);
-        const unsigned long ldab = lda * sizeof(bfloat16);
-
-        float *c_ptr0 = C + (y * ldc);
-
-        rows_to_compute = M-y;
-        if (rows_to_compute > 8) {
-            if (rows_to_compute % 8) {
-                rows_to_compute = 8 - 1;
-            } else {
-                rows_to_compute = 8;
-            }
-        }
-
-        for (int x0=0; x0<N; x0+=(2 * get_vector_length<float>())) {
-            const long width = std::min((unsigned long)N-x0, (2 * get_vector_length<float>()));
-            long loops = loops_count;
-            long regs = regs_count;
-            long temp = 0;
-            long blocks = blocks_count;
-            const bfloat16 *a_ptr0 = a_ptr0_base;
-            const bfloat16 *b_ptr0 = B + (K_stride * x0);
-            const unsigned long ldcb = ldc * sizeof(float);
-            const float *biasptr = bias ? bias+x0 : nullbias;
-
-            switch(rows_to_compute) {
-                case 1:
-                    __asm __volatile (
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "mov z1.h, #0\n"
-                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "zip1 z16.s, z15.s, z15.s\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip2 z17.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "zip1 z18.s, z15.s, z15.s\n"
-                        "zip2 z19.s, z15.s, z15.s\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "mov z14.s, #0\n"
-                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
-                        "mov z1.h, #0\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "zip1 z16.s, z13.s, z14.s\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "zip2 z17.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "mov z14.s, #0\n"
-                        "zip1 z18.s, z13.s, z14.s\n"
-                        "zip2 z19.s, z13.s, z14.s\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z5.h, #0\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "mov z1.h, #0\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z5.h, #0\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "mov z1.h, #0\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #-4\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z5.h, #0\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "uzp1 z0.s, z16.s, z17.s\n"
-                        "uzp1 z1.s, z18.s, z19.s\n"
-                        "st1w z0.s, p0, [%[c_ptr0]]\n"
-                        "st1w z1.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #2\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
-                    );
-                    break;
-                case 2:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z15.s, z15.s\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "zip2 z17.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "zip1 z18.s, z15.s, z15.s\n"
-                        "zip2 z19.s, z15.s, z15.s\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z14.s, p0/z, [c_ptr1]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z13.s, z14.s\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "zip2 z17.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "zip1 z18.s, z13.s, z14.s\n"
-                        "zip2 z19.s, z13.s, z14.s\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #-4\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "uzp1 z0.s, z16.s, z17.s\n"
-                        "uzp2 z1.s, z16.s, z17.s\n"
-                        "uzp1 z2.s, z18.s, z19.s\n"
-                        "uzp2 z3.s, z18.s, z19.s\n"
-                        "st1w z0.s, p0, [%[c_ptr0]]\n"
-                        "st1w z1.s, p0, [c_ptr1]\n"
-                        "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #2\n"
-                        "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
-                    );
-                    break;
-                case 3:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "mov z3.h, #0\n"
-                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z15.s, z15.s\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "zip2 z17.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "zip1 z18.s, z15.s, z15.s\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "zip2 z19.s, z15.s, z15.s\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "mov z20.d, z16.d\n"
-                        "mov z21.d, z17.d\n"
-                        "mov z22.d, z18.d\n"
-                        "mov z23.d, z19.d\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "mov z3.h, #0\n"
-                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z14.s, p0/z, [c_ptr1]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z13.s, z14.s\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "zip2 z17.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "zip1 z18.s, z13.s, z14.s\n"
-                        "zip2 z19.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p0/z, [c_ptr2]\n"
-                        "mov z14.s, #0\n"
-                        "zip1 z20.s, z13.s, z14.s\n"
-                        "zip2 z21.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
-                        "mov z14.s, #0\n"
-                        "zip1 z22.s, z13.s, z14.s\n"
-                        "zip2 z23.s, z13.s, z14.s\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z7.h, #0\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "mov z3.h, #0\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z7.h, #0\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "mov z3.h, #0\n"
-                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #-4\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1rqh z6.h, p6/z, [a_ptr2]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z7.h, #0\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "uzp1 z0.s, z16.s, z17.s\n"
-                        "uzp2 z1.s, z16.s, z17.s\n"
-                        "uzp1 z2.s, z18.s, z19.s\n"
-                        "uzp2 z3.s, z18.s, z19.s\n"
-                        "st1w z0.s, p0, [%[c_ptr0]]\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "st1w z1.s, p0, [c_ptr1]\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #2\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "uzp1 z4.s, z20.s, z21.s\n"
-                        "uzp1 z5.s, z22.s, z23.s\n"
-                        "st1w z4.s, p0, [c_ptr2]\n"
-                        "st1w z5.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
-                    );
-                    break;
-                case 4:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z15.s, z15.s\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "zip2 z17.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "zip1 z18.s, z15.s, z15.s\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip2 z19.s, z15.s, z15.s\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z20.d, z16.d\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "mov z21.d, z17.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "mov z22.d, z18.d\n"
-                        "mov z23.d, z19.d\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z14.s, p0/z, [c_ptr1]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z13.s, z14.s\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "zip2 z17.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "zip1 z18.s, z13.s, z14.s\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "zip2 z19.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p0/z, [c_ptr2]\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "ld1w z14.s, p0/z, [c_ptr3]\n"
-                        "zip1 z20.s, z13.s, z14.s\n"
-                        "zip2 z21.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
-                        "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
-                        "zip1 z22.s, z13.s, z14.s\n"
-                        "zip2 z23.s, z13.s, z14.s\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "ld1rqh z7.h, p7/z, [a_ptr3]\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "ld1rqh z7.h, p7/z, [a_ptr3]\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "addvl a_ptr3, a_ptr3, #2\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #-4\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1rqh z6.h, p6/z, [a_ptr2]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "ld1rqh z7.h, p6/z, [a_ptr3]\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        "addvl a_ptr3, a_ptr3, #1\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "uzp1 z0.s, z16.s, z17.s\n"
-                        "uzp2 z1.s, z16.s, z17.s\n"
-                        "uzp1 z2.s, z18.s, z19.s\n"
-                        "uzp2 z3.s, z18.s, z19.s\n"
-                        "st1w z0.s, p0, [%[c_ptr0]]\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "st1w z1.s, p0, [c_ptr1]\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #2\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "uzp1 z4.s, z20.s, z21.s\n"
-                        "uzp2 z5.s, z20.s, z21.s\n"
-                        "uzp1 z6.s, z22.s, z23.s\n"
-                        "st1w z4.s, p0, [c_ptr2]\n"
-                        "uzp2 z7.s, z22.s, z23.s\n"
-                        "st1w z5.s, p0, [c_ptr3]\n"
-                        "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
-                    );
-                    break;
-                case 5:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "a_ptr4 .req X3\n"
-                        "c_ptr1 .req X4\n"
-                        "c_ptr2 .req X5\n"
-                        "c_ptr3 .req X6\n"
-                        "c_ptr4 .req X7\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "add a_ptr4, a_ptr3, %[lda]\n"
-                        "add c_ptr4, c_ptr3, %[ldc]\n"
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "mov z5.h, #0\n"
-                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z15.s, z15.s\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "zip2 z17.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3]\n"
-                        "ld1rqh z4.h, p7/z, [a_ptr4]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "zip1 z18.s, z15.s, z15.s\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "zip2 z19.s, z15.s, z15.s\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z20.d, z16.d\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "mov z21.d, z17.d\n"
-                        "add a_ptr4, a_ptr4, #0x10\n"
-                        "mov z22.d, z18.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "mov z23.d, z19.d\n"
-                        "mov z24.d, z16.d\n"
-                        "mov z25.d, z17.d\n"
-                        "mov z26.d, z18.d\n"
-                        "mov z27.d, z19.d\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "mov z5.h, #0\n"
-                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z14.s, p0/z, [c_ptr1]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z13.s, z14.s\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "zip2 z17.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ld1rqh z4.h, p7/z, [a_ptr4]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "zip1 z18.s, z13.s, z14.s\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "zip2 z19.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p0/z, [c_ptr2]\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "ld1w z14.s, p0/z, [c_ptr3]\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add a_ptr4, a_ptr4, #0x10\n"
-                        "zip1 z20.s, z13.s, z14.s\n"
-                        "zip2 z21.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
-                        "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
-                        "zip1 z22.s, z13.s, z14.s\n"
-                        "zip2 z23.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p0/z, [c_ptr4]\n"
-                        "mov z14.s, #0\n"
-                        "zip1 z24.s, z13.s, z14.s\n"
-                        "zip2 z25.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
-                        "mov z14.s, #0\n"
-                        "zip1 z26.s, z13.s, z14.s\n"
-                        "zip2 z27.s, z13.s, z14.s\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqh z7.h, p7/z, [a_ptr3]\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "ld1rqh z8.h, p7/z, [a_ptr4]\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
-                        ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z9.h, #0\n"
-                        "add a_ptr4, a_ptr4, #0x20\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        "trn2 z10.d, z8.d, z9.d\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z2.d, z8.d, z9.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1rqh z4.h, p7/z, [a_ptr4, #-0x10]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
-                        ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "mov z5.h, #0\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqh z7.h, p7/z, [a_ptr3]\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "addvl a_ptr3, a_ptr3, #2\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "ld1rqh z8.h, p7/z, [a_ptr4]\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z9.h, #0\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        "trn2 z10.d, z8.d, z9.d\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z2.d, z8.d, z9.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1rqh z4.h, p6/z, [a_ptr4, #0x10]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "addvl a_ptr4, a_ptr4, #2\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
-                        ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "mov z5.h, #0\n"
-                        "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #-4\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
-                        ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
-                        ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
-                        ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
-                        ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
-                        ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
-                        ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
-                        ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
-                        ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
-                        ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z6.h, p6/z, [a_ptr2]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqh z7.h, p6/z, [a_ptr3]\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "ld1rqh z8.h, p6/z, [a_ptr4]\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        "addvl a_ptr3, a_ptr3, #1\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        "addvl a_ptr4, a_ptr4, #1\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z9.h, #0\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
-                        ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
-                        ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
-                        ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
-                        "trn1 z2.d, z8.d, z9.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z10.d, z8.d, z9.d\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
-                        ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
-                        ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
-                        ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
-                        ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
-                        ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
-                        ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "uzp1 z0.s, z16.s, z17.s\n"
-                        "uzp2 z1.s, z16.s, z17.s\n"
-                        "uzp1 z2.s, z18.s, z19.s\n"
-                        "uzp2 z3.s, z18.s, z19.s\n"
-                        "st1w z0.s, p0, [%[c_ptr0]]\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "st1w z1.s, p0, [c_ptr1]\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "fmax z24.s, p7/m, z24.s, z14.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #2\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "uzp1 z4.s, z20.s, z21.s\n"
-                        "uzp2 z5.s, z20.s, z21.s\n"
-                        "fmin z24.s, p7/m, z24.s, z15.s\n"
-                        "uzp1 z6.s, z22.s, z23.s\n"
-                        "st1w z4.s, p0, [c_ptr2]\n"
-                        "uzp2 z7.s, z22.s, z23.s\n"
-                        "fmax z25.s, p7/m, z25.s, z14.s\n"
-                        "fmax z26.s, p7/m, z26.s, z14.s\n"
-                        "st1w z5.s, p0, [c_ptr3]\n"
-                        "fmax z27.s, p7/m, z27.s, z14.s\n"
-                        "fmin z25.s, p7/m, z25.s, z15.s\n"
-                        "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "fmin z26.s, p7/m, z26.s, z15.s\n"
-                        "fmin z27.s, p7/m, z27.s, z15.s\n"
-                        "uzp1 z8.s, z24.s, z25.s\n"
-                        "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
-                        "uzp1 z9.s, z26.s, z27.s\n"
-                        "st1w z8.s, p0, [c_ptr4]\n"
-                        "st1w z9.s, p1, [c_ptr4, #1, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq a_ptr4\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        ".unreq c_ptr4\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
-                    );
-                    break;
-                case 6:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "a_ptr4 .req X3\n"
-                        "a_ptr5 .req X4\n"
-                        "c_ptr1 .req X5\n"
-                        "c_ptr2 .req X6\n"
-                        "c_ptr3 .req X7\n"
-                        "c_ptr4 .req X8\n"
-                        "c_ptr5 .req X9\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "add a_ptr4, a_ptr3, %[lda]\n"
-                        "add c_ptr4, c_ptr3, %[ldc]\n"
-                        "add a_ptr5, a_ptr4, %[lda]\n"
-                        "add c_ptr5, c_ptr4, %[ldc]\n"
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z15.s, z15.s\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "zip2 z17.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3]\n"
-                        "ld1rqh z4.h, p7/z, [a_ptr4]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "zip1 z18.s, z15.s, z15.s\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr5]\n"
-                        "zip2 z19.s, z15.s, z15.s\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "mov z20.d, z16.d\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z21.d, z17.d\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "mov z22.d, z18.d\n"
-                        "add a_ptr4, a_ptr4, #0x10\n"
-                        "mov z23.d, z19.d\n"
-                        "add a_ptr5, a_ptr5, #0x10\n"
-                        "mov z24.d, z16.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "mov z25.d, z17.d\n"
-                        "mov z26.d, z18.d\n"
-                        "mov z27.d, z19.d\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z14.s, p0/z, [c_ptr1]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z13.s, z14.s\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "zip2 z17.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ld1rqh z4.h, p7/z, [a_ptr4]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "zip1 z18.s, z13.s, z14.s\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr5]\n"
-                        "zip2 z19.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p0/z, [c_ptr2]\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "ld1w z14.s, p0/z, [c_ptr3]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "add a_ptr4, a_ptr4, #0x10\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "zip1 z20.s, z13.s, z14.s\n"
-                        "add a_ptr5, a_ptr5, #0x10\n"
-                        "zip2 z21.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
-                        "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
-                        "zip1 z22.s, z13.s, z14.s\n"
-                        "zip2 z23.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p0/z, [c_ptr4]\n"
-                        "ld1w z14.s, p0/z, [c_ptr5]\n"
-                        "zip1 z24.s, z13.s, z14.s\n"
-                        "zip2 z25.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
-                        "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n"
-                        "zip1 z26.s, z13.s, z14.s\n"
-                        "zip2 z27.s, z13.s, z14.s\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqh z7.h, p7/z, [a_ptr3]\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "ld1rqh z8.h, p7/z, [a_ptr4]\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "ld1rqh z9.h, p7/z, [a_ptr5]\n"
-                        ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "trn2 z10.d, z8.d, z9.d\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "add a_ptr4, a_ptr4, #0x20\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        "add a_ptr5, a_ptr5, #0x20\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z2.d, z8.d, z9.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1rqh z4.h, p7/z, [a_ptr4, #-0x10]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr5, #-0x10]\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
-                        ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqh z7.h, p7/z, [a_ptr3]\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "addvl a_ptr3, a_ptr3, #2\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "ld1rqh z8.h, p7/z, [a_ptr4]\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "ld1rqh z9.h, p7/z, [a_ptr5]\n"
-                        ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "trn2 z10.d, z8.d, z9.d\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z2.d, z8.d, z9.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1rqh z4.h, p6/z, [a_ptr4, #0x10]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "ld1rqh z5.h, p6/z, [a_ptr5, #0x10]\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        "addvl a_ptr4, a_ptr4, #2\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        "addvl a_ptr5, a_ptr5, #2\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
-                        ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #-4\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
-                        ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
-                        ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
-                        ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
-                        ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
-                        ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
-                        ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
-                        ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
-                        ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
-                        ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z6.h, p6/z, [a_ptr2]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqh z7.h, p6/z, [a_ptr3]\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "ld1rqh z8.h, p6/z, [a_ptr4]\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        "addvl a_ptr3, a_ptr3, #1\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        "addvl a_ptr4, a_ptr4, #1\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "ld1rqh z9.h, p6/z, [a_ptr5]\n"
-                        ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        "addvl a_ptr5, a_ptr5, #1\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
-                        ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
-                        ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
-                        ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
-                        "trn1 z2.d, z8.d, z9.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z10.d, z8.d, z9.d\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
-                        ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
-                        ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
-                        ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
-                        ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
-                        ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
-                        ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "uzp1 z0.s, z16.s, z17.s\n"
-                        "uzp2 z1.s, z16.s, z17.s\n"
-                        "uzp1 z2.s, z18.s, z19.s\n"
-                        "uzp2 z3.s, z18.s, z19.s\n"
-                        "st1w z0.s, p0, [%[c_ptr0]]\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "st1w z1.s, p0, [c_ptr1]\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "fmax z24.s, p7/m, z24.s, z14.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #2\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "uzp1 z4.s, z20.s, z21.s\n"
-                        "uzp2 z5.s, z20.s, z21.s\n"
-                        "fmin z24.s, p7/m, z24.s, z15.s\n"
-                        "uzp1 z6.s, z22.s, z23.s\n"
-                        "st1w z4.s, p0, [c_ptr2]\n"
-                        "uzp2 z7.s, z22.s, z23.s\n"
-                        "fmax z25.s, p7/m, z25.s, z14.s\n"
-                        "fmax z26.s, p7/m, z26.s, z14.s\n"
-                        "st1w z5.s, p0, [c_ptr3]\n"
-                        "fmax z27.s, p7/m, z27.s, z14.s\n"
-                        "fmin z25.s, p7/m, z25.s, z15.s\n"
-                        "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "fmin z26.s, p7/m, z26.s, z15.s\n"
-                        "fmin z27.s, p7/m, z27.s, z15.s\n"
-                        "uzp1 z8.s, z24.s, z25.s\n"
-                        "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
-                        "uzp2 z9.s, z24.s, z25.s\n"
-                        "uzp1 z10.s, z26.s, z27.s\n"
-                        "uzp2 z11.s, z26.s, z27.s\n"
-                        "st1w z8.s, p0, [c_ptr4]\n"
-                        "st1w z9.s, p0, [c_ptr5]\n"
-                        "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n"
-                        "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq a_ptr4\n"
-                        ".unreq a_ptr5\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        ".unreq c_ptr4\n"
-                        ".unreq c_ptr5\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
-                    );
-                    break;
-                case 7:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "a_ptr4 .req X3\n"
-                        "a_ptr5 .req X4\n"
-                        "a_ptr6 .req X5\n"
-                        "c_ptr1 .req X6\n"
-                        "c_ptr2 .req X7\n"
-                        "c_ptr3 .req X8\n"
-                        "c_ptr4 .req X9\n"
-                        "c_ptr5 .req X10\n"
-                        "c_ptr6 .req X11\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "add a_ptr4, a_ptr3, %[lda]\n"
-                        "add c_ptr4, c_ptr3, %[ldc]\n"
-                        "add a_ptr5, a_ptr4, %[lda]\n"
-                        "add c_ptr5, c_ptr4, %[ldc]\n"
-                        "add a_ptr6, a_ptr5, %[lda]\n"
-                        "add c_ptr6, c_ptr5, %[ldc]\n"
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "mov z7.h, #0\n"
-                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z15.s, z15.s\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "zip2 z17.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3]\n"
-                        "ld1rqh z4.h, p7/z, [a_ptr4]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "zip1 z18.s, z15.s, z15.s\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr5]\n"
-                        "zip2 z19.s, z15.s, z15.s\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr6]\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "mov z20.d, z16.d\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "trn1 z11.d, z6.d, z7.d\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z21.d, z17.d\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "mov z22.d, z18.d\n"
-                        "add a_ptr4, a_ptr4, #0x10\n"
-                        "mov z23.d, z19.d\n"
-                        "add a_ptr5, a_ptr5, #0x10\n"
-                        "mov z24.d, z16.d\n"
-                        "add a_ptr6, a_ptr6, #0x10\n"
-                        "mov z25.d, z17.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "mov z26.d, z18.d\n"
-                        "mov z27.d, z19.d\n"
-                        "mov z28.d, z16.d\n"
-                        "mov z29.d, z17.d\n"
-                        "mov z30.d, z18.d\n"
-                        "mov z31.d, z19.d\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "mov z7.h, #0\n"
-                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z14.s, p0/z, [c_ptr1]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z13.s, z14.s\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "zip2 z17.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ld1rqh z4.h, p7/z, [a_ptr4]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "zip1 z18.s, z13.s, z14.s\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr5]\n"
-                        "zip2 z19.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p0/z, [c_ptr2]\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "ld1w z14.s, p0/z, [c_ptr3]\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr6]\n"
-                        "add a_ptr4, a_ptr4, #0x10\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "zip1 z20.s, z13.s, z14.s\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "zip2 z21.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
-                        "trn1 z11.d, z6.d, z7.d\n"
-                        "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
-                        "add a_ptr5, a_ptr5, #0x10\n"
-                        "add a_ptr6, a_ptr6, #0x10\n"
-                        "zip1 z22.s, z13.s, z14.s\n"
-                        "zip2 z23.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p0/z, [c_ptr4]\n"
-                        "ld1w z14.s, p0/z, [c_ptr5]\n"
-                        "zip1 z24.s, z13.s, z14.s\n"
-                        "zip2 z25.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
-                        "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n"
-                        "zip1 z26.s, z13.s, z14.s\n"
-                        "zip2 z27.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p0/z, [c_ptr6]\n"
-                        "mov z14.s, #0\n"
-                        "zip1 z28.s, z13.s, z14.s\n"
-                        "zip2 z29.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [c_ptr6, #1, MUL VL]\n"
-                        "mov z14.s, #0\n"
-                        "zip1 z30.s, z13.s, z14.s\n"
-                        "zip2 z31.s, z13.s, z14.s\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        "trn2 z3.d, z6.d, z7.d\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "ld1rqh z7.h, p7/z, [a_ptr3]\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "ld1rqh z8.h, p7/z, [a_ptr4]\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "ld1rqh z9.h, p7/z, [a_ptr5]\n"
-                        ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
-                        "add a_ptr4, a_ptr4, #0x20\n"
-                        ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
-                        "add a_ptr5, a_ptr5, #0x20\n"
-                        ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
-                        ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
-                        "ld1rqh z10.h, p7/z, [a_ptr6]\n"
-                        ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z11.h, #0\n"
-                        "add a_ptr6, a_ptr6, #0x20\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
-                        ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
-                        ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
-                        ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
-                        "trn1 z2.d, z8.d, z9.d\n"
-                        ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z3.d, z10.d, z11.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "trn2 z11.d, z10.d, z11.d\n"
-                        "trn2 z10.d, z8.d, z9.d\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr6, #-0x10]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1rqh z4.h, p7/z, [a_ptr4, #-0x10]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr5, #-0x10]\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
-                        ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
-                        ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
-                        ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
-                        ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
-                        ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "mov z7.h, #0\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
-                        ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
-                        ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
-                        ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "trn1 z11.d, z6.d, z7.d\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        "trn2 z3.d, z6.d, z7.d\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "ld1rqh z7.h, p7/z, [a_ptr3]\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "ld1rqh z8.h, p7/z, [a_ptr4]\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "ld1rqh z9.h, p7/z, [a_ptr5]\n"
-                        ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
-                        ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
-                        ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
-                        ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
-                        "ld1rqh z10.h, p7/z, [a_ptr6]\n"
-                        ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z11.h, #0\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
-                        ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
-                        ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
-                        ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
-                        "trn1 z2.d, z8.d, z9.d\n"
-                        ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z3.d, z10.d, z11.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "trn2 z11.d, z10.d, z11.d\n"
-                        "trn2 z10.d, z8.d, z9.d\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "ld1rqh z6.h, p6/z, [a_ptr6, #0x10]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1rqh z4.h, p6/z, [a_ptr4, #0x10]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "ld1rqh z5.h, p6/z, [a_ptr5, #0x10]\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        "addvl a_ptr4, a_ptr4, #2\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        "addvl a_ptr5, a_ptr5, #2\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        "addvl a_ptr6, a_ptr6, #2\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
-                        ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
-                        ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
-                        ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
-                        "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
-                        ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "mov z7.h, #0\n"
-                        "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #-4\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "addvl a_ptr3, a_ptr3, #2\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
-                        ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
-                        ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
-                        ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
-                        ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
-                        ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
-                        ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
-                        "trn1 z11.d, z6.d, z7.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "trn2 z3.d, z6.d, z7.d\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
-                        ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
-                        ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
-                        ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
-                        ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
-                        ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
-                        ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
-                        ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
-                        ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
-                        ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
-                        ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
-                        ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
-                        ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
-                        ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
-                        ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
-                        "trn2 z3.d, z6.d, z7.d\n"
-                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1rqh z6.h, p6/z, [a_ptr2]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "ld1rqh z7.h, p6/z, [a_ptr3]\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "ld1rqh z8.h, p6/z, [a_ptr4]\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        "addvl a_ptr3, a_ptr3, #1\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "ld1rqh z9.h, p6/z, [a_ptr5]\n"
-                        ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
-                        "addvl a_ptr4, a_ptr4, #1\n"
-                        ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
-                        "addvl a_ptr5, a_ptr5, #1\n"
-                        ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
-                        ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
-                        "ld1rqh z10.h, p6/z, [a_ptr6]\n"
-                        ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z11.h, #0\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "addvl a_ptr6, a_ptr6, #1\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
-                        ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
-                        ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
-                        ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
-                        "trn1 z2.d, z8.d, z9.d\n"
-                        ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
-                        ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
-                        ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
-                        ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
-                        "trn1 z3.d, z10.d, z11.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z11.d, z10.d, z11.d\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z10.d, z8.d, z9.d\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
-                        ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
-                        ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
-                        ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
-                        ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
-                        ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
-                        ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
-                        ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
-                        ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
-                        ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
-                        ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
-                        ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
-                        ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
-                        ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
-                        ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "uzp1 z0.s, z16.s, z17.s\n"
-                        "uzp2 z1.s, z16.s, z17.s\n"
-                        "uzp1 z2.s, z18.s, z19.s\n"
-                        "uzp2 z3.s, z18.s, z19.s\n"
-                        "st1w z0.s, p0, [%[c_ptr0]]\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "st1w z1.s, p0, [c_ptr1]\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "fmax z24.s, p7/m, z24.s, z14.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #2\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "uzp1 z4.s, z20.s, z21.s\n"
-                        "uzp2 z5.s, z20.s, z21.s\n"
-                        "fmin z24.s, p7/m, z24.s, z15.s\n"
-                        "uzp1 z6.s, z22.s, z23.s\n"
-                        "st1w z4.s, p0, [c_ptr2]\n"
-                        "uzp2 z7.s, z22.s, z23.s\n"
-                        "fmax z25.s, p7/m, z25.s, z14.s\n"
-                        "fmax z26.s, p7/m, z26.s, z14.s\n"
-                        "st1w z5.s, p0, [c_ptr3]\n"
-                        "fmax z27.s, p7/m, z27.s, z14.s\n"
-                        "fmax z28.s, p7/m, z28.s, z14.s\n"
-                        "fmin z25.s, p7/m, z25.s, z15.s\n"
-                        "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "fmin z26.s, p7/m, z26.s, z15.s\n"
-                        "fmin z27.s, p7/m, z27.s, z15.s\n"
-                        "fmin z28.s, p7/m, z28.s, z15.s\n"
-                        "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
-                        "uzp1 z8.s, z24.s, z25.s\n"
-                        "uzp2 z9.s, z24.s, z25.s\n"
-                        "uzp1 z10.s, z26.s, z27.s\n"
-                        "uzp2 z11.s, z26.s, z27.s\n"
-                        "st1w z8.s, p0, [c_ptr4]\n"
-                        "fmax z29.s, p7/m, z29.s, z14.s\n"
-                        "fmax z30.s, p7/m, z30.s, z14.s\n"
-                        "fmax z31.s, p7/m, z31.s, z14.s\n"
-                        "st1w z9.s, p0, [c_ptr5]\n"
-                        "fmin z29.s, p7/m, z29.s, z15.s\n"
-                        "fmin z30.s, p7/m, z30.s, z15.s\n"
-                        "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n"
-                        "fmin z31.s, p7/m, z31.s, z15.s\n"
-                        "uzp1 z12.s, z28.s, z29.s\n"
-                        "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n"
-                        "uzp1 z13.s, z30.s, z31.s\n"
-                        "st1w z12.s, p0, [c_ptr6]\n"
-                        "st1w z13.s, p1, [c_ptr6, #1, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq a_ptr4\n"
-                        ".unreq a_ptr5\n"
-                        ".unreq a_ptr6\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        ".unreq c_ptr4\n"
-                        ".unreq c_ptr5\n"
-                        ".unreq c_ptr6\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "cc", "memory"
-                    );
-                    break;
-                default:
-                case 8:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "a_ptr4 .req X3\n"
-                        "a_ptr5 .req X4\n"
-                        "a_ptr6 .req X5\n"
-                        "a_ptr7 .req X6\n"
-                        "c_ptr1 .req X7\n"
-                        "c_ptr2 .req X8\n"
-                        "c_ptr3 .req X9\n"
-                        "c_ptr4 .req X10\n"
-                        "c_ptr5 .req X11\n"
-                        "c_ptr6 .req X12\n"
-                        "c_ptr7 .req X13\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "add a_ptr4, a_ptr3, %[lda]\n"
-                        "add c_ptr4, c_ptr3, %[ldc]\n"
-                        "add a_ptr5, a_ptr4, %[lda]\n"
-                        "add c_ptr5, c_ptr4, %[ldc]\n"
-                        "add a_ptr6, a_ptr5, %[lda]\n"
-                        "add c_ptr6, c_ptr5, %[ldc]\n"
-                        "add a_ptr7, a_ptr6, %[lda]\n"
-                        "add c_ptr7, c_ptr6, %[ldc]\n"
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z15.s, z15.s\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "zip2 z17.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3]\n"
-                        "ld1rqh z4.h, p7/z, [a_ptr4]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "zip1 z18.s, z15.s, z15.s\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr5]\n"
-                        "zip2 z19.s, z15.s, z15.s\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr6]\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "ld1rqh z7.h, p7/z, [a_ptr7]\n"
-                        "mov z20.d, z16.d\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "mov z21.d, z17.d\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "trn1 z11.d, z6.d, z7.d\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z22.d, z18.d\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "mov z23.d, z19.d\n"
-                        "add a_ptr4, a_ptr4, #0x10\n"
-                        "mov z24.d, z16.d\n"
-                        "add a_ptr5, a_ptr5, #0x10\n"
-                        "mov z25.d, z17.d\n"
-                        "add a_ptr6, a_ptr6, #0x10\n"
-                        "mov z26.d, z18.d\n"
-                        "add a_ptr7, a_ptr7, #0x10\n"
-                        "mov z27.d, z19.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "mov z28.d, z16.d\n"
-                        "mov z29.d, z17.d\n"
-                        "mov z30.d, z18.d\n"
-                        "mov z31.d, z19.d\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z14.s, p0/z, [c_ptr1]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z13.s, z14.s\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "zip2 z17.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ld1rqh z4.h, p7/z, [a_ptr4]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "zip1 z18.s, z13.s, z14.s\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr5]\n"
-                        "zip2 z19.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p0/z, [c_ptr2]\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "ld1w z14.s, p0/z, [c_ptr3]\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr6]\n"
-                        "add a_ptr4, a_ptr4, #0x10\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        "ld1rqh z7.h, p7/z, [a_ptr7]\n"
-                        "zip1 z20.s, z13.s, z14.s\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "zip2 z21.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
-                        "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
-                        "add a_ptr5, a_ptr5, #0x10\n"
-                        "trn1 z11.d, z6.d, z7.d\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add a_ptr6, a_ptr6, #0x10\n"
-                        "zip1 z22.s, z13.s, z14.s\n"
-                        "add a_ptr7, a_ptr7, #0x10\n"
-                        "zip2 z23.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p0/z, [c_ptr4]\n"
-                        "ld1w z14.s, p0/z, [c_ptr5]\n"
-                        "zip1 z24.s, z13.s, z14.s\n"
-                        "zip2 z25.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
-                        "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n"
-                        "zip1 z26.s, z13.s, z14.s\n"
-                        "zip2 z27.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p0/z, [c_ptr6]\n"
-                        "ld1w z14.s, p0/z, [c_ptr7]\n"
-                        "zip1 z28.s, z13.s, z14.s\n"
-                        "zip2 z29.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [c_ptr6, #1, MUL VL]\n"
-                        "ld1w z14.s, p1/z, [c_ptr7, #1, MUL VL]\n"
-                        "zip1 z30.s, z13.s, z14.s\n"
-                        "zip2 z31.s, z13.s, z14.s\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        "trn2 z3.d, z6.d, z7.d\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "ld1rqh z7.h, p7/z, [a_ptr3]\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "ld1rqh z8.h, p7/z, [a_ptr4]\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "ld1rqh z9.h, p7/z, [a_ptr5]\n"
-                        ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
-                        "add a_ptr4, a_ptr4, #0x20\n"
-                        ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
-                        "add a_ptr5, a_ptr5, #0x20\n"
-                        ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
-                        ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
-                        "ld1rqh z10.h, p7/z, [a_ptr6]\n"
-                        ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "ld1rqh z11.h, p7/z, [a_ptr7]\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        "add a_ptr6, a_ptr6, #0x20\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        "add a_ptr7, a_ptr7, #0x20\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
-                        ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
-                        ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
-                        ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
-                        "trn1 z2.d, z8.d, z9.d\n"
-                        ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z3.d, z10.d, z11.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "trn2 z11.d, z10.d, z11.d\n"
-                        "trn2 z10.d, z8.d, z9.d\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr6, #-0x10]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1rqh z4.h, p7/z, [a_ptr4, #-0x10]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr5, #-0x10]\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        "ld1rqh z7.h, p7/z, [a_ptr7, #-0x10]\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
-                        ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
-                        ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
-                        ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
-                        ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
-                        ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
-                        ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
-                        ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
-                        ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "trn1 z11.d, z6.d, z7.d\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        "trn2 z3.d, z6.d, z7.d\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "ld1rqh z7.h, p7/z, [a_ptr3]\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "ld1rqh z8.h, p7/z, [a_ptr4]\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "ld1rqh z9.h, p7/z, [a_ptr5]\n"
-                        ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
-                        ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
-                        ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
-                        ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
-                        "ld1rqh z10.h, p7/z, [a_ptr6]\n"
-                        ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "ld1rqh z11.h, p7/z, [a_ptr7]\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
-                        ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
-                        ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
-                        ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
-                        "trn1 z2.d, z8.d, z9.d\n"
-                        ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z3.d, z10.d, z11.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "trn2 z11.d, z10.d, z11.d\n"
-                        "trn2 z10.d, z8.d, z9.d\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "ld1rqh z6.h, p6/z, [a_ptr6, #0x10]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1rqh z4.h, p6/z, [a_ptr4, #0x10]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "ld1rqh z5.h, p6/z, [a_ptr5, #0x10]\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        "ld1rqh z7.h, p6/z, [a_ptr7, #0x10]\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        "addvl a_ptr4, a_ptr4, #2\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        "addvl a_ptr5, a_ptr5, #2\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        "addvl a_ptr6, a_ptr6, #2\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
-                        ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
-                        "addvl a_ptr7, a_ptr7, #2\n"
-                        ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
-                        ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
-                        "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
-                        ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #-4\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "addvl a_ptr3, a_ptr3, #2\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
-                        ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
-                        ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
-                        ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
-                        ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
-                        ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
-                        ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
-                        "trn1 z11.d, z6.d, z7.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "trn2 z3.d, z6.d, z7.d\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
-                        ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
-                        ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
-                        ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
-                        ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
-                        ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
-                        ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
-                        ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
-                        ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
-                        ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
-                        ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
-                        ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
-                        ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
-                        ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
-                        ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
-                        "trn2 z3.d, z6.d, z7.d\n"
-                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        "ld1rqh z6.h, p6/z, [a_ptr2]\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        "ld1rqh z7.h, p6/z, [a_ptr3]\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        "ld1rqh z8.h, p6/z, [a_ptr4]\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        "addvl a_ptr3, a_ptr3, #1\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        "ld1rqh z9.h, p6/z, [a_ptr5]\n"
-                        ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
-                        "addvl a_ptr4, a_ptr4, #1\n"
-                        ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
-                        "addvl a_ptr5, a_ptr5, #1\n"
-                        ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
-                        ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
-                        "ld1rqh z10.h, p6/z, [a_ptr6]\n"
-                        ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "ld1rqh z11.h, p6/z, [a_ptr7]\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        "addvl a_ptr6, a_ptr6, #1\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        "addvl a_ptr7, a_ptr7, #1\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
-                        ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
-                        ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
-                        ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
-                        "trn1 z2.d, z8.d, z9.d\n"
-                        ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
-                        ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
-                        ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
-                        ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
-                        "trn1 z3.d, z10.d, z11.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z11.d, z10.d, z11.d\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z10.d, z8.d, z9.d\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
-                        ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
-                        ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
-                        ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
-                        ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
-                        ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
-                        ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
-                        ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
-                        ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
-                        ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
-                        ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
-                        ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
-                        ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
-                        ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
-                        ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
-                        ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
-                        ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
-                        ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
-                        ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
-                        ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
-                        ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
-                        ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
-                        ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
-                        ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
-                        ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
-                        ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
-                        ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
-                        ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
-                        ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "uzp1 z0.s, z16.s, z17.s\n"
-                        "uzp2 z1.s, z16.s, z17.s\n"
-                        "uzp1 z2.s, z18.s, z19.s\n"
-                        "uzp2 z3.s, z18.s, z19.s\n"
-                        "st1w z0.s, p0, [%[c_ptr0]]\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "st1w z1.s, p0, [c_ptr1]\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "fmax z24.s, p7/m, z24.s, z14.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #2\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "uzp1 z4.s, z20.s, z21.s\n"
-                        "uzp2 z5.s, z20.s, z21.s\n"
-                        "fmin z24.s, p7/m, z24.s, z15.s\n"
-                        "uzp1 z6.s, z22.s, z23.s\n"
-                        "st1w z4.s, p0, [c_ptr2]\n"
-                        "uzp2 z7.s, z22.s, z23.s\n"
-                        "fmax z25.s, p7/m, z25.s, z14.s\n"
-                        "fmax z26.s, p7/m, z26.s, z14.s\n"
-                        "st1w z5.s, p0, [c_ptr3]\n"
-                        "fmax z27.s, p7/m, z27.s, z14.s\n"
-                        "fmax z28.s, p7/m, z28.s, z14.s\n"
-                        "fmin z25.s, p7/m, z25.s, z15.s\n"
-                        "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "fmin z26.s, p7/m, z26.s, z15.s\n"
-                        "fmin z27.s, p7/m, z27.s, z15.s\n"
-                        "fmin z28.s, p7/m, z28.s, z15.s\n"
-                        "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
-                        "uzp1 z8.s, z24.s, z25.s\n"
-                        "uzp2 z9.s, z24.s, z25.s\n"
-                        "uzp1 z10.s, z26.s, z27.s\n"
-                        "uzp2 z11.s, z26.s, z27.s\n"
-                        "st1w z8.s, p0, [c_ptr4]\n"
-                        "fmax z29.s, p7/m, z29.s, z14.s\n"
-                        "fmax z30.s, p7/m, z30.s, z14.s\n"
-                        "fmax z31.s, p7/m, z31.s, z14.s\n"
-                        "st1w z9.s, p0, [c_ptr5]\n"
-                        "fmin z29.s, p7/m, z29.s, z15.s\n"
-                        "fmin z30.s, p7/m, z30.s, z15.s\n"
-                        "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n"
-                        "fmin z31.s, p7/m, z31.s, z15.s\n"
-                        "uzp1 z12.s, z28.s, z29.s\n"
-                        "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n"
-                        "uzp2 z13.s, z28.s, z29.s\n"
-                        "uzp1 z14.s, z30.s, z31.s\n"
-                        "uzp2 z15.s, z30.s, z31.s\n"
-                        "st1w z12.s, p0, [c_ptr6]\n"
-                        "st1w z13.s, p0, [c_ptr7]\n"
-                        "st1w z14.s, p1, [c_ptr6, #1, MUL VL]\n"
-                        "st1w z15.s, p1, [c_ptr7, #1, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq a_ptr4\n"
-                        ".unreq a_ptr5\n"
-                        ".unreq a_ptr6\n"
-                        ".unreq a_ptr7\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        ".unreq c_ptr4\n"
-                        ".unreq c_ptr5\n"
-                        ".unreq c_ptr6\n"
-                        ".unreq c_ptr7\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc", "memory"
-                    );
-                    break;
-            }
-
-        }
-    }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp
deleted file mode 100644
index bd457e9d27..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-#include "../bfloat.hpp"
-#include "../std_transforms_sve.hpp"
-
-namespace arm_gemm
-{
-
-// Actual kernel implementations
-void sve_hybrid_bf16fp32_mmla_6VLx2(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool);
-
-class hybrid_bf16fp32_mmla_6VLx2
-{
-public:
-    typedef bfloat16 operand_type;
-    typedef float result_type;
-
-    typedef void (*kern_type)(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool);
-
-    /* Kernel blocking parameters */
-    static constexpr unsigned int out_height()
-    {
-        return 4;
-    }
-
-    static unsigned int out_width()
-    {
-        return get_vector_length<float>() * 3;
-    }
-
-    static constexpr unsigned int k_unroll()
-    {
-        return 4;
-    }
-
-    static constexpr bool supports_accumulate()
-    {
-        return true;
-    }
-
-    static constexpr bool supports_bias()
-    {
-        return true;
-    }
-
-    static constexpr bool supports_activation()
-    {
-        return true;
-    }
-
-    StdTransformsSVE<operand_type, result_type, 2, 6, 4> transforms = {};
-
-    // Default to the generic kernel
-    kern_type kernel=sve_hybrid_bf16fp32_mmla_6VLx2;
-
-    hybrid_bf16fp32_mmla_6VLx2(const CPUInfo *)
-    {
-
-    }
-};
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp
deleted file mode 100644
index 59dc6dc540..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp
+++ /dev/null
@@ -1,1633 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include "../../bfloat.hpp"
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_bf16fp32_mmla_6VLx2(const bfloat16 *A, int lda, const bfloat16 *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
-    const int K_stride = ((K + 3) / 4) * 4;
-    const long loops_count = ((K + 8) / 16) - 1;
-    K -= loops_count * 16;
-    const long regs_count = (K / 8) - 1;
-    K -= (regs_count + 1) * 8;
-    const long leftovers = K;
-    const long blocks_count = (K + 3) / 4;
-    float nullbias[192];
-    if (!accumulate && !bias) {
-        memset(nullbias, 0, (3 * get_vector_length<float>() * sizeof(float)));
-    }
-    float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
-    float maxval =   static_cast<float>(std::numeric_limits<float>::infinity());
-    const float * const minptr = &minval;
-    const float * const maxptr = &maxval;
-
-    switch(act.type)
-    {
-        default:
-        case Activation::Type::None:
-            break;
-        case Activation::Type::BoundedReLU:
-            maxval = static_cast<float>(act.param1);
-            /* fall through */
-        case Activation::Type::ReLU:
-            minval = 0.0f;
-            break;
-    }
-
-    int rows_to_compute;
-
-    for (int y=0; y<M; y+=rows_to_compute) {
-        const bfloat16 * const a_ptr0_base = A + (y * lda);
-        const unsigned long ldab = lda * sizeof(bfloat16);
-
-        float *c_ptr0 = C + (y * ldc);
-
-        rows_to_compute = M-y;
-        if (rows_to_compute > 4) {
-            if (rows_to_compute % 4) {
-                rows_to_compute = 4 - 1;
-            } else {
-                rows_to_compute = 4;
-            }
-        }
-
-        for (int x0=0; x0<N; x0+=(3 * get_vector_length<float>())) {
-            const long width = std::min((unsigned long)N-x0, (3 * get_vector_length<float>()));
-            long loops = loops_count;
-            long regs = regs_count;
-            long temp = 0;
-            long blocks = blocks_count;
-            const bfloat16 *a_ptr0 = a_ptr0_base;
-            const bfloat16 *b_ptr0 = B + (K_stride * x0);
-            const unsigned long ldcb = ldc * sizeof(float);
-            const float *biasptr = bias ? bias+x0 : nullbias;
-
-            switch(rows_to_compute) {
-                case 1:
-                    __asm __volatile (
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "mov z1.h, #0\n"
-                        "ld1w z19.s, p0/z, [%[biasptr]]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "zip1 z20.s, z19.s, z19.s\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip2 z21.s, z19.s, z19.s\n"
-                        "ld1w z19.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "zip1 z22.s, z19.s, z19.s\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "zip2 z23.s, z19.s, z19.s\n"
-                        "ld1w z19.s, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #6\n"
-                        "zip1 z24.s, z19.s, z19.s\n"
-                        "zip2 z25.s, z19.s, z19.s\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "mov z18.s, #0\n"
-                        "ld1w z17.s, p0/z, [%[c_ptr0]]\n"
-                        "mov z1.h, #0\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "zip1 z20.s, z17.s, z18.s\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip2 z21.s, z17.s, z18.s\n"
-                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "mov z18.s, #0\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #6\n"
-                        "zip1 z22.s, z17.s, z18.s\n"
-                        "zip2 z23.s, z17.s, z18.s\n"
-                        "ld1w z17.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                        "mov z18.s, #0\n"
-                        "zip1 z24.s, z17.s, z18.s\n"
-                        "zip2 z25.s, z17.s, z18.s\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "mov z3.h, #0\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn2 z4.d, z2.d, z3.d\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
-                        ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
-                        ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "trn1 z0.d, z2.d, z3.d\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "mov z1.h, #0\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
-                        ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
-                        ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "mov z3.h, #0\n"
-                        ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #6\n"
-                        "trn2 z4.d, z2.d, z3.d\n"
-                        ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
-                        ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "trn1 z0.d, z2.d, z3.d\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "mov z1.h, #0\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
-                        ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
-                        ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #-4\n"
-                        ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
-                        ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
-                        ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
-                        ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
-                        ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
-                        "b.eq 5f\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #12\n"
-                        ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
-                        ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
-                        ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
-                        ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
-                        ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n"
-                        ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "mov z3.h, #0\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #6\n"
-                        ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
-                        ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
-                        ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
-                        ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
-                        ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
-                        "trn1 z0.d, z2.d, z3.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z4.d, z2.d, z3.d\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
-                        ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
-                        ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
-                        ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
-                        "b.eq 5f\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
-                        ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
-                        ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
-                        ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
-                        ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
-                        "5:\n"
-                        "ld1rw z18.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z19.s, p7/z, [%[maxptr]]\n"
-                        "fmax z20.s, p7/m, z20.s, z18.s\n"
-                        "fmax z21.s, p7/m, z21.s, z18.s\n"
-                        "fmax z22.s, p7/m, z22.s, z18.s\n"
-                        "fmax z23.s, p7/m, z23.s, z18.s\n"
-                        "fmin z20.s, p7/m, z20.s, z19.s\n"
-                        "fmin z21.s, p7/m, z21.s, z19.s\n"
-                        "fmin z22.s, p7/m, z22.s, z19.s\n"
-                        "fmin z23.s, p7/m, z23.s, z19.s\n"
-                        "fmax z24.s, p7/m, z24.s, z18.s\n"
-                        "uzp1 z0.s, z20.s, z21.s\n"
-                        "fmax z25.s, p7/m, z25.s, z18.s\n"
-                        "uzp1 z1.s, z22.s, z23.s\n"
-                        "fmin z24.s, p7/m, z24.s, z19.s\n"
-                        "st1w z0.s, p0, [%[c_ptr0]]\n"
-                        "fmin z25.s, p7/m, z25.s, z19.s\n"
-                        "st1w z1.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "uzp1 z2.s, z24.s, z25.s\n"
-                        "st1w z2.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
-                    );
-                    break;
-                case 2:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ld1w z19.s, p0/z, [%[biasptr]]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z20.s, z19.s, z19.s\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "zip2 z21.s, z19.s, z19.s\n"
-                        "ld1w z19.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "zip1 z22.s, z19.s, z19.s\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "zip2 z23.s, z19.s, z19.s\n"
-                        "ld1w z19.s, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #6\n"
-                        "zip1 z24.s, z19.s, z19.s\n"
-                        "zip2 z25.s, z19.s, z19.s\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1w z17.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z18.s, p0/z, [c_ptr1]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z20.s, z17.s, z18.s\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "zip2 z21.s, z17.s, z18.s\n"
-                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        "ld1w z18.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "zip1 z22.s, z17.s, z18.s\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "zip2 z23.s, z17.s, z18.s\n"
-                        "ld1w z17.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                        "ld1w z18.s, p2/z, [c_ptr1, #2, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #6\n"
-                        "zip1 z24.s, z17.s, z18.s\n"
-                        "zip2 z25.s, z17.s, z18.s\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "trn2 z4.d, z2.d, z3.d\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
-                        "trn1 z0.d, z2.d, z3.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
-                        ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
-                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
-                        "trn2 z4.d, z2.d, z3.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #6\n"
-                        ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "trn1 z0.d, z2.d, z3.d\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
-                        ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
-                        ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #-4\n"
-                        ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
-                        ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
-                        ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
-                        ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
-                        ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
-                        "b.eq 5f\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #12\n"
-                        ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
-                        ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
-                        ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
-                        ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
-                        ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n"
-                        ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
-                        "ld1rqh z3.h, p6/z, [a_ptr1]\n"
-                        ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #6\n"
-                        ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
-                        ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
-                        "trn1 z0.d, z2.d, z3.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z4.d, z2.d, z3.d\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
-                        ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
-                        ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
-                        ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
-                        "b.eq 5f\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
-                        ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
-                        ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
-                        ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
-                        ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
-                        "5:\n"
-                        "ld1rw z18.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z19.s, p7/z, [%[maxptr]]\n"
-                        "fmax z20.s, p7/m, z20.s, z18.s\n"
-                        "fmax z21.s, p7/m, z21.s, z18.s\n"
-                        "fmax z22.s, p7/m, z22.s, z18.s\n"
-                        "fmax z23.s, p7/m, z23.s, z18.s\n"
-                        "fmin z20.s, p7/m, z20.s, z19.s\n"
-                        "fmin z21.s, p7/m, z21.s, z19.s\n"
-                        "fmin z22.s, p7/m, z22.s, z19.s\n"
-                        "fmin z23.s, p7/m, z23.s, z19.s\n"
-                        "fmax z24.s, p7/m, z24.s, z18.s\n"
-                        "uzp1 z0.s, z20.s, z21.s\n"
-                        "uzp2 z1.s, z20.s, z21.s\n"
-                        "uzp1 z2.s, z22.s, z23.s\n"
-                        "uzp2 z3.s, z22.s, z23.s\n"
-                        "st1w z0.s, p0, [%[c_ptr0]]\n"
-                        "fmin z24.s, p7/m, z24.s, z19.s\n"
-                        "fmax z25.s, p7/m, z25.s, z18.s\n"
-                        "st1w z1.s, p0, [c_ptr1]\n"
-                        "fmin z25.s, p7/m, z25.s, z19.s\n"
-                        "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "uzp1 z4.s, z24.s, z25.s\n"
-                        "uzp2 z5.s, z24.s, z25.s\n"
-                        "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #3\n"
-                        "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
-                    );
-                    break;
-                case 3:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "mov z3.h, #0\n"
-                        "ld1w z19.s, p0/z, [%[biasptr]]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z20.s, z19.s, z19.s\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "zip2 z21.s, z19.s, z19.s\n"
-                        "ld1w z19.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "zip1 z22.s, z19.s, z19.s\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "zip2 z23.s, z19.s, z19.s\n"
-                        "ld1w z19.s, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "trn1 z5.d, z2.d, z3.d\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z26.d, z20.d\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "mov z27.d, z21.d\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "zip1 z24.s, z19.s, z19.s\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #6\n"
-                        "zip2 z25.s, z19.s, z19.s\n"
-                        "mov z28.d, z22.d\n"
-                        "mov z29.d, z23.d\n"
-                        "mov z30.d, z24.d\n"
-                        "mov z31.d, z25.d\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "mov z3.h, #0\n"
-                        "ld1w z17.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z18.s, p0/z, [c_ptr1]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z20.s, z17.s, z18.s\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "zip2 z21.s, z17.s, z18.s\n"
-                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        "ld1w z18.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "trn1 z5.d, z2.d, z3.d\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z22.s, z17.s, z18.s\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "zip2 z23.s, z17.s, z18.s\n"
-                        "ld1w z17.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                        "ld1w z18.s, p2/z, [c_ptr1, #2, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #6\n"
-                        "zip1 z24.s, z17.s, z18.s\n"
-                        "zip2 z25.s, z17.s, z18.s\n"
-                        "ld1w z17.s, p0/z, [c_ptr2]\n"
-                        "mov z18.s, #0\n"
-                        "zip1 z26.s, z17.s, z18.s\n"
-                        "zip2 z27.s, z17.s, z18.s\n"
-                        "ld1w z17.s, p1/z, [c_ptr2, #1, MUL VL]\n"
-                        "mov z18.s, #0\n"
-                        "zip1 z28.s, z17.s, z18.s\n"
-                        "zip2 z29.s, z17.s, z18.s\n"
-                        "ld1w z17.s, p2/z, [c_ptr2, #2, MUL VL]\n"
-                        "mov z18.s, #0\n"
-                        "zip1 z30.s, z17.s, z18.s\n"
-                        "zip2 z31.s, z17.s, z18.s\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
-                        ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
-                        ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
-                        "ld1rqh z4.h, p7/z, [a_ptr2]\n"
-                        ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "mov z5.h, #0\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
-                        ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
-                        ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
-                        ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
-                        ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
-                        ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
-                        "trn1 z0.d, z2.d, z3.d\n"
-                        ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
-                        ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
-                        ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "trn1 z1.d, z4.d, z5.d\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "trn2 z5.d, z4.d, z5.d\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "trn2 z4.d, z2.d, z3.d\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
-                        ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
-                        ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
-                        ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
-                        ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
-                        ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "mov z3.h, #0\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
-                        ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
-                        ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
-                        ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
-                        ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
-                        ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "trn1 z5.d, z2.d, z3.d\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
-                        ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
-                        ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
-                        ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
-                        ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
-                        "ld1rqh z4.h, p7/z, [a_ptr2]\n"
-                        ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "mov z5.h, #0\n"
-                        ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
-                        ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
-                        ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
-                        ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
-                        ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
-                        ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
-                        "trn1 z0.d, z2.d, z3.d\n"
-                        ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #6\n"
-                        ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
-                        ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
-                        ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "trn1 z1.d, z4.d, z5.d\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "trn2 z5.d, z4.d, z5.d\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "trn2 z4.d, z2.d, z3.d\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
-                        "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
-                        ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
-                        ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
-                        ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
-                        ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "mov z3.h, #0\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
-                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
-                        ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #-4\n"
-                        ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
-                        ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
-                        ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
-                        ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
-                        ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
-                        ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
-                        ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
-                        "trn1 z5.d, z2.d, z3.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
-                        ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
-                        ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
-                        ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
-                        ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
-                        ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
-                        ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
-                        ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
-                        ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
-                        "b.eq 5f\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #12\n"
-                        ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
-                        ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
-                        ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
-                        ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
-                        ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
-                        ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
-                        ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
-                        ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
-                        ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n"
-                        ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
-                        "ld1rqh z3.h, p6/z, [a_ptr1]\n"
-                        ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
-                        ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
-                        ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
-                        "ld1rqh z4.h, p6/z, [a_ptr2]\n"
-                        ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "mov z5.h, #0\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #6\n"
-                        ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
-                        ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
-                        ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
-                        ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
-                        ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
-                        "trn1 z0.d, z2.d, z3.d\n"
-                        ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
-                        ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
-                        ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
-                        ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
-                        ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
-                        ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
-                        "trn1 z1.d, z4.d, z5.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z5.d, z4.d, z5.d\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z4.d, z2.d, z3.d\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
-                        ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
-                        ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
-                        ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
-                        ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
-                        ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
-                        ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
-                        ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
-                        ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
-                        "b.eq 5f\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
-                        ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
-                        ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
-                        ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
-                        ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
-                        ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
-                        ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
-                        ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
-                        ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
-                        "5:\n"
-                        "ld1rw z18.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z19.s, p7/z, [%[maxptr]]\n"
-                        "fmax z20.s, p7/m, z20.s, z18.s\n"
-                        "fmax z21.s, p7/m, z21.s, z18.s\n"
-                        "fmax z22.s, p7/m, z22.s, z18.s\n"
-                        "fmax z23.s, p7/m, z23.s, z18.s\n"
-                        "fmin z20.s, p7/m, z20.s, z19.s\n"
-                        "fmin z21.s, p7/m, z21.s, z19.s\n"
-                        "fmin z22.s, p7/m, z22.s, z19.s\n"
-                        "fmin z23.s, p7/m, z23.s, z19.s\n"
-                        "fmax z24.s, p7/m, z24.s, z18.s\n"
-                        "uzp1 z0.s, z20.s, z21.s\n"
-                        "uzp2 z1.s, z20.s, z21.s\n"
-                        "uzp1 z2.s, z22.s, z23.s\n"
-                        "uzp2 z3.s, z22.s, z23.s\n"
-                        "st1w z0.s, p0, [%[c_ptr0]]\n"
-                        "fmin z24.s, p7/m, z24.s, z19.s\n"
-                        "fmax z25.s, p7/m, z25.s, z18.s\n"
-                        "fmax z26.s, p7/m, z26.s, z18.s\n"
-                        "st1w z1.s, p0, [c_ptr1]\n"
-                        "fmax z27.s, p7/m, z27.s, z18.s\n"
-                        "fmax z28.s, p7/m, z28.s, z18.s\n"
-                        "fmin z25.s, p7/m, z25.s, z19.s\n"
-                        "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmin z26.s, p7/m, z26.s, z19.s\n"
-                        "fmin z27.s, p7/m, z27.s, z19.s\n"
-                        "fmin z28.s, p7/m, z28.s, z19.s\n"
-                        "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "uzp1 z4.s, z24.s, z25.s\n"
-                        "uzp2 z5.s, z24.s, z25.s\n"
-                        "uzp1 z6.s, z26.s, z27.s\n"
-                        "fmax z29.s, p7/m, z29.s, z18.s\n"
-                        "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "fmax z30.s, p7/m, z30.s, z18.s\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #3\n"
-                        "fmax z31.s, p7/m, z31.s, z18.s\n"
-                        "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "fmin z29.s, p7/m, z29.s, z19.s\n"
-                        "fmin z30.s, p7/m, z30.s, z19.s\n"
-                        "fmin z31.s, p7/m, z31.s, z19.s\n"
-                        "st1w z6.s, p0, [c_ptr2]\n"
-                        "uzp1 z7.s, z28.s, z29.s\n"
-                        "uzp1 z8.s, z30.s, z31.s\n"
-                        "st1w z7.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "st1w z8.s, p2, [c_ptr2, #2, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
-                    );
-                    break;
-                default:
-                case 4:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ld1w z19.s, p0/z, [%[biasptr]]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z20.s, z19.s, z19.s\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "zip2 z21.s, z19.s, z19.s\n"
-                        "ld1w z19.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3]\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "zip1 z22.s, z19.s, z19.s\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip2 z23.s, z19.s, z19.s\n"
-                        "ld1w z19.s, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "trn1 z5.d, z2.d, z3.d\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "mov z26.d, z20.d\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z27.d, z21.d\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "zip1 z24.s, z19.s, z19.s\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "zip2 z25.s, z19.s, z19.s\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "mov z28.d, z22.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #6\n"
-                        "mov z29.d, z23.d\n"
-                        "mov z30.d, z24.d\n"
-                        "mov z31.d, z25.d\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1w z17.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z18.s, p0/z, [c_ptr1]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z20.s, z17.s, z18.s\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "zip2 z21.s, z17.s, z18.s\n"
-                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        "ld1w z18.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "zip1 z22.s, z17.s, z18.s\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip2 z23.s, z17.s, z18.s\n"
-                        "ld1w z17.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                        "trn1 z5.d, z2.d, z3.d\n"
-                        "ld1w z18.s, p2/z, [c_ptr1, #2, MUL VL]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "zip1 z24.s, z17.s, z18.s\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "zip2 z25.s, z17.s, z18.s\n"
-                        "ld1w z17.s, p0/z, [c_ptr2]\n"
-                        "ld1w z18.s, p0/z, [c_ptr3]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #6\n"
-                        "zip1 z26.s, z17.s, z18.s\n"
-                        "zip2 z27.s, z17.s, z18.s\n"
-                        "ld1w z17.s, p1/z, [c_ptr2, #1, MUL VL]\n"
-                        "ld1w z18.s, p1/z, [c_ptr3, #1, MUL VL]\n"
-                        "zip1 z28.s, z17.s, z18.s\n"
-                        "zip2 z29.s, z17.s, z18.s\n"
-                        "ld1w z17.s, p2/z, [c_ptr2, #2, MUL VL]\n"
-                        "ld1w z18.s, p2/z, [c_ptr3, #2, MUL VL]\n"
-                        "zip1 z30.s, z17.s, z18.s\n"
-                        "zip2 z31.s, z17.s, z18.s\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
-                        ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
-                        ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
-                        "ld1rqh z4.h, p7/z, [a_ptr2]\n"
-                        ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr3]\n"
-                        ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
-                        ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
-                        ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
-                        "trn1 z0.d, z2.d, z3.d\n"
-                        ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
-                        ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
-                        ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "trn1 z1.d, z4.d, z5.d\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "trn2 z5.d, z4.d, z5.d\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "trn2 z4.d, z2.d, z3.d\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
-                        ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
-                        ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
-                        ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
-                        ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
-                        ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
-                        ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
-                        ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
-                        ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
-                        ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "trn1 z5.d, z2.d, z3.d\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
-                        ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
-                        ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
-                        ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
-                        ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
-                        "ld1rqh z4.h, p7/z, [a_ptr2]\n"
-                        ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr3]\n"
-                        ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
-                        ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
-                        ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
-                        ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
-                        ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
-                        "trn1 z0.d, z2.d, z3.d\n"
-                        ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #6\n"
-                        ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
-                        ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
-                        ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "trn1 z1.d, z4.d, z5.d\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "trn2 z5.d, z4.d, z5.d\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "trn2 z4.d, z2.d, z3.d\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
-                        "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
-                        ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
-                        "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
-                        ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
-                        "addvl a_ptr3, a_ptr3, #2\n"
-                        ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
-                        ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
-                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
-                        ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #-4\n"
-                        ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
-                        ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
-                        ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
-                        ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
-                        ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
-                        ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
-                        "trn1 z5.d, z2.d, z3.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
-                        ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
-                        ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
-                        ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
-                        ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
-                        ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
-                        ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
-                        ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
-                        ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
-                        "b.eq 5f\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #12\n"
-                        ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
-                        ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
-                        ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
-                        ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
-                        ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
-                        ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
-                        ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
-                        ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
-                        ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n"
-                        ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
-                        "ld1rqh z3.h, p6/z, [a_ptr1]\n"
-                        ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
-                        ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
-                        ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
-                        "ld1rqh z4.h, p6/z, [a_ptr2]\n"
-                        ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
-                        "ld1rqh z5.h, p6/z, [a_ptr3]\n"
-                        ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #6\n"
-                        ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
-                        "addvl a_ptr3, a_ptr3, #1\n"
-                        ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
-                        ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
-                        "trn1 z0.d, z2.d, z3.d\n"
-                        ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
-                        ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
-                        ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
-                        ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
-                        ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
-                        ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
-                        "trn1 z1.d, z4.d, z5.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z5.d, z4.d, z5.d\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z4.d, z2.d, z3.d\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
-                        ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
-                        ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
-                        ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
-                        ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
-                        ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
-                        ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
-                        ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
-                        ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
-                        "b.eq 5f\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
-                        ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
-                        ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
-                        ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
-                        ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
-                        ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
-                        ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
-                        ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
-                        ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
-                        "5:\n"
-                        "ld1rw z18.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z19.s, p7/z, [%[maxptr]]\n"
-                        "fmax z20.s, p7/m, z20.s, z18.s\n"
-                        "fmax z21.s, p7/m, z21.s, z18.s\n"
-                        "fmax z22.s, p7/m, z22.s, z18.s\n"
-                        "fmax z23.s, p7/m, z23.s, z18.s\n"
-                        "fmin z20.s, p7/m, z20.s, z19.s\n"
-                        "fmin z21.s, p7/m, z21.s, z19.s\n"
-                        "fmin z22.s, p7/m, z22.s, z19.s\n"
-                        "fmin z23.s, p7/m, z23.s, z19.s\n"
-                        "fmax z24.s, p7/m, z24.s, z18.s\n"
-                        "uzp1 z0.s, z20.s, z21.s\n"
-                        "uzp2 z1.s, z20.s, z21.s\n"
-                        "uzp1 z2.s, z22.s, z23.s\n"
-                        "uzp2 z3.s, z22.s, z23.s\n"
-                        "st1w z0.s, p0, [%[c_ptr0]]\n"
-                        "fmin z24.s, p7/m, z24.s, z19.s\n"
-                        "fmax z25.s, p7/m, z25.s, z18.s\n"
-                        "fmax z26.s, p7/m, z26.s, z18.s\n"
-                        "st1w z1.s, p0, [c_ptr1]\n"
-                        "fmax z27.s, p7/m, z27.s, z18.s\n"
-                        "fmax z28.s, p7/m, z28.s, z18.s\n"
-                        "fmin z25.s, p7/m, z25.s, z19.s\n"
-                        "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmin z26.s, p7/m, z26.s, z19.s\n"
-                        "fmin z27.s, p7/m, z27.s, z19.s\n"
-                        "fmin z28.s, p7/m, z28.s, z19.s\n"
-                        "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "uzp1 z4.s, z24.s, z25.s\n"
-                        "uzp2 z5.s, z24.s, z25.s\n"
-                        "uzp1 z6.s, z26.s, z27.s\n"
-                        "uzp2 z7.s, z26.s, z27.s\n"
-                        "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "fmax z29.s, p7/m, z29.s, z18.s\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #3\n"
-                        "fmax z30.s, p7/m, z30.s, z18.s\n"
-                        "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "fmax z31.s, p7/m, z31.s, z18.s\n"
-                        "fmin z29.s, p7/m, z29.s, z19.s\n"
-                        "fmin z30.s, p7/m, z30.s, z19.s\n"
-                        "st1w z6.s, p0, [c_ptr2]\n"
-                        "fmin z31.s, p7/m, z31.s, z19.s\n"
-                        "uzp1 z8.s, z28.s, z29.s\n"
-                        "uzp2 z9.s, z28.s, z29.s\n"
-                        "st1w z7.s, p0, [c_ptr3]\n"
-                        "uzp1 z10.s, z30.s, z31.s\n"
-                        "uzp2 z11.s, z30.s, z31.s\n"
-                        "st1w z8.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "st1w z9.s, p1, [c_ptr3, #1, MUL VL]\n"
-                        "st1w z10.s, p2, [c_ptr2, #2, MUL VL]\n"
-                        "st1w z11.s, p2, [c_ptr3, #2, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
-                    );
-                    break;
-            }
-
-        }
-    }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp
deleted file mode 100644
index f25f7473cb..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-#include "../bfloat.hpp"
-#include "../std_transforms_sve.hpp"
-
-namespace arm_gemm
-{
-
-// Actual kernel implementations
-void sve_hybrid_bf16fp32_mmla_8VLx2(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool);
-
-class hybrid_bf16fp32_mmla_8VLx2
-{
-public:
-    typedef bfloat16 operand_type;
-    typedef float result_type;
-
-    typedef void (*kern_type)(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool);
-
-    /* Kernel blocking parameters */
-    static constexpr unsigned int out_height()
-    {
-        return 4;
-    }
-
-    static unsigned int out_width()
-    {
-        return get_vector_length<float>() * 4;
-    }
-
-    static constexpr unsigned int k_unroll()
-    {
-        return 4;
-    }
-
-    static constexpr bool supports_accumulate()
-    {
-        return true;
-    }
-
-    static constexpr bool supports_bias()
-    {
-        return true;
-    }
-
-    static constexpr bool supports_activation()
-    {
-        return true;
-    }
-
-    StdTransformsSVE<operand_type, result_type, 2, 8, 4> transforms = {};
-
-    // Default to the generic kernel
-    kern_type kernel=sve_hybrid_bf16fp32_mmla_8VLx2;
-
-    hybrid_bf16fp32_mmla_8VLx2(const CPUInfo *)
-    {
-
-    }
-};
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp
deleted file mode 100644
index f38a2ea2e3..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp
+++ /dev/null
@@ -1,2001 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include "../../bfloat.hpp"
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_bf16fp32_mmla_8VLx2(const bfloat16 *A, int lda, const bfloat16 *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
-    const int K_stride = ((K + 3) / 4) * 4;
-    const long loops_count = ((K + 8) / 16) - 1;
-    K -= loops_count * 16;
-    const long regs_count = (K / 8) - 1;
-    K -= (regs_count + 1) * 8;
-    const long leftovers = K;
-    const long blocks_count = (K + 3) / 4;
-    float nullbias[256];
-    if (!accumulate && !bias) {
-        memset(nullbias, 0, (4 * get_vector_length<float>() * sizeof(float)));
-    }
-    float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
-    float maxval =   static_cast<float>(std::numeric_limits<float>::infinity());
-    const float * const minptr = &minval;
-    const float * const maxptr = &maxval;
-
-    switch(act.type)
-    {
-        default:
-        case Activation::Type::None:
-            break;
-        case Activation::Type::BoundedReLU:
-            maxval = static_cast<float>(act.param1);
-            /* fall through */
-        case Activation::Type::ReLU:
-            minval = 0.0f;
-            break;
-    }
-
-    int rows_to_compute;
-
-    for (int y=0; y<M; y+=rows_to_compute) {
-        const bfloat16 * const a_ptr0_base = A + (y * lda);
-        const unsigned long ldab = lda * sizeof(bfloat16);
-
-        float *c_ptr0 = C + (y * ldc);
-
-        rows_to_compute = M-y;
-        if (rows_to_compute > 4) {
-            if (rows_to_compute % 4) {
-                rows_to_compute = 4 - 1;
-            } else {
-                rows_to_compute = 4;
-            }
-        }
-
-        for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) {
-            const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>()));
-            long loops = loops_count;
-            long regs = regs_count;
-            long temp = 0;
-            long blocks = blocks_count;
-            const bfloat16 *a_ptr0 = a_ptr0_base;
-            const bfloat16 *b_ptr0 = B + (K_stride * x0);
-            const unsigned long ldcb = ldc * sizeof(float);
-            const float *biasptr = bias ? bias+x0 : nullbias;
-
-            switch(rows_to_compute) {
-                case 1:
-                    __asm __volatile (
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "mov z1.h, #0\n"
-                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "zip1 z16.s, z15.s, z15.s\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip2 z17.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "zip1 z18.s, z15.s, z15.s\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "zip2 z19.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "zip1 z20.s, z15.s, z15.s\n"
-                        "zip2 z21.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "zip1 z22.s, z15.s, z15.s\n"
-                        "zip2 z23.s, z15.s, z15.s\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "mov z14.s, #0\n"
-                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
-                        "mov z1.h, #0\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "zip1 z16.s, z13.s, z14.s\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip2 z17.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "mov z14.s, #0\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "zip1 z18.s, z13.s, z14.s\n"
-                        "zip2 z19.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                        "mov z14.s, #0\n"
-                        "zip1 z20.s, z13.s, z14.s\n"
-                        "zip2 z21.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "mov z14.s, #0\n"
-                        "zip1 z22.s, z13.s, z14.s\n"
-                        "zip2 z23.s, z13.s, z14.s\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "mov z3.h, #0\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "trn2 z4.d, z2.d, z3.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "trn1 z0.d, z2.d, z3.d\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "mov z1.h, #0\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "mov z3.h, #0\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        "trn2 z4.d, z2.d, z3.d\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "trn1 z0.d, z2.d, z3.d\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "mov z1.h, #0\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "mov z3.h, #0\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        "trn1 z0.d, z2.d, z3.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z4.d, z2.d, z3.d\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "uzp1 z0.s, z16.s, z17.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "uzp1 z1.s, z18.s, z19.s\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "st1w z0.s, p0, [%[c_ptr0]]\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "st1w z1.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "uzp1 z2.s, z20.s, z21.s\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "st1w z2.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "uzp1 z3.s, z22.s, z23.s\n"
-                        "st1w z3.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
-                    );
-                    break;
-                case 2:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z15.s, z15.s\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "zip2 z17.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "zip1 z18.s, z15.s, z15.s\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "zip2 z19.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "zip1 z20.s, z15.s, z15.s\n"
-                        "zip2 z21.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "zip1 z22.s, z15.s, z15.s\n"
-                        "zip2 z23.s, z15.s, z15.s\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z14.s, p0/z, [c_ptr1]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z13.s, z14.s\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "zip2 z17.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "zip1 z18.s, z13.s, z14.s\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "zip2 z19.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                        "ld1w z14.s, p2/z, [c_ptr1, #2, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "zip1 z20.s, z13.s, z14.s\n"
-                        "zip2 z21.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1w z14.s, p3/z, [c_ptr1, #3, MUL VL]\n"
-                        "zip1 z22.s, z13.s, z14.s\n"
-                        "zip2 z23.s, z13.s, z14.s\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "trn2 z4.d, z2.d, z3.d\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "trn1 z0.d, z2.d, z3.d\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "trn2 z4.d, z2.d, z3.d\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "trn1 z0.d, z2.d, z3.d\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        "ld1rqh z3.h, p6/z, [a_ptr1]\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        "trn1 z0.d, z2.d, z3.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z4.d, z2.d, z3.d\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "uzp1 z0.s, z16.s, z17.s\n"
-                        "uzp2 z1.s, z16.s, z17.s\n"
-                        "uzp1 z2.s, z18.s, z19.s\n"
-                        "uzp2 z3.s, z18.s, z19.s\n"
-                        "st1w z0.s, p0, [%[c_ptr0]]\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "st1w z1.s, p0, [c_ptr1]\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "uzp1 z4.s, z20.s, z21.s\n"
-                        "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "uzp2 z5.s, z20.s, z21.s\n"
-                        "uzp1 z6.s, z22.s, z23.s\n"
-                        "uzp2 z7.s, z22.s, z23.s\n"
-                        "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "st1w z6.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "st1w z7.s, p3, [c_ptr1, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
-                    );
-                    break;
-                case 3:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "mov z3.h, #0\n"
-                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z15.s, z15.s\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "zip2 z17.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "zip1 z18.s, z15.s, z15.s\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "zip2 z19.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "trn1 z5.d, z2.d, z3.d\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z24.d, z16.d\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "mov z25.d, z17.d\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "zip1 z20.s, z15.s, z15.s\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "zip2 z21.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "mov z26.d, z18.d\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "mov z27.d, z19.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "zip1 z22.s, z15.s, z15.s\n"
-                        "zip2 z23.s, z15.s, z15.s\n"
-                        "mov z28.d, z20.d\n"
-                        "mov z29.d, z21.d\n"
-                        "mov z30.d, z22.d\n"
-                        "mov z31.d, z23.d\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "mov z3.h, #0\n"
-                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z14.s, p0/z, [c_ptr1]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z13.s, z14.s\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "zip2 z17.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "trn1 z5.d, z2.d, z3.d\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z18.s, z13.s, z14.s\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "zip2 z19.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                        "ld1w z14.s, p2/z, [c_ptr1, #2, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "zip1 z20.s, z13.s, z14.s\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "zip2 z21.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1w z14.s, p3/z, [c_ptr1, #3, MUL VL]\n"
-                        "zip1 z22.s, z13.s, z14.s\n"
-                        "zip2 z23.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p0/z, [c_ptr2]\n"
-                        "mov z14.s, #0\n"
-                        "zip1 z24.s, z13.s, z14.s\n"
-                        "zip2 z25.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
-                        "mov z14.s, #0\n"
-                        "zip1 z26.s, z13.s, z14.s\n"
-                        "zip2 z27.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p2/z, [c_ptr2, #2, MUL VL]\n"
-                        "mov z14.s, #0\n"
-                        "zip1 z28.s, z13.s, z14.s\n"
-                        "zip2 z29.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p3/z, [c_ptr2, #3, MUL VL]\n"
-                        "mov z14.s, #0\n"
-                        "zip1 z30.s, z13.s, z14.s\n"
-                        "zip2 z31.s, z13.s, z14.s\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        "ld1rqh z4.h, p7/z, [a_ptr2]\n"
-                        ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "mov z5.h, #0\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        "trn1 z0.d, z2.d, z3.d\n"
-                        ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "trn1 z1.d, z4.d, z5.d\n"
-                        "trn2 z5.d, z4.d, z5.d\n"
-                        "trn2 z4.d, z2.d, z3.d\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "mov z3.h, #0\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "trn1 z5.d, z2.d, z3.d\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        "ld1rqh z4.h, p7/z, [a_ptr2]\n"
-                        ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "mov z5.h, #0\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        "trn1 z0.d, z2.d, z3.d\n"
-                        ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z1.d, z4.d, z5.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "trn2 z5.d, z4.d, z5.d\n"
-                        "trn2 z4.d, z2.d, z3.d\n"
-                        "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "mov z3.h, #0\n"
-                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
-                        ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
-                        ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
-                        ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
-                        ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
-                        ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
-                        ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
-                        ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
-                        "trn1 z5.d, z2.d, z3.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
-                        ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
-                        ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
-                        ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
-                        ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
-                        ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
-                        ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
-                        ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
-                        ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
-                        ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
-                        ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
-                        ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
-                        ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "ld1rqh z3.h, p6/z, [a_ptr1]\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        "ld1rqh z4.h, p6/z, [a_ptr2]\n"
-                        ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "mov z5.h, #0\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        "trn1 z0.d, z2.d, z3.d\n"
-                        ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
-                        ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
-                        ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
-                        ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
-                        ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
-                        ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
-                        ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
-                        "trn1 z1.d, z4.d, z5.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z5.d, z4.d, z5.d\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z4.d, z2.d, z3.d\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
-                        ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
-                        ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
-                        ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
-                        ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
-                        ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
-                        ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
-                        ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
-                        ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
-                        ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
-                        ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
-                        ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
-                        ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "uzp1 z0.s, z16.s, z17.s\n"
-                        "uzp2 z1.s, z16.s, z17.s\n"
-                        "uzp1 z2.s, z18.s, z19.s\n"
-                        "uzp2 z3.s, z18.s, z19.s\n"
-                        "st1w z0.s, p0, [%[c_ptr0]]\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "st1w z1.s, p0, [c_ptr1]\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "fmax z24.s, p7/m, z24.s, z14.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "fmin z24.s, p7/m, z24.s, z15.s\n"
-                        "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "uzp1 z4.s, z20.s, z21.s\n"
-                        "uzp2 z5.s, z20.s, z21.s\n"
-                        "uzp1 z6.s, z22.s, z23.s\n"
-                        "uzp2 z7.s, z22.s, z23.s\n"
-                        "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "fmax z25.s, p7/m, z25.s, z14.s\n"
-                        "fmax z26.s, p7/m, z26.s, z14.s\n"
-                        "fmax z27.s, p7/m, z27.s, z14.s\n"
-                        "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "fmax z28.s, p7/m, z28.s, z14.s\n"
-                        "fmin z25.s, p7/m, z25.s, z15.s\n"
-                        "fmin z26.s, p7/m, z26.s, z15.s\n"
-                        "st1w z6.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "fmin z27.s, p7/m, z27.s, z15.s\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "uzp1 z8.s, z24.s, z25.s\n"
-                        "st1w z7.s, p3, [c_ptr1, #3, MUL VL]\n"
-                        "fmin z28.s, p7/m, z28.s, z15.s\n"
-                        "uzp1 z9.s, z26.s, z27.s\n"
-                        "fmax z29.s, p7/m, z29.s, z14.s\n"
-                        "st1w z8.s, p0, [c_ptr2]\n"
-                        "fmax z30.s, p7/m, z30.s, z14.s\n"
-                        "fmax z31.s, p7/m, z31.s, z14.s\n"
-                        "fmin z29.s, p7/m, z29.s, z15.s\n"
-                        "st1w z9.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "fmin z30.s, p7/m, z30.s, z15.s\n"
-                        "fmin z31.s, p7/m, z31.s, z15.s\n"
-                        "uzp1 z10.s, z28.s, z29.s\n"
-                        "uzp1 z11.s, z30.s, z31.s\n"
-                        "st1w z10.s, p2, [c_ptr2, #2, MUL VL]\n"
-                        "st1w z11.s, p3, [c_ptr2, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
-                    );
-                    break;
-                default:
-                case 4:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z15.s, z15.s\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "zip2 z17.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3]\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "zip1 z18.s, z15.s, z15.s\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip2 z19.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "trn1 z5.d, z2.d, z3.d\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "mov z24.d, z16.d\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z25.d, z17.d\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "zip1 z20.s, z15.s, z15.s\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "zip2 z21.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "mov z26.d, z18.d\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "mov z27.d, z19.d\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "mov z28.d, z20.d\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "zip1 z22.s, z15.s, z15.s\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "zip2 z23.s, z15.s, z15.s\n"
-                        "mov z29.d, z21.d\n"
-                        "mov z30.d, z22.d\n"
-                        "mov z31.d, z23.d\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z14.s, p0/z, [c_ptr1]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z13.s, z14.s\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "zip2 z17.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "zip1 z18.s, z13.s, z14.s\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip2 z19.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                        "trn1 z5.d, z2.d, z3.d\n"
-                        "ld1w z14.s, p2/z, [c_ptr1, #2, MUL VL]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "zip1 z20.s, z13.s, z14.s\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "zip2 z21.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1w z14.s, p3/z, [c_ptr1, #3, MUL VL]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "zip1 z22.s, z13.s, z14.s\n"
-                        "zip2 z23.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p0/z, [c_ptr2]\n"
-                        "ld1w z14.s, p0/z, [c_ptr3]\n"
-                        "zip1 z24.s, z13.s, z14.s\n"
-                        "zip2 z25.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
-                        "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
-                        "zip1 z26.s, z13.s, z14.s\n"
-                        "zip2 z27.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p2/z, [c_ptr2, #2, MUL VL]\n"
-                        "ld1w z14.s, p2/z, [c_ptr3, #2, MUL VL]\n"
-                        "zip1 z28.s, z13.s, z14.s\n"
-                        "zip2 z29.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p3/z, [c_ptr2, #3, MUL VL]\n"
-                        "ld1w z14.s, p3/z, [c_ptr3, #3, MUL VL]\n"
-                        "zip1 z30.s, z13.s, z14.s\n"
-                        "zip2 z31.s, z13.s, z14.s\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        "ld1rqh z4.h, p7/z, [a_ptr2]\n"
-                        ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr3]\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        "trn1 z0.d, z2.d, z3.d\n"
-                        ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "trn1 z1.d, z4.d, z5.d\n"
-                        "trn2 z5.d, z4.d, z5.d\n"
-                        "trn2 z4.d, z2.d, z3.d\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "trn1 z5.d, z2.d, z3.d\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        "ld1rqh z4.h, p7/z, [a_ptr2]\n"
-                        ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr3]\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        "trn1 z0.d, z2.d, z3.d\n"
-                        ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z1.d, z4.d, z5.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "trn2 z5.d, z4.d, z5.d\n"
-                        "trn2 z4.d, z2.d, z3.d\n"
-                        "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        "addvl a_ptr3, a_ptr3, #2\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
-                        ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
-                        ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
-                        ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
-                        ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
-                        ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
-                        ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
-                        ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
-                        "trn1 z5.d, z2.d, z3.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
-                        ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
-                        ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
-                        ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
-                        ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
-                        ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
-                        ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
-                        ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
-                        ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
-                        ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
-                        ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
-                        ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
-                        ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "ld1rqh z3.h, p6/z, [a_ptr1]\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        "ld1rqh z4.h, p6/z, [a_ptr2]\n"
-                        ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        "ld1rqh z5.h, p6/z, [a_ptr3]\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        "addvl a_ptr3, a_ptr3, #1\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        "trn1 z0.d, z2.d, z3.d\n"
-                        ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
-                        ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
-                        ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
-                        ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
-                        ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
-                        ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
-                        ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
-                        "trn1 z1.d, z4.d, z5.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z5.d, z4.d, z5.d\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z4.d, z2.d, z3.d\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
-                        ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
-                        ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
-                        ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
-                        ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
-                        ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
-                        ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
-                        ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
-                        ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
-                        ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
-                        ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
-                        ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
-                        ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
-                        ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "uzp1 z0.s, z16.s, z17.s\n"
-                        "uzp2 z1.s, z16.s, z17.s\n"
-                        "uzp1 z2.s, z18.s, z19.s\n"
-                        "uzp2 z3.s, z18.s, z19.s\n"
-                        "st1w z0.s, p0, [%[c_ptr0]]\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "st1w z1.s, p0, [c_ptr1]\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "fmax z24.s, p7/m, z24.s, z14.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "fmin z24.s, p7/m, z24.s, z15.s\n"
-                        "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "uzp1 z4.s, z20.s, z21.s\n"
-                        "uzp2 z5.s, z20.s, z21.s\n"
-                        "uzp1 z6.s, z22.s, z23.s\n"
-                        "uzp2 z7.s, z22.s, z23.s\n"
-                        "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "fmax z25.s, p7/m, z25.s, z14.s\n"
-                        "fmax z26.s, p7/m, z26.s, z14.s\n"
-                        "fmax z27.s, p7/m, z27.s, z14.s\n"
-                        "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "fmax z28.s, p7/m, z28.s, z14.s\n"
-                        "fmin z25.s, p7/m, z25.s, z15.s\n"
-                        "fmin z26.s, p7/m, z26.s, z15.s\n"
-                        "st1w z6.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "fmin z27.s, p7/m, z27.s, z15.s\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "uzp1 z8.s, z24.s, z25.s\n"
-                        "st1w z7.s, p3, [c_ptr1, #3, MUL VL]\n"
-                        "uzp2 z9.s, z24.s, z25.s\n"
-                        "uzp1 z10.s, z26.s, z27.s\n"
-                        "uzp2 z11.s, z26.s, z27.s\n"
-                        "st1w z8.s, p0, [c_ptr2]\n"
-                        "fmin z28.s, p7/m, z28.s, z15.s\n"
-                        "fmax z29.s, p7/m, z29.s, z14.s\n"
-                        "fmax z30.s, p7/m, z30.s, z14.s\n"
-                        "st1w z9.s, p0, [c_ptr3]\n"
-                        "fmax z31.s, p7/m, z31.s, z14.s\n"
-                        "fmin z29.s, p7/m, z29.s, z15.s\n"
-                        "st1w z10.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "fmin z30.s, p7/m, z30.s, z15.s\n"
-                        "fmin z31.s, p7/m, z31.s, z15.s\n"
-                        "uzp1 z12.s, z28.s, z29.s\n"
-                        "st1w z11.s, p1, [c_ptr3, #1, MUL VL]\n"
-                        "uzp2 z13.s, z28.s, z29.s\n"
-                        "uzp1 z14.s, z30.s, z31.s\n"
-                        "uzp2 z15.s, z30.s, z31.s\n"
-                        "st1w z12.s, p2, [c_ptr2, #2, MUL VL]\n"
-                        "st1w z13.s, p2, [c_ptr3, #2, MUL VL]\n"
-                        "st1w z14.s, p3, [c_ptr2, #3, MUL VL]\n"
-                        "st1w z15.s, p3, [c_ptr3, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
-                    );
-                    break;
-            }
-
-        }
-    }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp
deleted file mode 100644
index 7610a20ac0..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp
+++ /dev/null
@@ -1,3778 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, __fp16 *C, int ldc, int M, int N, int K, const __fp16 *bias, Activation act, bool accumulate) {
-    const int K_stride = K;
-    const long loops_count = ((K + 8) / 16) - 1;
-    K -= loops_count * 16;
-    const long regs_count = (K / 8) - 1;
-    K -= (regs_count + 1) * 8;
-    const long leftovers = K;
-    __fp16 nullbias[512];
-    if (!accumulate && !bias) {
-        memset(nullbias, 0, (4 * get_vector_length<__fp16>() * sizeof(__fp16)));
-    }
-    __fp16 minval = - static_cast<__fp16>(std::numeric_limits<float>::infinity());
-    __fp16 maxval =   static_cast<__fp16>(std::numeric_limits<float>::infinity());
-    const __fp16 * const minptr = &minval;
-    const __fp16 * const maxptr = &maxval;
-
-    switch(act.type)
-    {
-        default:
-        case Activation::Type::None:
-            break;
-        case Activation::Type::BoundedReLU:
-            maxval = static_cast<__fp16>(act.param1);
-            /* fall through */
-        case Activation::Type::ReLU:
-            minval = 0.0f;
-            break;
-    }
-
-    int rows_to_compute;
-
-    for (int y=0; y<M; y+=rows_to_compute) {
-        const __fp16 * const a_ptr0_base = A + (y * lda);
-        const unsigned long ldab = lda * sizeof(__fp16);
-
-        __fp16 *c_ptr0 = C + (y * ldc);
-
-        rows_to_compute = M-y;
-        if (rows_to_compute > 4) {
-            if (rows_to_compute % 4) {
-                rows_to_compute = 4 - 1;
-            } else {
-                rows_to_compute = 4;
-            }
-        }
-
-        for (int x0=0; x0<N; x0+=(4 * get_vector_length<__fp16>())) {
-            const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<__fp16>()));
-            long loops = loops_count;
-            long regs = regs_count;
-            long temp = 0;
-            long blocks = leftovers;
-            const __fp16 *a_ptr0 = a_ptr0_base;
-            const __fp16 *b_ptr0 = B + (K_stride * x0);
-            const unsigned long ldcb = ldc * sizeof(__fp16);
-            const __fp16 *biasptr = bias ? bias+x0 : nullbias;
-
-            switch(rows_to_compute) {
-                case 1:
-                    __asm __volatile (
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.h, %[temp], %[width]\n"
-                        "inch %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "whilelt p1.h, %[temp], %[width]\n"
-                        "inch %[temp], all, mul #1\n"
-                        "whilelt p2.h, %[temp], %[width]\n"
-                        "inch %[temp], all, mul #1\n"
-                        "whilelt p3.h, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ld1h z16.h, p0/z, [%[biasptr]]\n"
-                        "ld1h z17.h, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "ld1h z18.h, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "ld1h z19.h, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
-                        "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                        "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[7]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z17.h, z13.h, z0.h[7]\n"
-                        "fmla z18.h, z14.h, z0.h[7]\n"
-                        "fmla z19.h, z15.h, z0.h[7]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[0]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z9.h, z4.h[0]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[0]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[0]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[1]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "fmla z17.h, z13.h, z4.h[1]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z18.h, z14.h, z4.h[1]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[1]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[2]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z9.h, z4.h[2]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[2]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[2]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[3]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z17.h, z13.h, z4.h[3]\n"
-                        "fmla z18.h, z14.h, z4.h[3]\n"
-                        "fmla z19.h, z15.h, z4.h[3]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[4]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z9.h, z4.h[4]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[4]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[4]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[5]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z4.h[5]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[5]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[5]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[6]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z9.h, z4.h[6]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[6]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[6]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[7]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z17.h, z13.h, z4.h[7]\n"
-                        "fmla z18.h, z14.h, z4.h[7]\n"
-                        "fmla z19.h, z15.h, z4.h[7]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[7]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z17.h, z13.h, z0.h[7]\n"
-                        "fmla z18.h, z14.h, z0.h[7]\n"
-                        "fmla z19.h, z15.h, z0.h[7]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[0]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z9.h, z4.h[0]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[0]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[0]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[1]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z4.h[1]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[1]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[1]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[2]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z9.h, z4.h[2]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[2]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[2]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[3]\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        "fmla z17.h, z13.h, z4.h[3]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z18.h, z14.h, z4.h[3]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "fmla z19.h, z15.h, z4.h[3]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[4]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z9.h, z4.h[4]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[4]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[4]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[5]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z4.h[5]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[5]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[5]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[6]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z9.h, z4.h[6]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[6]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[6]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[7]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "fmla z17.h, z13.h, z4.h[7]\n"
-                        "fmla z18.h, z14.h, z4.h[7]\n"
-                        "fmla z19.h, z15.h, z4.h[7]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "b.eq 5f\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[7]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "fmla z17.h, z13.h, z0.h[7]\n"
-                        "fmla z18.h, z14.h, z0.h[7]\n"
-                        "fmla z19.h, z15.h, z0.h[7]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z17.h, z9.h, z4.h[0]\n"
-                        "fmla z18.h, z10.h, z4.h[0]\n"
-                        "fmla z19.h, z11.h, z4.h[0]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z17.h, z13.h, z4.h[1]\n"
-                        "fmla z18.h, z14.h, z4.h[1]\n"
-                        "fmla z19.h, z15.h, z4.h[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[2]\n"
-                        "fmla z17.h, z9.h, z4.h[2]\n"
-                        "fmla z18.h, z10.h, z4.h[2]\n"
-                        "fmla z19.h, z11.h, z4.h[2]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z17.h, z13.h, z4.h[3]\n"
-                        "fmla z18.h, z14.h, z4.h[3]\n"
-                        "fmla z19.h, z15.h, z4.h[3]\n"
-                        "b.eq 5f\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[4]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z17.h, z9.h, z4.h[4]\n"
-                        "fmla z18.h, z10.h, z4.h[4]\n"
-                        "fmla z19.h, z11.h, z4.h[4]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[5]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z17.h, z13.h, z4.h[5]\n"
-                        "fmla z18.h, z14.h, z4.h[5]\n"
-                        "fmla z19.h, z15.h, z4.h[5]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[6]\n"
-                        "fmla z17.h, z9.h, z4.h[6]\n"
-                        "fmla z18.h, z10.h, z4.h[6]\n"
-                        "fmla z19.h, z11.h, z4.h[6]\n"
-                        "5:\n"
-                        "ld1rh z14.h, p7/z, [%[minptr]]\n"
-                        "ld1rh z15.h, p7/z, [%[maxptr]]\n"
-                        "fmax z16.h, p7/m, z16.h, z14.h\n"
-                        "fmax z17.h, p7/m, z17.h, z14.h\n"
-                        "fmax z18.h, p7/m, z18.h, z14.h\n"
-                        "fmax z19.h, p7/m, z19.h, z14.h\n"
-                        "fmin z16.h, p7/m, z16.h, z15.h\n"
-                        "fmin z17.h, p7/m, z17.h, z15.h\n"
-                        "fmin z18.h, p7/m, z18.h, z15.h\n"
-                        "fmin z19.h, p7/m, z19.h, z15.h\n"
-                        "st1h z16.h, p0, [%[c_ptr0]]\n"
-                        "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
-                    );
-                    break;
-                case 2:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.h, %[temp], %[width]\n"
-                        "inch %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "whilelt p1.h, %[temp], %[width]\n"
-                        "inch %[temp], all, mul #1\n"
-                        "whilelt p2.h, %[temp], %[width]\n"
-                        "inch %[temp], all, mul #1\n"
-                        "whilelt p3.h, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ld1h z16.h, p0/z, [%[biasptr]]\n"
-                        "ld1h z17.h, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "ld1h z18.h, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "ld1h z19.h, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "mov z20.d, z16.d\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "mov z21.d, z17.d\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "mov z22.d, z18.d\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "mov z23.d, z19.d\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
-                        "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                        "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1h z20.h, p0/z, [c_ptr1]\n"
-                        "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
-                        "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z20.h, z8.h, z1.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        "fmla z21.h, z9.h, z1.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z22.h, z10.h, z1.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla z23.h, z11.h, z1.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla z20.h, z12.h, z1.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla z21.h, z13.h, z1.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "fmla z22.h, z14.h, z1.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "fmla z23.h, z15.h, z1.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.h, z8.h, z1.h[2]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "fmla z21.h, z9.h, z1.h[2]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z22.h, z10.h, z1.h[2]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "fmla z23.h, z11.h, z1.h[2]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "fmla z20.h, z12.h, z1.h[3]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "fmla z21.h, z13.h, z1.h[3]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "fmla z22.h, z14.h, z1.h[3]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "fmla z23.h, z15.h, z1.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "fmla z20.h, z8.h, z1.h[4]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "fmla z21.h, z9.h, z1.h[4]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "fmla z22.h, z10.h, z1.h[4]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "fmla z23.h, z11.h, z1.h[4]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "fmla z20.h, z12.h, z1.h[5]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "fmla z21.h, z13.h, z1.h[5]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "fmla z22.h, z14.h, z1.h[5]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "fmla z23.h, z15.h, z1.h[5]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.h, z8.h, z1.h[6]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "fmla z21.h, z9.h, z1.h[6]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z22.h, z10.h, z1.h[6]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "fmla z23.h, z11.h, z1.h[6]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[7]\n"
-                        "fmla z20.h, z12.h, z1.h[7]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[7]\n"
-                        "fmla z21.h, z13.h, z1.h[7]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[7]\n"
-                        "fmla z22.h, z14.h, z1.h[7]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[7]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "fmla z23.h, z15.h, z1.h[7]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[0]\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
-                        "fmla z20.h, z8.h, z5.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[0]\n"
-                        "fmla z21.h, z9.h, z5.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[0]\n"
-                        "fmla z22.h, z10.h, z5.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[0]\n"
-                        "fmla z23.h, z11.h, z5.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[1]\n"
-                        "fmla z20.h, z12.h, z5.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z4.h[1]\n"
-                        "fmla z21.h, z13.h, z5.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[1]\n"
-                        "fmla z22.h, z14.h, z5.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[1]\n"
-                        "fmla z23.h, z15.h, z5.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.h, z8.h, z5.h[2]\n"
-                        "fmla z17.h, z9.h, z4.h[2]\n"
-                        "fmla z21.h, z9.h, z5.h[2]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[2]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z22.h, z10.h, z5.h[2]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[2]\n"
-                        "fmla z23.h, z11.h, z5.h[2]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[3]\n"
-                        "fmla z20.h, z12.h, z5.h[3]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z4.h[3]\n"
-                        "fmla z21.h, z13.h, z5.h[3]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[3]\n"
-                        "fmla z22.h, z14.h, z5.h[3]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[3]\n"
-                        "fmla z23.h, z15.h, z5.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[4]\n"
-                        "fmla z20.h, z8.h, z5.h[4]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[4]\n"
-                        "fmla z21.h, z9.h, z5.h[4]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[4]\n"
-                        "fmla z22.h, z10.h, z5.h[4]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[4]\n"
-                        "fmla z23.h, z11.h, z5.h[4]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[5]\n"
-                        "fmla z20.h, z12.h, z5.h[5]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z4.h[5]\n"
-                        "fmla z21.h, z13.h, z5.h[5]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[5]\n"
-                        "fmla z22.h, z14.h, z5.h[5]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[5]\n"
-                        "fmla z23.h, z15.h, z5.h[5]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[6]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.h, z8.h, z5.h[6]\n"
-                        "fmla z17.h, z9.h, z4.h[6]\n"
-                        "fmla z21.h, z9.h, z5.h[6]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[6]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z22.h, z10.h, z5.h[6]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[6]\n"
-                        "fmla z23.h, z11.h, z5.h[6]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[7]\n"
-                        "fmla z20.h, z12.h, z5.h[7]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z4.h[7]\n"
-                        "fmla z21.h, z13.h, z5.h[7]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[7]\n"
-                        "fmla z22.h, z14.h, z5.h[7]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[7]\n"
-                        "fmla z23.h, z15.h, z5.h[7]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z20.h, z8.h, z1.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        "fmla z21.h, z9.h, z1.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z22.h, z10.h, z1.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "fmla z23.h, z11.h, z1.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "fmla z20.h, z12.h, z1.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "fmla z21.h, z13.h, z1.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "fmla z22.h, z14.h, z1.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "fmla z23.h, z15.h, z1.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.h, z8.h, z1.h[2]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "fmla z21.h, z9.h, z1.h[2]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z22.h, z10.h, z1.h[2]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "fmla z23.h, z11.h, z1.h[2]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "fmla z20.h, z12.h, z1.h[3]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "fmla z21.h, z13.h, z1.h[3]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "fmla z22.h, z14.h, z1.h[3]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "fmla z23.h, z15.h, z1.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "fmla z20.h, z8.h, z1.h[4]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "fmla z21.h, z9.h, z1.h[4]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "fmla z22.h, z10.h, z1.h[4]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "fmla z23.h, z11.h, z1.h[4]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "fmla z20.h, z12.h, z1.h[5]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "fmla z21.h, z13.h, z1.h[5]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "fmla z22.h, z14.h, z1.h[5]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "fmla z23.h, z15.h, z1.h[5]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.h, z8.h, z1.h[6]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "fmla z21.h, z9.h, z1.h[6]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z22.h, z10.h, z1.h[6]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "fmla z23.h, z11.h, z1.h[6]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[7]\n"
-                        "fmla z20.h, z12.h, z1.h[7]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[7]\n"
-                        "fmla z21.h, z13.h, z1.h[7]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[7]\n"
-                        "fmla z22.h, z14.h, z1.h[7]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[7]\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        "fmla z23.h, z15.h, z1.h[7]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[0]\n"
-                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
-                        "fmla z20.h, z8.h, z5.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "fmla z21.h, z9.h, z5.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[0]\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        "fmla z22.h, z10.h, z5.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[0]\n"
-                        "fmla z23.h, z11.h, z5.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[1]\n"
-                        "fmla z20.h, z12.h, z5.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z4.h[1]\n"
-                        "fmla z21.h, z13.h, z5.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[1]\n"
-                        "fmla z22.h, z14.h, z5.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[1]\n"
-                        "fmla z23.h, z15.h, z5.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.h, z8.h, z5.h[2]\n"
-                        "fmla z17.h, z9.h, z4.h[2]\n"
-                        "fmla z21.h, z9.h, z5.h[2]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[2]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z22.h, z10.h, z5.h[2]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[2]\n"
-                        "fmla z23.h, z11.h, z5.h[2]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[3]\n"
-                        "fmla z20.h, z12.h, z5.h[3]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z4.h[3]\n"
-                        "fmla z21.h, z13.h, z5.h[3]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[3]\n"
-                        "fmla z22.h, z14.h, z5.h[3]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[3]\n"
-                        "fmla z23.h, z15.h, z5.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[4]\n"
-                        "fmla z20.h, z8.h, z5.h[4]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[4]\n"
-                        "fmla z21.h, z9.h, z5.h[4]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[4]\n"
-                        "fmla z22.h, z10.h, z5.h[4]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[4]\n"
-                        "fmla z23.h, z11.h, z5.h[4]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[5]\n"
-                        "fmla z20.h, z12.h, z5.h[5]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z4.h[5]\n"
-                        "fmla z21.h, z13.h, z5.h[5]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[5]\n"
-                        "fmla z22.h, z14.h, z5.h[5]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[5]\n"
-                        "fmla z23.h, z15.h, z5.h[5]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[6]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "fmla z20.h, z8.h, z5.h[6]\n"
-                        "fmla z17.h, z9.h, z4.h[6]\n"
-                        "fmla z21.h, z9.h, z5.h[6]\n"
-                        "fmla z18.h, z10.h, z4.h[6]\n"
-                        "fmla z22.h, z10.h, z5.h[6]\n"
-                        "fmla z19.h, z11.h, z4.h[6]\n"
-                        "fmla z23.h, z11.h, z5.h[6]\n"
-                        "fmla z16.h, z12.h, z4.h[7]\n"
-                        "fmla z20.h, z12.h, z5.h[7]\n"
-                        "fmla z17.h, z13.h, z4.h[7]\n"
-                        "fmla z21.h, z13.h, z5.h[7]\n"
-                        "fmla z18.h, z14.h, z4.h[7]\n"
-                        "fmla z22.h, z14.h, z5.h[7]\n"
-                        "fmla z19.h, z15.h, z4.h[7]\n"
-                        "fmla z23.h, z15.h, z5.h[7]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z20.h, z8.h, z1.h[0]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "fmla z21.h, z9.h, z1.h[0]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "fmla z22.h, z10.h, z1.h[0]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "fmla z23.h, z11.h, z1.h[0]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z20.h, z12.h, z1.h[1]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "fmla z21.h, z13.h, z1.h[1]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "fmla z22.h, z14.h, z1.h[1]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "fmla z23.h, z15.h, z1.h[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "fmla z20.h, z8.h, z1.h[2]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "fmla z21.h, z9.h, z1.h[2]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "fmla z22.h, z10.h, z1.h[2]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "fmla z23.h, z11.h, z1.h[2]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z20.h, z12.h, z1.h[3]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "fmla z21.h, z13.h, z1.h[3]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "fmla z22.h, z14.h, z1.h[3]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "fmla z23.h, z15.h, z1.h[3]\n"
-                        "b.eq 5f\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z20.h, z8.h, z1.h[4]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "fmla z21.h, z9.h, z1.h[4]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "fmla z22.h, z10.h, z1.h[4]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "fmla z23.h, z11.h, z1.h[4]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z20.h, z12.h, z1.h[5]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "fmla z21.h, z13.h, z1.h[5]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "fmla z22.h, z14.h, z1.h[5]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "fmla z23.h, z15.h, z1.h[5]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "fmla z20.h, z8.h, z1.h[6]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "fmla z21.h, z9.h, z1.h[6]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "fmla z22.h, z10.h, z1.h[6]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "fmla z23.h, z11.h, z1.h[6]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z20.h, z8.h, z1.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
-                        "fmla z21.h, z9.h, z1.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
-                        "fmla z22.h, z10.h, z1.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "fmla z23.h, z11.h, z1.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        "fmla z20.h, z12.h, z1.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "fmla z21.h, z13.h, z1.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "fmla z22.h, z14.h, z1.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "fmla z23.h, z15.h, z1.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.h, z8.h, z1.h[2]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "fmla z21.h, z9.h, z1.h[2]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z22.h, z10.h, z1.h[2]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "fmla z23.h, z11.h, z1.h[2]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "fmla z20.h, z12.h, z1.h[3]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "fmla z21.h, z13.h, z1.h[3]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "fmla z22.h, z14.h, z1.h[3]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "fmla z23.h, z15.h, z1.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "fmla z20.h, z8.h, z1.h[4]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "fmla z21.h, z9.h, z1.h[4]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "fmla z22.h, z10.h, z1.h[4]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "fmla z23.h, z11.h, z1.h[4]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "fmla z20.h, z12.h, z1.h[5]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "fmla z21.h, z13.h, z1.h[5]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "fmla z22.h, z14.h, z1.h[5]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "fmla z23.h, z15.h, z1.h[5]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "fmla z20.h, z8.h, z1.h[6]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "fmla z21.h, z9.h, z1.h[6]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "fmla z22.h, z10.h, z1.h[6]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "fmla z23.h, z11.h, z1.h[6]\n"
-                        "fmla z16.h, z12.h, z0.h[7]\n"
-                        "fmla z20.h, z12.h, z1.h[7]\n"
-                        "fmla z17.h, z13.h, z0.h[7]\n"
-                        "fmla z21.h, z13.h, z1.h[7]\n"
-                        "fmla z18.h, z14.h, z0.h[7]\n"
-                        "fmla z22.h, z14.h, z1.h[7]\n"
-                        "fmla z19.h, z15.h, z0.h[7]\n"
-                        "fmla z23.h, z15.h, z1.h[7]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z20.h, z8.h, z5.h[0]\n"
-                        "fmla z17.h, z9.h, z4.h[0]\n"
-                        "fmla z21.h, z9.h, z5.h[0]\n"
-                        "fmla z18.h, z10.h, z4.h[0]\n"
-                        "fmla z22.h, z10.h, z5.h[0]\n"
-                        "fmla z19.h, z11.h, z4.h[0]\n"
-                        "fmla z23.h, z11.h, z5.h[0]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z20.h, z12.h, z5.h[1]\n"
-                        "fmla z17.h, z13.h, z4.h[1]\n"
-                        "fmla z21.h, z13.h, z5.h[1]\n"
-                        "fmla z18.h, z14.h, z4.h[1]\n"
-                        "fmla z22.h, z14.h, z5.h[1]\n"
-                        "fmla z19.h, z15.h, z4.h[1]\n"
-                        "fmla z23.h, z15.h, z5.h[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[2]\n"
-                        "fmla z20.h, z8.h, z5.h[2]\n"
-                        "fmla z17.h, z9.h, z4.h[2]\n"
-                        "fmla z21.h, z9.h, z5.h[2]\n"
-                        "fmla z18.h, z10.h, z4.h[2]\n"
-                        "fmla z22.h, z10.h, z5.h[2]\n"
-                        "fmla z19.h, z11.h, z4.h[2]\n"
-                        "fmla z23.h, z11.h, z5.h[2]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z20.h, z12.h, z5.h[3]\n"
-                        "fmla z17.h, z13.h, z4.h[3]\n"
-                        "fmla z21.h, z13.h, z5.h[3]\n"
-                        "fmla z18.h, z14.h, z4.h[3]\n"
-                        "fmla z22.h, z14.h, z5.h[3]\n"
-                        "fmla z19.h, z15.h, z4.h[3]\n"
-                        "fmla z23.h, z15.h, z5.h[3]\n"
-                        "b.eq 5f\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[4]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z20.h, z8.h, z5.h[4]\n"
-                        "fmla z17.h, z9.h, z4.h[4]\n"
-                        "fmla z21.h, z9.h, z5.h[4]\n"
-                        "fmla z18.h, z10.h, z4.h[4]\n"
-                        "fmla z22.h, z10.h, z5.h[4]\n"
-                        "fmla z19.h, z11.h, z4.h[4]\n"
-                        "fmla z23.h, z11.h, z5.h[4]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[5]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z20.h, z12.h, z5.h[5]\n"
-                        "fmla z17.h, z13.h, z4.h[5]\n"
-                        "fmla z21.h, z13.h, z5.h[5]\n"
-                        "fmla z18.h, z14.h, z4.h[5]\n"
-                        "fmla z22.h, z14.h, z5.h[5]\n"
-                        "fmla z19.h, z15.h, z4.h[5]\n"
-                        "fmla z23.h, z15.h, z5.h[5]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[6]\n"
-                        "fmla z20.h, z8.h, z5.h[6]\n"
-                        "fmla z17.h, z9.h, z4.h[6]\n"
-                        "fmla z21.h, z9.h, z5.h[6]\n"
-                        "fmla z18.h, z10.h, z4.h[6]\n"
-                        "fmla z22.h, z10.h, z5.h[6]\n"
-                        "fmla z19.h, z11.h, z4.h[6]\n"
-                        "fmla z23.h, z11.h, z5.h[6]\n"
-                        "5:\n"
-                        "ld1rh z14.h, p7/z, [%[minptr]]\n"
-                        "ld1rh z15.h, p7/z, [%[maxptr]]\n"
-                        "fmax z16.h, p7/m, z16.h, z14.h\n"
-                        "fmax z17.h, p7/m, z17.h, z14.h\n"
-                        "fmax z18.h, p7/m, z18.h, z14.h\n"
-                        "fmax z19.h, p7/m, z19.h, z14.h\n"
-                        "fmin z16.h, p7/m, z16.h, z15.h\n"
-                        "fmin z17.h, p7/m, z17.h, z15.h\n"
-                        "fmin z18.h, p7/m, z18.h, z15.h\n"
-                        "fmin z19.h, p7/m, z19.h, z15.h\n"
-                        "st1h z16.h, p0, [%[c_ptr0]]\n"
-                        "fmax z20.h, p7/m, z20.h, z14.h\n"
-                        "fmax z21.h, p7/m, z21.h, z14.h\n"
-                        "fmax z22.h, p7/m, z22.h, z14.h\n"
-                        "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmax z23.h, p7/m, z23.h, z14.h\n"
-                        "fmin z20.h, p7/m, z20.h, z15.h\n"
-                        "fmin z21.h, p7/m, z21.h, z15.h\n"
-                        "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "fmin z22.h, p7/m, z22.h, z15.h\n"
-                        "fmin z23.h, p7/m, z23.h, z15.h\n"
-                        "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "st1h z20.h, p0, [c_ptr1]\n"
-                        "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
-                        "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
-                        "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
-                    );
-                    break;
-                case 3:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.h, %[temp], %[width]\n"
-                        "inch %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "whilelt p1.h, %[temp], %[width]\n"
-                        "inch %[temp], all, mul #1\n"
-                        "whilelt p2.h, %[temp], %[width]\n"
-                        "inch %[temp], all, mul #1\n"
-                        "whilelt p3.h, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ld1h z16.h, p0/z, [%[biasptr]]\n"
-                        "ld1h z17.h, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "ld1h z18.h, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "ld1h z19.h, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "mov z20.d, z16.d\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "mov z21.d, z17.d\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "mov z22.d, z18.d\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "mov z23.d, z19.d\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "mov z24.d, z16.d\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "mov z25.d, z17.d\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "mov z26.d, z18.d\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z27.d, z19.d\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
-                        "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                        "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1h z20.h, p0/z, [c_ptr1]\n"
-                        "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
-                        "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
-                        "ld1h z24.h, p0/z, [c_ptr2]\n"
-                        "ld1h z25.h, p1/z, [c_ptr2, #1, MUL VL]\n"
-                        "ld1h z26.h, p2/z, [c_ptr2, #2, MUL VL]\n"
-                        "ld1h z27.h, p3/z, [c_ptr2, #3, MUL VL]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z20.h, z8.h, z1.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        "fmla z24.h, z8.h, z2.h[0]\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
-                        "fmla z21.h, z9.h, z1.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z25.h, z9.h, z2.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla z22.h, z10.h, z1.h[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla z26.h, z10.h, z2.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla z23.h, z11.h, z1.h[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "fmla z27.h, z11.h, z2.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "fmla z20.h, z12.h, z1.h[1]\n"
-                        "fmla z24.h, z12.h, z2.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "fmla z21.h, z13.h, z1.h[1]\n"
-                        "fmla z25.h, z13.h, z2.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "fmla z22.h, z14.h, z1.h[1]\n"
-                        "fmla z26.h, z14.h, z2.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "fmla z23.h, z15.h, z1.h[1]\n"
-                        "fmla z27.h, z15.h, z2.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.h, z8.h, z1.h[2]\n"
-                        "fmla z24.h, z8.h, z2.h[2]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z21.h, z9.h, z1.h[2]\n"
-                        "fmla z25.h, z9.h, z2.h[2]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "fmla z22.h, z10.h, z1.h[2]\n"
-                        "fmla z26.h, z10.h, z2.h[2]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "fmla z23.h, z11.h, z1.h[2]\n"
-                        "fmla z27.h, z11.h, z2.h[2]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "fmla z20.h, z12.h, z1.h[3]\n"
-                        "fmla z24.h, z12.h, z2.h[3]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "fmla z21.h, z13.h, z1.h[3]\n"
-                        "fmla z25.h, z13.h, z2.h[3]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "fmla z22.h, z14.h, z1.h[3]\n"
-                        "fmla z26.h, z14.h, z2.h[3]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "fmla z23.h, z15.h, z1.h[3]\n"
-                        "fmla z27.h, z15.h, z2.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "fmla z20.h, z8.h, z1.h[4]\n"
-                        "fmla z24.h, z8.h, z2.h[4]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "fmla z21.h, z9.h, z1.h[4]\n"
-                        "fmla z25.h, z9.h, z2.h[4]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "fmla z22.h, z10.h, z1.h[4]\n"
-                        "fmla z26.h, z10.h, z2.h[4]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "fmla z23.h, z11.h, z1.h[4]\n"
-                        "fmla z27.h, z11.h, z2.h[4]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "fmla z20.h, z12.h, z1.h[5]\n"
-                        "fmla z24.h, z12.h, z2.h[5]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "fmla z21.h, z13.h, z1.h[5]\n"
-                        "fmla z25.h, z13.h, z2.h[5]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "fmla z22.h, z14.h, z1.h[5]\n"
-                        "fmla z26.h, z14.h, z2.h[5]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "fmla z23.h, z15.h, z1.h[5]\n"
-                        "fmla z27.h, z15.h, z2.h[5]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.h, z8.h, z1.h[6]\n"
-                        "fmla z24.h, z8.h, z2.h[6]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z21.h, z9.h, z1.h[6]\n"
-                        "fmla z25.h, z9.h, z2.h[6]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "fmla z22.h, z10.h, z1.h[6]\n"
-                        "fmla z26.h, z10.h, z2.h[6]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "fmla z23.h, z11.h, z1.h[6]\n"
-                        "fmla z27.h, z11.h, z2.h[6]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[7]\n"
-                        "fmla z20.h, z12.h, z1.h[7]\n"
-                        "fmla z24.h, z12.h, z2.h[7]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[7]\n"
-                        "fmla z21.h, z13.h, z1.h[7]\n"
-                        "fmla z25.h, z13.h, z2.h[7]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[7]\n"
-                        "fmla z22.h, z14.h, z1.h[7]\n"
-                        "fmla z26.h, z14.h, z2.h[7]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[7]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "fmla z23.h, z15.h, z1.h[7]\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
-                        "fmla z27.h, z15.h, z2.h[7]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[0]\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
-                        "fmla z20.h, z8.h, z5.h[0]\n"
-                        "fmla z24.h, z8.h, z6.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[0]\n"
-                        "fmla z21.h, z9.h, z5.h[0]\n"
-                        "fmla z25.h, z9.h, z6.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[0]\n"
-                        "fmla z22.h, z10.h, z5.h[0]\n"
-                        "fmla z26.h, z10.h, z6.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[0]\n"
-                        "fmla z23.h, z11.h, z5.h[0]\n"
-                        "fmla z27.h, z11.h, z6.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[1]\n"
-                        "fmla z20.h, z12.h, z5.h[1]\n"
-                        "fmla z24.h, z12.h, z6.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z4.h[1]\n"
-                        "fmla z21.h, z13.h, z5.h[1]\n"
-                        "fmla z25.h, z13.h, z6.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[1]\n"
-                        "fmla z22.h, z14.h, z5.h[1]\n"
-                        "fmla z26.h, z14.h, z6.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[1]\n"
-                        "fmla z23.h, z15.h, z5.h[1]\n"
-                        "fmla z27.h, z15.h, z6.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.h, z8.h, z5.h[2]\n"
-                        "fmla z24.h, z8.h, z6.h[2]\n"
-                        "fmla z17.h, z9.h, z4.h[2]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z21.h, z9.h, z5.h[2]\n"
-                        "fmla z25.h, z9.h, z6.h[2]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[2]\n"
-                        "fmla z22.h, z10.h, z5.h[2]\n"
-                        "fmla z26.h, z10.h, z6.h[2]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[2]\n"
-                        "fmla z23.h, z11.h, z5.h[2]\n"
-                        "fmla z27.h, z11.h, z6.h[2]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[3]\n"
-                        "fmla z20.h, z12.h, z5.h[3]\n"
-                        "fmla z24.h, z12.h, z6.h[3]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z4.h[3]\n"
-                        "fmla z21.h, z13.h, z5.h[3]\n"
-                        "fmla z25.h, z13.h, z6.h[3]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[3]\n"
-                        "fmla z22.h, z14.h, z5.h[3]\n"
-                        "fmla z26.h, z14.h, z6.h[3]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[3]\n"
-                        "fmla z23.h, z15.h, z5.h[3]\n"
-                        "fmla z27.h, z15.h, z6.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[4]\n"
-                        "fmla z20.h, z8.h, z5.h[4]\n"
-                        "fmla z24.h, z8.h, z6.h[4]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[4]\n"
-                        "fmla z21.h, z9.h, z5.h[4]\n"
-                        "fmla z25.h, z9.h, z6.h[4]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[4]\n"
-                        "fmla z22.h, z10.h, z5.h[4]\n"
-                        "fmla z26.h, z10.h, z6.h[4]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[4]\n"
-                        "fmla z23.h, z11.h, z5.h[4]\n"
-                        "fmla z27.h, z11.h, z6.h[4]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[5]\n"
-                        "fmla z20.h, z12.h, z5.h[5]\n"
-                        "fmla z24.h, z12.h, z6.h[5]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z4.h[5]\n"
-                        "fmla z21.h, z13.h, z5.h[5]\n"
-                        "fmla z25.h, z13.h, z6.h[5]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[5]\n"
-                        "fmla z22.h, z14.h, z5.h[5]\n"
-                        "fmla z26.h, z14.h, z6.h[5]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[5]\n"
-                        "fmla z23.h, z15.h, z5.h[5]\n"
-                        "fmla z27.h, z15.h, z6.h[5]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[6]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.h, z8.h, z5.h[6]\n"
-                        "fmla z24.h, z8.h, z6.h[6]\n"
-                        "fmla z17.h, z9.h, z4.h[6]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z21.h, z9.h, z5.h[6]\n"
-                        "fmla z25.h, z9.h, z6.h[6]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[6]\n"
-                        "fmla z22.h, z10.h, z5.h[6]\n"
-                        "fmla z26.h, z10.h, z6.h[6]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[6]\n"
-                        "fmla z23.h, z11.h, z5.h[6]\n"
-                        "fmla z27.h, z11.h, z6.h[6]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[7]\n"
-                        "fmla z20.h, z12.h, z5.h[7]\n"
-                        "fmla z24.h, z12.h, z6.h[7]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z4.h[7]\n"
-                        "fmla z21.h, z13.h, z5.h[7]\n"
-                        "fmla z25.h, z13.h, z6.h[7]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[7]\n"
-                        "fmla z22.h, z14.h, z5.h[7]\n"
-                        "fmla z26.h, z14.h, z6.h[7]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[7]\n"
-                        "fmla z23.h, z15.h, z5.h[7]\n"
-                        "fmla z27.h, z15.h, z6.h[7]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z20.h, z8.h, z1.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        "fmla z24.h, z8.h, z2.h[0]\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
-                        "fmla z21.h, z9.h, z1.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z25.h, z9.h, z2.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "fmla z22.h, z10.h, z1.h[0]\n"
-                        "fmla z26.h, z10.h, z2.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "fmla z23.h, z11.h, z1.h[0]\n"
-                        "fmla z27.h, z11.h, z2.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "fmla z20.h, z12.h, z1.h[1]\n"
-                        "fmla z24.h, z12.h, z2.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "fmla z21.h, z13.h, z1.h[1]\n"
-                        "fmla z25.h, z13.h, z2.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "fmla z22.h, z14.h, z1.h[1]\n"
-                        "fmla z26.h, z14.h, z2.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "fmla z23.h, z15.h, z1.h[1]\n"
-                        "fmla z27.h, z15.h, z2.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.h, z8.h, z1.h[2]\n"
-                        "fmla z24.h, z8.h, z2.h[2]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z21.h, z9.h, z1.h[2]\n"
-                        "fmla z25.h, z9.h, z2.h[2]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "fmla z22.h, z10.h, z1.h[2]\n"
-                        "fmla z26.h, z10.h, z2.h[2]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "fmla z23.h, z11.h, z1.h[2]\n"
-                        "fmla z27.h, z11.h, z2.h[2]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "fmla z20.h, z12.h, z1.h[3]\n"
-                        "fmla z24.h, z12.h, z2.h[3]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "fmla z21.h, z13.h, z1.h[3]\n"
-                        "fmla z25.h, z13.h, z2.h[3]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "fmla z22.h, z14.h, z1.h[3]\n"
-                        "fmla z26.h, z14.h, z2.h[3]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "fmla z23.h, z15.h, z1.h[3]\n"
-                        "fmla z27.h, z15.h, z2.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "fmla z20.h, z8.h, z1.h[4]\n"
-                        "fmla z24.h, z8.h, z2.h[4]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "fmla z21.h, z9.h, z1.h[4]\n"
-                        "fmla z25.h, z9.h, z2.h[4]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "fmla z22.h, z10.h, z1.h[4]\n"
-                        "fmla z26.h, z10.h, z2.h[4]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "fmla z23.h, z11.h, z1.h[4]\n"
-                        "fmla z27.h, z11.h, z2.h[4]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "fmla z20.h, z12.h, z1.h[5]\n"
-                        "fmla z24.h, z12.h, z2.h[5]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "fmla z21.h, z13.h, z1.h[5]\n"
-                        "fmla z25.h, z13.h, z2.h[5]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "fmla z22.h, z14.h, z1.h[5]\n"
-                        "fmla z26.h, z14.h, z2.h[5]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "fmla z23.h, z15.h, z1.h[5]\n"
-                        "fmla z27.h, z15.h, z2.h[5]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.h, z8.h, z1.h[6]\n"
-                        "fmla z24.h, z8.h, z2.h[6]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z21.h, z9.h, z1.h[6]\n"
-                        "fmla z25.h, z9.h, z2.h[6]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "fmla z22.h, z10.h, z1.h[6]\n"
-                        "fmla z26.h, z10.h, z2.h[6]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "fmla z23.h, z11.h, z1.h[6]\n"
-                        "fmla z27.h, z11.h, z2.h[6]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[7]\n"
-                        "fmla z20.h, z12.h, z1.h[7]\n"
-                        "fmla z24.h, z12.h, z2.h[7]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[7]\n"
-                        "fmla z21.h, z13.h, z1.h[7]\n"
-                        "fmla z25.h, z13.h, z2.h[7]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[7]\n"
-                        "fmla z22.h, z14.h, z1.h[7]\n"
-                        "fmla z26.h, z14.h, z2.h[7]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[7]\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        "fmla z23.h, z15.h, z1.h[7]\n"
-                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
-                        "fmla z27.h, z15.h, z2.h[7]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[0]\n"
-                        "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
-                        "fmla z20.h, z8.h, z5.h[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "fmla z24.h, z8.h, z6.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[0]\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        "fmla z21.h, z9.h, z5.h[0]\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        "fmla z25.h, z9.h, z6.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[0]\n"
-                        "fmla z22.h, z10.h, z5.h[0]\n"
-                        "fmla z26.h, z10.h, z6.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[0]\n"
-                        "fmla z23.h, z11.h, z5.h[0]\n"
-                        "fmla z27.h, z11.h, z6.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[1]\n"
-                        "fmla z20.h, z12.h, z5.h[1]\n"
-                        "fmla z24.h, z12.h, z6.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z4.h[1]\n"
-                        "fmla z21.h, z13.h, z5.h[1]\n"
-                        "fmla z25.h, z13.h, z6.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[1]\n"
-                        "fmla z22.h, z14.h, z5.h[1]\n"
-                        "fmla z26.h, z14.h, z6.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[1]\n"
-                        "fmla z23.h, z15.h, z5.h[1]\n"
-                        "fmla z27.h, z15.h, z6.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.h, z8.h, z5.h[2]\n"
-                        "fmla z24.h, z8.h, z6.h[2]\n"
-                        "fmla z17.h, z9.h, z4.h[2]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z21.h, z9.h, z5.h[2]\n"
-                        "fmla z25.h, z9.h, z6.h[2]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[2]\n"
-                        "fmla z22.h, z10.h, z5.h[2]\n"
-                        "fmla z26.h, z10.h, z6.h[2]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[2]\n"
-                        "fmla z23.h, z11.h, z5.h[2]\n"
-                        "fmla z27.h, z11.h, z6.h[2]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[3]\n"
-                        "fmla z20.h, z12.h, z5.h[3]\n"
-                        "fmla z24.h, z12.h, z6.h[3]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z4.h[3]\n"
-                        "fmla z21.h, z13.h, z5.h[3]\n"
-                        "fmla z25.h, z13.h, z6.h[3]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[3]\n"
-                        "fmla z22.h, z14.h, z5.h[3]\n"
-                        "fmla z26.h, z14.h, z6.h[3]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[3]\n"
-                        "fmla z23.h, z15.h, z5.h[3]\n"
-                        "fmla z27.h, z15.h, z6.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[4]\n"
-                        "fmla z20.h, z8.h, z5.h[4]\n"
-                        "fmla z24.h, z8.h, z6.h[4]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[4]\n"
-                        "fmla z21.h, z9.h, z5.h[4]\n"
-                        "fmla z25.h, z9.h, z6.h[4]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[4]\n"
-                        "fmla z22.h, z10.h, z5.h[4]\n"
-                        "fmla z26.h, z10.h, z6.h[4]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[4]\n"
-                        "fmla z23.h, z11.h, z5.h[4]\n"
-                        "fmla z27.h, z11.h, z6.h[4]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[5]\n"
-                        "fmla z20.h, z12.h, z5.h[5]\n"
-                        "fmla z24.h, z12.h, z6.h[5]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z4.h[5]\n"
-                        "fmla z21.h, z13.h, z5.h[5]\n"
-                        "fmla z25.h, z13.h, z6.h[5]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[5]\n"
-                        "fmla z22.h, z14.h, z5.h[5]\n"
-                        "fmla z26.h, z14.h, z6.h[5]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[5]\n"
-                        "fmla z23.h, z15.h, z5.h[5]\n"
-                        "fmla z27.h, z15.h, z6.h[5]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[6]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "fmla z20.h, z8.h, z5.h[6]\n"
-                        "fmla z24.h, z8.h, z6.h[6]\n"
-                        "fmla z17.h, z9.h, z4.h[6]\n"
-                        "fmla z21.h, z9.h, z5.h[6]\n"
-                        "fmla z25.h, z9.h, z6.h[6]\n"
-                        "fmla z18.h, z10.h, z4.h[6]\n"
-                        "fmla z22.h, z10.h, z5.h[6]\n"
-                        "fmla z26.h, z10.h, z6.h[6]\n"
-                        "fmla z19.h, z11.h, z4.h[6]\n"
-                        "fmla z23.h, z11.h, z5.h[6]\n"
-                        "fmla z27.h, z11.h, z6.h[6]\n"
-                        "fmla z16.h, z12.h, z4.h[7]\n"
-                        "fmla z20.h, z12.h, z5.h[7]\n"
-                        "fmla z24.h, z12.h, z6.h[7]\n"
-                        "fmla z17.h, z13.h, z4.h[7]\n"
-                        "fmla z21.h, z13.h, z5.h[7]\n"
-                        "fmla z25.h, z13.h, z6.h[7]\n"
-                        "fmla z18.h, z14.h, z4.h[7]\n"
-                        "fmla z22.h, z14.h, z5.h[7]\n"
-                        "fmla z26.h, z14.h, z6.h[7]\n"
-                        "fmla z19.h, z15.h, z4.h[7]\n"
-                        "fmla z23.h, z15.h, z5.h[7]\n"
-                        "fmla z27.h, z15.h, z6.h[7]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z20.h, z8.h, z1.h[0]\n"
-                        "fmla z24.h, z8.h, z2.h[0]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "fmla z21.h, z9.h, z1.h[0]\n"
-                        "fmla z25.h, z9.h, z2.h[0]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "fmla z22.h, z10.h, z1.h[0]\n"
-                        "fmla z26.h, z10.h, z2.h[0]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "fmla z23.h, z11.h, z1.h[0]\n"
-                        "fmla z27.h, z11.h, z2.h[0]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z20.h, z12.h, z1.h[1]\n"
-                        "fmla z24.h, z12.h, z2.h[1]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "fmla z21.h, z13.h, z1.h[1]\n"
-                        "fmla z25.h, z13.h, z2.h[1]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "fmla z22.h, z14.h, z1.h[1]\n"
-                        "fmla z26.h, z14.h, z2.h[1]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "fmla z23.h, z15.h, z1.h[1]\n"
-                        "fmla z27.h, z15.h, z2.h[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "fmla z20.h, z8.h, z1.h[2]\n"
-                        "fmla z24.h, z8.h, z2.h[2]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "fmla z21.h, z9.h, z1.h[2]\n"
-                        "fmla z25.h, z9.h, z2.h[2]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "fmla z22.h, z10.h, z1.h[2]\n"
-                        "fmla z26.h, z10.h, z2.h[2]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "fmla z23.h, z11.h, z1.h[2]\n"
-                        "fmla z27.h, z11.h, z2.h[2]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z20.h, z12.h, z1.h[3]\n"
-                        "fmla z24.h, z12.h, z2.h[3]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "fmla z21.h, z13.h, z1.h[3]\n"
-                        "fmla z25.h, z13.h, z2.h[3]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "fmla z22.h, z14.h, z1.h[3]\n"
-                        "fmla z26.h, z14.h, z2.h[3]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "fmla z23.h, z15.h, z1.h[3]\n"
-                        "fmla z27.h, z15.h, z2.h[3]\n"
-                        "b.eq 5f\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z20.h, z8.h, z1.h[4]\n"
-                        "fmla z24.h, z8.h, z2.h[4]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "fmla z21.h, z9.h, z1.h[4]\n"
-                        "fmla z25.h, z9.h, z2.h[4]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "fmla z22.h, z10.h, z1.h[4]\n"
-                        "fmla z26.h, z10.h, z2.h[4]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "fmla z23.h, z11.h, z1.h[4]\n"
-                        "fmla z27.h, z11.h, z2.h[4]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z20.h, z12.h, z1.h[5]\n"
-                        "fmla z24.h, z12.h, z2.h[5]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "fmla z21.h, z13.h, z1.h[5]\n"
-                        "fmla z25.h, z13.h, z2.h[5]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "fmla z22.h, z14.h, z1.h[5]\n"
-                        "fmla z26.h, z14.h, z2.h[5]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "fmla z23.h, z15.h, z1.h[5]\n"
-                        "fmla z27.h, z15.h, z2.h[5]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "fmla z20.h, z8.h, z1.h[6]\n"
-                        "fmla z24.h, z8.h, z2.h[6]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "fmla z21.h, z9.h, z1.h[6]\n"
-                        "fmla z25.h, z9.h, z2.h[6]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "fmla z22.h, z10.h, z1.h[6]\n"
-                        "fmla z26.h, z10.h, z2.h[6]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "fmla z23.h, z11.h, z1.h[6]\n"
-                        "fmla z27.h, z11.h, z2.h[6]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z20.h, z8.h, z1.h[0]\n"
-                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
-                        "fmla z24.h, z8.h, z2.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
-                        "fmla z21.h, z9.h, z1.h[0]\n"
-                        "ld1rqh z6.h, p6/z, [a_ptr2]\n"
-                        "fmla z25.h, z9.h, z2.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "fmla z22.h, z10.h, z1.h[0]\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        "fmla z26.h, z10.h, z2.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        "fmla z23.h, z11.h, z1.h[0]\n"
-                        "fmla z27.h, z11.h, z2.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "fmla z20.h, z12.h, z1.h[1]\n"
-                        "fmla z24.h, z12.h, z2.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "fmla z21.h, z13.h, z1.h[1]\n"
-                        "fmla z25.h, z13.h, z2.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "fmla z22.h, z14.h, z1.h[1]\n"
-                        "fmla z26.h, z14.h, z2.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "fmla z23.h, z15.h, z1.h[1]\n"
-                        "fmla z27.h, z15.h, z2.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.h, z8.h, z1.h[2]\n"
-                        "fmla z24.h, z8.h, z2.h[2]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z21.h, z9.h, z1.h[2]\n"
-                        "fmla z25.h, z9.h, z2.h[2]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "fmla z22.h, z10.h, z1.h[2]\n"
-                        "fmla z26.h, z10.h, z2.h[2]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "fmla z23.h, z11.h, z1.h[2]\n"
-                        "fmla z27.h, z11.h, z2.h[2]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "fmla z20.h, z12.h, z1.h[3]\n"
-                        "fmla z24.h, z12.h, z2.h[3]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "fmla z21.h, z13.h, z1.h[3]\n"
-                        "fmla z25.h, z13.h, z2.h[3]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "fmla z22.h, z14.h, z1.h[3]\n"
-                        "fmla z26.h, z14.h, z2.h[3]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "fmla z23.h, z15.h, z1.h[3]\n"
-                        "fmla z27.h, z15.h, z2.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "fmla z20.h, z8.h, z1.h[4]\n"
-                        "fmla z24.h, z8.h, z2.h[4]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "fmla z21.h, z9.h, z1.h[4]\n"
-                        "fmla z25.h, z9.h, z2.h[4]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "fmla z22.h, z10.h, z1.h[4]\n"
-                        "fmla z26.h, z10.h, z2.h[4]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "fmla z23.h, z11.h, z1.h[4]\n"
-                        "fmla z27.h, z11.h, z2.h[4]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "fmla z20.h, z12.h, z1.h[5]\n"
-                        "fmla z24.h, z12.h, z2.h[5]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "fmla z21.h, z13.h, z1.h[5]\n"
-                        "fmla z25.h, z13.h, z2.h[5]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "fmla z22.h, z14.h, z1.h[5]\n"
-                        "fmla z26.h, z14.h, z2.h[5]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "fmla z23.h, z15.h, z1.h[5]\n"
-                        "fmla z27.h, z15.h, z2.h[5]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "fmla z20.h, z8.h, z1.h[6]\n"
-                        "fmla z24.h, z8.h, z2.h[6]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "fmla z21.h, z9.h, z1.h[6]\n"
-                        "fmla z25.h, z9.h, z2.h[6]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "fmla z22.h, z10.h, z1.h[6]\n"
-                        "fmla z26.h, z10.h, z2.h[6]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "fmla z23.h, z11.h, z1.h[6]\n"
-                        "fmla z27.h, z11.h, z2.h[6]\n"
-                        "fmla z16.h, z12.h, z0.h[7]\n"
-                        "fmla z20.h, z12.h, z1.h[7]\n"
-                        "fmla z24.h, z12.h, z2.h[7]\n"
-                        "fmla z17.h, z13.h, z0.h[7]\n"
-                        "fmla z21.h, z13.h, z1.h[7]\n"
-                        "fmla z25.h, z13.h, z2.h[7]\n"
-                        "fmla z18.h, z14.h, z0.h[7]\n"
-                        "fmla z22.h, z14.h, z1.h[7]\n"
-                        "fmla z26.h, z14.h, z2.h[7]\n"
-                        "fmla z19.h, z15.h, z0.h[7]\n"
-                        "fmla z23.h, z15.h, z1.h[7]\n"
-                        "fmla z27.h, z15.h, z2.h[7]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z20.h, z8.h, z5.h[0]\n"
-                        "fmla z24.h, z8.h, z6.h[0]\n"
-                        "fmla z17.h, z9.h, z4.h[0]\n"
-                        "fmla z21.h, z9.h, z5.h[0]\n"
-                        "fmla z25.h, z9.h, z6.h[0]\n"
-                        "fmla z18.h, z10.h, z4.h[0]\n"
-                        "fmla z22.h, z10.h, z5.h[0]\n"
-                        "fmla z26.h, z10.h, z6.h[0]\n"
-                        "fmla z19.h, z11.h, z4.h[0]\n"
-                        "fmla z23.h, z11.h, z5.h[0]\n"
-                        "fmla z27.h, z11.h, z6.h[0]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z20.h, z12.h, z5.h[1]\n"
-                        "fmla z24.h, z12.h, z6.h[1]\n"
-                        "fmla z17.h, z13.h, z4.h[1]\n"
-                        "fmla z21.h, z13.h, z5.h[1]\n"
-                        "fmla z25.h, z13.h, z6.h[1]\n"
-                        "fmla z18.h, z14.h, z4.h[1]\n"
-                        "fmla z22.h, z14.h, z5.h[1]\n"
-                        "fmla z26.h, z14.h, z6.h[1]\n"
-                        "fmla z19.h, z15.h, z4.h[1]\n"
-                        "fmla z23.h, z15.h, z5.h[1]\n"
-                        "fmla z27.h, z15.h, z6.h[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[2]\n"
-                        "fmla z20.h, z8.h, z5.h[2]\n"
-                        "fmla z24.h, z8.h, z6.h[2]\n"
-                        "fmla z17.h, z9.h, z4.h[2]\n"
-                        "fmla z21.h, z9.h, z5.h[2]\n"
-                        "fmla z25.h, z9.h, z6.h[2]\n"
-                        "fmla z18.h, z10.h, z4.h[2]\n"
-                        "fmla z22.h, z10.h, z5.h[2]\n"
-                        "fmla z26.h, z10.h, z6.h[2]\n"
-                        "fmla z19.h, z11.h, z4.h[2]\n"
-                        "fmla z23.h, z11.h, z5.h[2]\n"
-                        "fmla z27.h, z11.h, z6.h[2]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z20.h, z12.h, z5.h[3]\n"
-                        "fmla z24.h, z12.h, z6.h[3]\n"
-                        "fmla z17.h, z13.h, z4.h[3]\n"
-                        "fmla z21.h, z13.h, z5.h[3]\n"
-                        "fmla z25.h, z13.h, z6.h[3]\n"
-                        "fmla z18.h, z14.h, z4.h[3]\n"
-                        "fmla z22.h, z14.h, z5.h[3]\n"
-                        "fmla z26.h, z14.h, z6.h[3]\n"
-                        "fmla z19.h, z15.h, z4.h[3]\n"
-                        "fmla z23.h, z15.h, z5.h[3]\n"
-                        "fmla z27.h, z15.h, z6.h[3]\n"
-                        "b.eq 5f\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[4]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z20.h, z8.h, z5.h[4]\n"
-                        "fmla z24.h, z8.h, z6.h[4]\n"
-                        "fmla z17.h, z9.h, z4.h[4]\n"
-                        "fmla z21.h, z9.h, z5.h[4]\n"
-                        "fmla z25.h, z9.h, z6.h[4]\n"
-                        "fmla z18.h, z10.h, z4.h[4]\n"
-                        "fmla z22.h, z10.h, z5.h[4]\n"
-                        "fmla z26.h, z10.h, z6.h[4]\n"
-                        "fmla z19.h, z11.h, z4.h[4]\n"
-                        "fmla z23.h, z11.h, z5.h[4]\n"
-                        "fmla z27.h, z11.h, z6.h[4]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[5]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z20.h, z12.h, z5.h[5]\n"
-                        "fmla z24.h, z12.h, z6.h[5]\n"
-                        "fmla z17.h, z13.h, z4.h[5]\n"
-                        "fmla z21.h, z13.h, z5.h[5]\n"
-                        "fmla z25.h, z13.h, z6.h[5]\n"
-                        "fmla z18.h, z14.h, z4.h[5]\n"
-                        "fmla z22.h, z14.h, z5.h[5]\n"
-                        "fmla z26.h, z14.h, z6.h[5]\n"
-                        "fmla z19.h, z15.h, z4.h[5]\n"
-                        "fmla z23.h, z15.h, z5.h[5]\n"
-                        "fmla z27.h, z15.h, z6.h[5]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[6]\n"
-                        "fmla z20.h, z8.h, z5.h[6]\n"
-                        "fmla z24.h, z8.h, z6.h[6]\n"
-                        "fmla z17.h, z9.h, z4.h[6]\n"
-                        "fmla z21.h, z9.h, z5.h[6]\n"
-                        "fmla z25.h, z9.h, z6.h[6]\n"
-                        "fmla z18.h, z10.h, z4.h[6]\n"
-                        "fmla z22.h, z10.h, z5.h[6]\n"
-                        "fmla z26.h, z10.h, z6.h[6]\n"
-                        "fmla z19.h, z11.h, z4.h[6]\n"
-                        "fmla z23.h, z11.h, z5.h[6]\n"
-                        "fmla z27.h, z11.h, z6.h[6]\n"
-                        "5:\n"
-                        "ld1rh z14.h, p7/z, [%[minptr]]\n"
-                        "ld1rh z15.h, p7/z, [%[maxptr]]\n"
-                        "fmax z16.h, p7/m, z16.h, z14.h\n"
-                        "fmax z17.h, p7/m, z17.h, z14.h\n"
-                        "fmax z18.h, p7/m, z18.h, z14.h\n"
-                        "fmax z19.h, p7/m, z19.h, z14.h\n"
-                        "fmin z16.h, p7/m, z16.h, z15.h\n"
-                        "fmin z17.h, p7/m, z17.h, z15.h\n"
-                        "fmin z18.h, p7/m, z18.h, z15.h\n"
-                        "fmin z19.h, p7/m, z19.h, z15.h\n"
-                        "st1h z16.h, p0, [%[c_ptr0]]\n"
-                        "fmax z20.h, p7/m, z20.h, z14.h\n"
-                        "fmax z21.h, p7/m, z21.h, z14.h\n"
-                        "fmax z22.h, p7/m, z22.h, z14.h\n"
-                        "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmax z23.h, p7/m, z23.h, z14.h\n"
-                        "fmin z20.h, p7/m, z20.h, z15.h\n"
-                        "fmin z21.h, p7/m, z21.h, z15.h\n"
-                        "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "fmin z22.h, p7/m, z22.h, z15.h\n"
-                        "fmin z23.h, p7/m, z23.h, z15.h\n"
-                        "fmax z24.h, p7/m, z24.h, z14.h\n"
-                        "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "fmax z25.h, p7/m, z25.h, z14.h\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "fmax z26.h, p7/m, z26.h, z14.h\n"
-                        "st1h z20.h, p0, [c_ptr1]\n"
-                        "fmin z24.h, p7/m, z24.h, z15.h\n"
-                        "fmin z25.h, p7/m, z25.h, z15.h\n"
-                        "fmax z27.h, p7/m, z27.h, z14.h\n"
-                        "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
-                        "fmin z26.h, p7/m, z26.h, z15.h\n"
-                        "fmin z27.h, p7/m, z27.h, z15.h\n"
-                        "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
-                        "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
-                        "st1h z24.h, p0, [c_ptr2]\n"
-                        "st1h z25.h, p1, [c_ptr2, #1, MUL VL]\n"
-                        "st1h z26.h, p2, [c_ptr2, #2, MUL VL]\n"
-                        "st1h z27.h, p3, [c_ptr2, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
-                    );
-                    break;
-                default:
-                case 4:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.h, %[temp], %[width]\n"
-                        "inch %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "whilelt p1.h, %[temp], %[width]\n"
-                        "inch %[temp], all, mul #1\n"
-                        "whilelt p2.h, %[temp], %[width]\n"
-                        "inch %[temp], all, mul #1\n"
-                        "whilelt p3.h, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ld1h z16.h, p0/z, [%[biasptr]]\n"
-                        "ld1h z17.h, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "ld1h z18.h, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "ld1h z19.h, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "mov z20.d, z16.d\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "mov z21.d, z17.d\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "mov z22.d, z18.d\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "mov z23.d, z19.d\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3]\n"
-                        "mov z24.d, z16.d\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "mov z25.d, z17.d\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "mov z26.d, z18.d\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "mov z27.d, z19.d\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z28.d, z16.d\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "mov z29.d, z17.d\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "mov z30.d, z18.d\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "mov z31.d, z19.d\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
-                        "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                        "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1h z20.h, p0/z, [c_ptr1]\n"
-                        "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
-                        "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
-                        "ld1h z24.h, p0/z, [c_ptr2]\n"
-                        "ld1h z25.h, p1/z, [c_ptr2, #1, MUL VL]\n"
-                        "ld1h z26.h, p2/z, [c_ptr2, #2, MUL VL]\n"
-                        "ld1h z27.h, p3/z, [c_ptr2, #3, MUL VL]\n"
-                        "ld1h z28.h, p0/z, [c_ptr3]\n"
-                        "ld1h z29.h, p1/z, [c_ptr3, #1, MUL VL]\n"
-                        "ld1h z30.h, p2/z, [c_ptr3, #2, MUL VL]\n"
-                        "ld1h z31.h, p3/z, [c_ptr3, #3, MUL VL]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z20.h, z8.h, z1.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        "fmla z24.h, z8.h, z2.h[0]\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        "fmla z28.h, z8.h, z3.h[0]\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "ld1rqh z7.h, p7/z, [a_ptr3]\n"
-                        "fmla z21.h, z9.h, z1.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z25.h, z9.h, z2.h[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla z29.h, z9.h, z3.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla z22.h, z10.h, z1.h[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla z26.h, z10.h, z2.h[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "fmla z30.h, z10.h, z3.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        "fmla z23.h, z11.h, z1.h[0]\n"
-                        "fmla z27.h, z11.h, z2.h[0]\n"
-                        "fmla z31.h, z11.h, z3.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "fmla z20.h, z12.h, z1.h[1]\n"
-                        "fmla z24.h, z12.h, z2.h[1]\n"
-                        "fmla z28.h, z12.h, z3.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "fmla z21.h, z13.h, z1.h[1]\n"
-                        "fmla z25.h, z13.h, z2.h[1]\n"
-                        "fmla z29.h, z13.h, z3.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "fmla z22.h, z14.h, z1.h[1]\n"
-                        "fmla z26.h, z14.h, z2.h[1]\n"
-                        "fmla z30.h, z14.h, z3.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "fmla z23.h, z15.h, z1.h[1]\n"
-                        "fmla z27.h, z15.h, z2.h[1]\n"
-                        "fmla z31.h, z15.h, z3.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.h, z8.h, z1.h[2]\n"
-                        "fmla z24.h, z8.h, z2.h[2]\n"
-                        "fmla z28.h, z8.h, z3.h[2]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "fmla z21.h, z9.h, z1.h[2]\n"
-                        "fmla z25.h, z9.h, z2.h[2]\n"
-                        "fmla z29.h, z9.h, z3.h[2]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "fmla z22.h, z10.h, z1.h[2]\n"
-                        "fmla z26.h, z10.h, z2.h[2]\n"
-                        "fmla z30.h, z10.h, z3.h[2]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "fmla z23.h, z11.h, z1.h[2]\n"
-                        "fmla z27.h, z11.h, z2.h[2]\n"
-                        "fmla z31.h, z11.h, z3.h[2]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "fmla z20.h, z12.h, z1.h[3]\n"
-                        "fmla z24.h, z12.h, z2.h[3]\n"
-                        "fmla z28.h, z12.h, z3.h[3]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "fmla z21.h, z13.h, z1.h[3]\n"
-                        "fmla z25.h, z13.h, z2.h[3]\n"
-                        "fmla z29.h, z13.h, z3.h[3]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "fmla z22.h, z14.h, z1.h[3]\n"
-                        "fmla z26.h, z14.h, z2.h[3]\n"
-                        "fmla z30.h, z14.h, z3.h[3]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "fmla z23.h, z15.h, z1.h[3]\n"
-                        "fmla z27.h, z15.h, z2.h[3]\n"
-                        "fmla z31.h, z15.h, z3.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "fmla z20.h, z8.h, z1.h[4]\n"
-                        "fmla z24.h, z8.h, z2.h[4]\n"
-                        "fmla z28.h, z8.h, z3.h[4]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "fmla z21.h, z9.h, z1.h[4]\n"
-                        "fmla z25.h, z9.h, z2.h[4]\n"
-                        "fmla z29.h, z9.h, z3.h[4]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "fmla z22.h, z10.h, z1.h[4]\n"
-                        "fmla z26.h, z10.h, z2.h[4]\n"
-                        "fmla z30.h, z10.h, z3.h[4]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "fmla z23.h, z11.h, z1.h[4]\n"
-                        "fmla z27.h, z11.h, z2.h[4]\n"
-                        "fmla z31.h, z11.h, z3.h[4]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "fmla z20.h, z12.h, z1.h[5]\n"
-                        "fmla z24.h, z12.h, z2.h[5]\n"
-                        "fmla z28.h, z12.h, z3.h[5]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "fmla z21.h, z13.h, z1.h[5]\n"
-                        "fmla z25.h, z13.h, z2.h[5]\n"
-                        "fmla z29.h, z13.h, z3.h[5]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "fmla z22.h, z14.h, z1.h[5]\n"
-                        "fmla z26.h, z14.h, z2.h[5]\n"
-                        "fmla z30.h, z14.h, z3.h[5]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "fmla z23.h, z15.h, z1.h[5]\n"
-                        "fmla z27.h, z15.h, z2.h[5]\n"
-                        "fmla z31.h, z15.h, z3.h[5]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.h, z8.h, z1.h[6]\n"
-                        "fmla z24.h, z8.h, z2.h[6]\n"
-                        "fmla z28.h, z8.h, z3.h[6]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "fmla z21.h, z9.h, z1.h[6]\n"
-                        "fmla z25.h, z9.h, z2.h[6]\n"
-                        "fmla z29.h, z9.h, z3.h[6]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "fmla z22.h, z10.h, z1.h[6]\n"
-                        "fmla z26.h, z10.h, z2.h[6]\n"
-                        "fmla z30.h, z10.h, z3.h[6]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "fmla z23.h, z11.h, z1.h[6]\n"
-                        "fmla z27.h, z11.h, z2.h[6]\n"
-                        "fmla z31.h, z11.h, z3.h[6]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[7]\n"
-                        "fmla z20.h, z12.h, z1.h[7]\n"
-                        "fmla z24.h, z12.h, z2.h[7]\n"
-                        "fmla z28.h, z12.h, z3.h[7]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[7]\n"
-                        "fmla z21.h, z13.h, z1.h[7]\n"
-                        "fmla z25.h, z13.h, z2.h[7]\n"
-                        "fmla z29.h, z13.h, z3.h[7]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[7]\n"
-                        "fmla z22.h, z14.h, z1.h[7]\n"
-                        "fmla z26.h, z14.h, z2.h[7]\n"
-                        "fmla z30.h, z14.h, z3.h[7]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[7]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "fmla z23.h, z15.h, z1.h[7]\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
-                        "fmla z27.h, z15.h, z2.h[7]\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
-                        "fmla z31.h, z15.h, z3.h[7]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[0]\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
-                        "fmla z20.h, z8.h, z5.h[0]\n"
-                        "fmla z24.h, z8.h, z6.h[0]\n"
-                        "fmla z28.h, z8.h, z7.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[0]\n"
-                        "fmla z21.h, z9.h, z5.h[0]\n"
-                        "fmla z25.h, z9.h, z6.h[0]\n"
-                        "fmla z29.h, z9.h, z7.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[0]\n"
-                        "fmla z22.h, z10.h, z5.h[0]\n"
-                        "fmla z26.h, z10.h, z6.h[0]\n"
-                        "fmla z30.h, z10.h, z7.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[0]\n"
-                        "fmla z23.h, z11.h, z5.h[0]\n"
-                        "fmla z27.h, z11.h, z6.h[0]\n"
-                        "fmla z31.h, z11.h, z7.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[1]\n"
-                        "fmla z20.h, z12.h, z5.h[1]\n"
-                        "fmla z24.h, z12.h, z6.h[1]\n"
-                        "fmla z28.h, z12.h, z7.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z4.h[1]\n"
-                        "fmla z21.h, z13.h, z5.h[1]\n"
-                        "fmla z25.h, z13.h, z6.h[1]\n"
-                        "fmla z29.h, z13.h, z7.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[1]\n"
-                        "fmla z22.h, z14.h, z5.h[1]\n"
-                        "fmla z26.h, z14.h, z6.h[1]\n"
-                        "fmla z30.h, z14.h, z7.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[1]\n"
-                        "fmla z23.h, z15.h, z5.h[1]\n"
-                        "fmla z27.h, z15.h, z6.h[1]\n"
-                        "fmla z31.h, z15.h, z7.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.h, z8.h, z5.h[2]\n"
-                        "fmla z24.h, z8.h, z6.h[2]\n"
-                        "fmla z28.h, z8.h, z7.h[2]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z17.h, z9.h, z4.h[2]\n"
-                        "fmla z21.h, z9.h, z5.h[2]\n"
-                        "fmla z25.h, z9.h, z6.h[2]\n"
-                        "fmla z29.h, z9.h, z7.h[2]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[2]\n"
-                        "fmla z22.h, z10.h, z5.h[2]\n"
-                        "fmla z26.h, z10.h, z6.h[2]\n"
-                        "fmla z30.h, z10.h, z7.h[2]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[2]\n"
-                        "fmla z23.h, z11.h, z5.h[2]\n"
-                        "fmla z27.h, z11.h, z6.h[2]\n"
-                        "fmla z31.h, z11.h, z7.h[2]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[3]\n"
-                        "fmla z20.h, z12.h, z5.h[3]\n"
-                        "fmla z24.h, z12.h, z6.h[3]\n"
-                        "fmla z28.h, z12.h, z7.h[3]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z4.h[3]\n"
-                        "fmla z21.h, z13.h, z5.h[3]\n"
-                        "fmla z25.h, z13.h, z6.h[3]\n"
-                        "fmla z29.h, z13.h, z7.h[3]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[3]\n"
-                        "fmla z22.h, z14.h, z5.h[3]\n"
-                        "fmla z26.h, z14.h, z6.h[3]\n"
-                        "fmla z30.h, z14.h, z7.h[3]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[3]\n"
-                        "fmla z23.h, z15.h, z5.h[3]\n"
-                        "fmla z27.h, z15.h, z6.h[3]\n"
-                        "fmla z31.h, z15.h, z7.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[4]\n"
-                        "fmla z20.h, z8.h, z5.h[4]\n"
-                        "fmla z24.h, z8.h, z6.h[4]\n"
-                        "fmla z28.h, z8.h, z7.h[4]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[4]\n"
-                        "fmla z21.h, z9.h, z5.h[4]\n"
-                        "fmla z25.h, z9.h, z6.h[4]\n"
-                        "fmla z29.h, z9.h, z7.h[4]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[4]\n"
-                        "fmla z22.h, z10.h, z5.h[4]\n"
-                        "fmla z26.h, z10.h, z6.h[4]\n"
-                        "fmla z30.h, z10.h, z7.h[4]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[4]\n"
-                        "fmla z23.h, z11.h, z5.h[4]\n"
-                        "fmla z27.h, z11.h, z6.h[4]\n"
-                        "fmla z31.h, z11.h, z7.h[4]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[5]\n"
-                        "fmla z20.h, z12.h, z5.h[5]\n"
-                        "fmla z24.h, z12.h, z6.h[5]\n"
-                        "fmla z28.h, z12.h, z7.h[5]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z4.h[5]\n"
-                        "fmla z21.h, z13.h, z5.h[5]\n"
-                        "fmla z25.h, z13.h, z6.h[5]\n"
-                        "fmla z29.h, z13.h, z7.h[5]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[5]\n"
-                        "fmla z22.h, z14.h, z5.h[5]\n"
-                        "fmla z26.h, z14.h, z6.h[5]\n"
-                        "fmla z30.h, z14.h, z7.h[5]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[5]\n"
-                        "fmla z23.h, z15.h, z5.h[5]\n"
-                        "fmla z27.h, z15.h, z6.h[5]\n"
-                        "fmla z31.h, z15.h, z7.h[5]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[6]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.h, z8.h, z5.h[6]\n"
-                        "fmla z24.h, z8.h, z6.h[6]\n"
-                        "fmla z28.h, z8.h, z7.h[6]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z17.h, z9.h, z4.h[6]\n"
-                        "fmla z21.h, z9.h, z5.h[6]\n"
-                        "fmla z25.h, z9.h, z6.h[6]\n"
-                        "fmla z29.h, z9.h, z7.h[6]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[6]\n"
-                        "fmla z22.h, z10.h, z5.h[6]\n"
-                        "fmla z26.h, z10.h, z6.h[6]\n"
-                        "fmla z30.h, z10.h, z7.h[6]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[6]\n"
-                        "fmla z23.h, z11.h, z5.h[6]\n"
-                        "fmla z27.h, z11.h, z6.h[6]\n"
-                        "fmla z31.h, z11.h, z7.h[6]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[7]\n"
-                        "fmla z20.h, z12.h, z5.h[7]\n"
-                        "fmla z24.h, z12.h, z6.h[7]\n"
-                        "fmla z28.h, z12.h, z7.h[7]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z4.h[7]\n"
-                        "fmla z21.h, z13.h, z5.h[7]\n"
-                        "fmla z25.h, z13.h, z6.h[7]\n"
-                        "fmla z29.h, z13.h, z7.h[7]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[7]\n"
-                        "fmla z22.h, z14.h, z5.h[7]\n"
-                        "fmla z26.h, z14.h, z6.h[7]\n"
-                        "fmla z30.h, z14.h, z7.h[7]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[7]\n"
-                        "fmla z23.h, z15.h, z5.h[7]\n"
-                        "fmla z27.h, z15.h, z6.h[7]\n"
-                        "fmla z31.h, z15.h, z7.h[7]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z20.h, z8.h, z1.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        "fmla z24.h, z8.h, z2.h[0]\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        "fmla z28.h, z8.h, z3.h[0]\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "ld1rqh z7.h, p7/z, [a_ptr3]\n"
-                        "fmla z21.h, z9.h, z1.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z25.h, z9.h, z2.h[0]\n"
-                        "fmla z29.h, z9.h, z3.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "fmla z22.h, z10.h, z1.h[0]\n"
-                        "fmla z26.h, z10.h, z2.h[0]\n"
-                        "fmla z30.h, z10.h, z3.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "fmla z23.h, z11.h, z1.h[0]\n"
-                        "fmla z27.h, z11.h, z2.h[0]\n"
-                        "fmla z31.h, z11.h, z3.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "fmla z20.h, z12.h, z1.h[1]\n"
-                        "fmla z24.h, z12.h, z2.h[1]\n"
-                        "fmla z28.h, z12.h, z3.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "fmla z21.h, z13.h, z1.h[1]\n"
-                        "fmla z25.h, z13.h, z2.h[1]\n"
-                        "fmla z29.h, z13.h, z3.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "fmla z22.h, z14.h, z1.h[1]\n"
-                        "fmla z26.h, z14.h, z2.h[1]\n"
-                        "fmla z30.h, z14.h, z3.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "fmla z23.h, z15.h, z1.h[1]\n"
-                        "fmla z27.h, z15.h, z2.h[1]\n"
-                        "fmla z31.h, z15.h, z3.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.h, z8.h, z1.h[2]\n"
-                        "fmla z24.h, z8.h, z2.h[2]\n"
-                        "fmla z28.h, z8.h, z3.h[2]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "fmla z21.h, z9.h, z1.h[2]\n"
-                        "fmla z25.h, z9.h, z2.h[2]\n"
-                        "fmla z29.h, z9.h, z3.h[2]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "fmla z22.h, z10.h, z1.h[2]\n"
-                        "fmla z26.h, z10.h, z2.h[2]\n"
-                        "fmla z30.h, z10.h, z3.h[2]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "fmla z23.h, z11.h, z1.h[2]\n"
-                        "fmla z27.h, z11.h, z2.h[2]\n"
-                        "fmla z31.h, z11.h, z3.h[2]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "fmla z20.h, z12.h, z1.h[3]\n"
-                        "fmla z24.h, z12.h, z2.h[3]\n"
-                        "fmla z28.h, z12.h, z3.h[3]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "fmla z21.h, z13.h, z1.h[3]\n"
-                        "fmla z25.h, z13.h, z2.h[3]\n"
-                        "fmla z29.h, z13.h, z3.h[3]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "fmla z22.h, z14.h, z1.h[3]\n"
-                        "fmla z26.h, z14.h, z2.h[3]\n"
-                        "fmla z30.h, z14.h, z3.h[3]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "fmla z23.h, z15.h, z1.h[3]\n"
-                        "fmla z27.h, z15.h, z2.h[3]\n"
-                        "fmla z31.h, z15.h, z3.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "fmla z20.h, z8.h, z1.h[4]\n"
-                        "fmla z24.h, z8.h, z2.h[4]\n"
-                        "fmla z28.h, z8.h, z3.h[4]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "fmla z21.h, z9.h, z1.h[4]\n"
-                        "fmla z25.h, z9.h, z2.h[4]\n"
-                        "fmla z29.h, z9.h, z3.h[4]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "fmla z22.h, z10.h, z1.h[4]\n"
-                        "fmla z26.h, z10.h, z2.h[4]\n"
-                        "fmla z30.h, z10.h, z3.h[4]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "fmla z23.h, z11.h, z1.h[4]\n"
-                        "fmla z27.h, z11.h, z2.h[4]\n"
-                        "fmla z31.h, z11.h, z3.h[4]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "fmla z20.h, z12.h, z1.h[5]\n"
-                        "fmla z24.h, z12.h, z2.h[5]\n"
-                        "fmla z28.h, z12.h, z3.h[5]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "fmla z21.h, z13.h, z1.h[5]\n"
-                        "fmla z25.h, z13.h, z2.h[5]\n"
-                        "fmla z29.h, z13.h, z3.h[5]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "fmla z22.h, z14.h, z1.h[5]\n"
-                        "fmla z26.h, z14.h, z2.h[5]\n"
-                        "fmla z30.h, z14.h, z3.h[5]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "fmla z23.h, z15.h, z1.h[5]\n"
-                        "fmla z27.h, z15.h, z2.h[5]\n"
-                        "fmla z31.h, z15.h, z3.h[5]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.h, z8.h, z1.h[6]\n"
-                        "fmla z24.h, z8.h, z2.h[6]\n"
-                        "fmla z28.h, z8.h, z3.h[6]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "fmla z21.h, z9.h, z1.h[6]\n"
-                        "fmla z25.h, z9.h, z2.h[6]\n"
-                        "fmla z29.h, z9.h, z3.h[6]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "fmla z22.h, z10.h, z1.h[6]\n"
-                        "fmla z26.h, z10.h, z2.h[6]\n"
-                        "fmla z30.h, z10.h, z3.h[6]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "fmla z23.h, z11.h, z1.h[6]\n"
-                        "fmla z27.h, z11.h, z2.h[6]\n"
-                        "fmla z31.h, z11.h, z3.h[6]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[7]\n"
-                        "fmla z20.h, z12.h, z1.h[7]\n"
-                        "fmla z24.h, z12.h, z2.h[7]\n"
-                        "fmla z28.h, z12.h, z3.h[7]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[7]\n"
-                        "fmla z21.h, z13.h, z1.h[7]\n"
-                        "fmla z25.h, z13.h, z2.h[7]\n"
-                        "fmla z29.h, z13.h, z3.h[7]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[7]\n"
-                        "fmla z22.h, z14.h, z1.h[7]\n"
-                        "fmla z26.h, z14.h, z2.h[7]\n"
-                        "fmla z30.h, z14.h, z3.h[7]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[7]\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        "fmla z23.h, z15.h, z1.h[7]\n"
-                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
-                        "fmla z27.h, z15.h, z2.h[7]\n"
-                        "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
-                        "fmla z31.h, z15.h, z3.h[7]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[0]\n"
-                        "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
-                        "fmla z20.h, z8.h, z5.h[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "fmla z24.h, z8.h, z6.h[0]\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        "fmla z28.h, z8.h, z7.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[0]\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        "fmla z21.h, z9.h, z5.h[0]\n"
-                        "addvl a_ptr3, a_ptr3, #2\n"
-                        "fmla z25.h, z9.h, z6.h[0]\n"
-                        "fmla z29.h, z9.h, z7.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[0]\n"
-                        "fmla z22.h, z10.h, z5.h[0]\n"
-                        "fmla z26.h, z10.h, z6.h[0]\n"
-                        "fmla z30.h, z10.h, z7.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[0]\n"
-                        "fmla z23.h, z11.h, z5.h[0]\n"
-                        "fmla z27.h, z11.h, z6.h[0]\n"
-                        "fmla z31.h, z11.h, z7.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[1]\n"
-                        "fmla z20.h, z12.h, z5.h[1]\n"
-                        "fmla z24.h, z12.h, z6.h[1]\n"
-                        "fmla z28.h, z12.h, z7.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z4.h[1]\n"
-                        "fmla z21.h, z13.h, z5.h[1]\n"
-                        "fmla z25.h, z13.h, z6.h[1]\n"
-                        "fmla z29.h, z13.h, z7.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[1]\n"
-                        "fmla z22.h, z14.h, z5.h[1]\n"
-                        "fmla z26.h, z14.h, z6.h[1]\n"
-                        "fmla z30.h, z14.h, z7.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[1]\n"
-                        "fmla z23.h, z15.h, z5.h[1]\n"
-                        "fmla z27.h, z15.h, z6.h[1]\n"
-                        "fmla z31.h, z15.h, z7.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.h, z8.h, z5.h[2]\n"
-                        "fmla z24.h, z8.h, z6.h[2]\n"
-                        "fmla z28.h, z8.h, z7.h[2]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z17.h, z9.h, z4.h[2]\n"
-                        "fmla z21.h, z9.h, z5.h[2]\n"
-                        "fmla z25.h, z9.h, z6.h[2]\n"
-                        "fmla z29.h, z9.h, z7.h[2]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[2]\n"
-                        "fmla z22.h, z10.h, z5.h[2]\n"
-                        "fmla z26.h, z10.h, z6.h[2]\n"
-                        "fmla z30.h, z10.h, z7.h[2]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[2]\n"
-                        "fmla z23.h, z11.h, z5.h[2]\n"
-                        "fmla z27.h, z11.h, z6.h[2]\n"
-                        "fmla z31.h, z11.h, z7.h[2]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[3]\n"
-                        "fmla z20.h, z12.h, z5.h[3]\n"
-                        "fmla z24.h, z12.h, z6.h[3]\n"
-                        "fmla z28.h, z12.h, z7.h[3]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z4.h[3]\n"
-                        "fmla z21.h, z13.h, z5.h[3]\n"
-                        "fmla z25.h, z13.h, z6.h[3]\n"
-                        "fmla z29.h, z13.h, z7.h[3]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[3]\n"
-                        "fmla z22.h, z14.h, z5.h[3]\n"
-                        "fmla z26.h, z14.h, z6.h[3]\n"
-                        "fmla z30.h, z14.h, z7.h[3]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[3]\n"
-                        "fmla z23.h, z15.h, z5.h[3]\n"
-                        "fmla z27.h, z15.h, z6.h[3]\n"
-                        "fmla z31.h, z15.h, z7.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[4]\n"
-                        "fmla z20.h, z8.h, z5.h[4]\n"
-                        "fmla z24.h, z8.h, z6.h[4]\n"
-                        "fmla z28.h, z8.h, z7.h[4]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[4]\n"
-                        "fmla z21.h, z9.h, z5.h[4]\n"
-                        "fmla z25.h, z9.h, z6.h[4]\n"
-                        "fmla z29.h, z9.h, z7.h[4]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[4]\n"
-                        "fmla z22.h, z10.h, z5.h[4]\n"
-                        "fmla z26.h, z10.h, z6.h[4]\n"
-                        "fmla z30.h, z10.h, z7.h[4]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[4]\n"
-                        "fmla z23.h, z11.h, z5.h[4]\n"
-                        "fmla z27.h, z11.h, z6.h[4]\n"
-                        "fmla z31.h, z11.h, z7.h[4]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[5]\n"
-                        "fmla z20.h, z12.h, z5.h[5]\n"
-                        "fmla z24.h, z12.h, z6.h[5]\n"
-                        "fmla z28.h, z12.h, z7.h[5]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z4.h[5]\n"
-                        "fmla z21.h, z13.h, z5.h[5]\n"
-                        "fmla z25.h, z13.h, z6.h[5]\n"
-                        "fmla z29.h, z13.h, z7.h[5]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[5]\n"
-                        "fmla z22.h, z14.h, z5.h[5]\n"
-                        "fmla z26.h, z14.h, z6.h[5]\n"
-                        "fmla z30.h, z14.h, z7.h[5]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[5]\n"
-                        "fmla z23.h, z15.h, z5.h[5]\n"
-                        "fmla z27.h, z15.h, z6.h[5]\n"
-                        "fmla z31.h, z15.h, z7.h[5]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[6]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "fmla z20.h, z8.h, z5.h[6]\n"
-                        "fmla z24.h, z8.h, z6.h[6]\n"
-                        "fmla z28.h, z8.h, z7.h[6]\n"
-                        "fmla z17.h, z9.h, z4.h[6]\n"
-                        "fmla z21.h, z9.h, z5.h[6]\n"
-                        "fmla z25.h, z9.h, z6.h[6]\n"
-                        "fmla z29.h, z9.h, z7.h[6]\n"
-                        "fmla z18.h, z10.h, z4.h[6]\n"
-                        "fmla z22.h, z10.h, z5.h[6]\n"
-                        "fmla z26.h, z10.h, z6.h[6]\n"
-                        "fmla z30.h, z10.h, z7.h[6]\n"
-                        "fmla z19.h, z11.h, z4.h[6]\n"
-                        "fmla z23.h, z11.h, z5.h[6]\n"
-                        "fmla z27.h, z11.h, z6.h[6]\n"
-                        "fmla z31.h, z11.h, z7.h[6]\n"
-                        "fmla z16.h, z12.h, z4.h[7]\n"
-                        "fmla z20.h, z12.h, z5.h[7]\n"
-                        "fmla z24.h, z12.h, z6.h[7]\n"
-                        "fmla z28.h, z12.h, z7.h[7]\n"
-                        "fmla z17.h, z13.h, z4.h[7]\n"
-                        "fmla z21.h, z13.h, z5.h[7]\n"
-                        "fmla z25.h, z13.h, z6.h[7]\n"
-                        "fmla z29.h, z13.h, z7.h[7]\n"
-                        "fmla z18.h, z14.h, z4.h[7]\n"
-                        "fmla z22.h, z14.h, z5.h[7]\n"
-                        "fmla z26.h, z14.h, z6.h[7]\n"
-                        "fmla z30.h, z14.h, z7.h[7]\n"
-                        "fmla z19.h, z15.h, z4.h[7]\n"
-                        "fmla z23.h, z15.h, z5.h[7]\n"
-                        "fmla z27.h, z15.h, z6.h[7]\n"
-                        "fmla z31.h, z15.h, z7.h[7]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z20.h, z8.h, z1.h[0]\n"
-                        "fmla z24.h, z8.h, z2.h[0]\n"
-                        "fmla z28.h, z8.h, z3.h[0]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "fmla z21.h, z9.h, z1.h[0]\n"
-                        "fmla z25.h, z9.h, z2.h[0]\n"
-                        "fmla z29.h, z9.h, z3.h[0]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "fmla z22.h, z10.h, z1.h[0]\n"
-                        "fmla z26.h, z10.h, z2.h[0]\n"
-                        "fmla z30.h, z10.h, z3.h[0]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "fmla z23.h, z11.h, z1.h[0]\n"
-                        "fmla z27.h, z11.h, z2.h[0]\n"
-                        "fmla z31.h, z11.h, z3.h[0]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z20.h, z12.h, z1.h[1]\n"
-                        "fmla z24.h, z12.h, z2.h[1]\n"
-                        "fmla z28.h, z12.h, z3.h[1]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "fmla z21.h, z13.h, z1.h[1]\n"
-                        "fmla z25.h, z13.h, z2.h[1]\n"
-                        "fmla z29.h, z13.h, z3.h[1]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "fmla z22.h, z14.h, z1.h[1]\n"
-                        "fmla z26.h, z14.h, z2.h[1]\n"
-                        "fmla z30.h, z14.h, z3.h[1]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "fmla z23.h, z15.h, z1.h[1]\n"
-                        "fmla z27.h, z15.h, z2.h[1]\n"
-                        "fmla z31.h, z15.h, z3.h[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "fmla z20.h, z8.h, z1.h[2]\n"
-                        "fmla z24.h, z8.h, z2.h[2]\n"
-                        "fmla z28.h, z8.h, z3.h[2]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "fmla z21.h, z9.h, z1.h[2]\n"
-                        "fmla z25.h, z9.h, z2.h[2]\n"
-                        "fmla z29.h, z9.h, z3.h[2]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "fmla z22.h, z10.h, z1.h[2]\n"
-                        "fmla z26.h, z10.h, z2.h[2]\n"
-                        "fmla z30.h, z10.h, z3.h[2]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "fmla z23.h, z11.h, z1.h[2]\n"
-                        "fmla z27.h, z11.h, z2.h[2]\n"
-                        "fmla z31.h, z11.h, z3.h[2]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z20.h, z12.h, z1.h[3]\n"
-                        "fmla z24.h, z12.h, z2.h[3]\n"
-                        "fmla z28.h, z12.h, z3.h[3]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "fmla z21.h, z13.h, z1.h[3]\n"
-                        "fmla z25.h, z13.h, z2.h[3]\n"
-                        "fmla z29.h, z13.h, z3.h[3]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "fmla z22.h, z14.h, z1.h[3]\n"
-                        "fmla z26.h, z14.h, z2.h[3]\n"
-                        "fmla z30.h, z14.h, z3.h[3]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "fmla z23.h, z15.h, z1.h[3]\n"
-                        "fmla z27.h, z15.h, z2.h[3]\n"
-                        "fmla z31.h, z15.h, z3.h[3]\n"
-                        "b.eq 5f\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z20.h, z8.h, z1.h[4]\n"
-                        "fmla z24.h, z8.h, z2.h[4]\n"
-                        "fmla z28.h, z8.h, z3.h[4]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "fmla z21.h, z9.h, z1.h[4]\n"
-                        "fmla z25.h, z9.h, z2.h[4]\n"
-                        "fmla z29.h, z9.h, z3.h[4]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "fmla z22.h, z10.h, z1.h[4]\n"
-                        "fmla z26.h, z10.h, z2.h[4]\n"
-                        "fmla z30.h, z10.h, z3.h[4]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "fmla z23.h, z11.h, z1.h[4]\n"
-                        "fmla z27.h, z11.h, z2.h[4]\n"
-                        "fmla z31.h, z11.h, z3.h[4]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z20.h, z12.h, z1.h[5]\n"
-                        "fmla z24.h, z12.h, z2.h[5]\n"
-                        "fmla z28.h, z12.h, z3.h[5]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "fmla z21.h, z13.h, z1.h[5]\n"
-                        "fmla z25.h, z13.h, z2.h[5]\n"
-                        "fmla z29.h, z13.h, z3.h[5]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "fmla z22.h, z14.h, z1.h[5]\n"
-                        "fmla z26.h, z14.h, z2.h[5]\n"
-                        "fmla z30.h, z14.h, z3.h[5]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "fmla z23.h, z15.h, z1.h[5]\n"
-                        "fmla z27.h, z15.h, z2.h[5]\n"
-                        "fmla z31.h, z15.h, z3.h[5]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "fmla z20.h, z8.h, z1.h[6]\n"
-                        "fmla z24.h, z8.h, z2.h[6]\n"
-                        "fmla z28.h, z8.h, z3.h[6]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "fmla z21.h, z9.h, z1.h[6]\n"
-                        "fmla z25.h, z9.h, z2.h[6]\n"
-                        "fmla z29.h, z9.h, z3.h[6]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "fmla z22.h, z10.h, z1.h[6]\n"
-                        "fmla z26.h, z10.h, z2.h[6]\n"
-                        "fmla z30.h, z10.h, z3.h[6]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "fmla z23.h, z11.h, z1.h[6]\n"
-                        "fmla z27.h, z11.h, z2.h[6]\n"
-                        "fmla z31.h, z11.h, z3.h[6]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z20.h, z8.h, z1.h[0]\n"
-                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
-                        "fmla z24.h, z8.h, z2.h[0]\n"
-                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
-                        "fmla z28.h, z8.h, z3.h[0]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "ld1rqh z6.h, p6/z, [a_ptr2]\n"
-                        "fmla z21.h, z9.h, z1.h[0]\n"
-                        "ld1rqh z7.h, p6/z, [a_ptr3]\n"
-                        "fmla z25.h, z9.h, z2.h[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "fmla z29.h, z9.h, z3.h[0]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        "fmla z22.h, z10.h, z1.h[0]\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        "fmla z26.h, z10.h, z2.h[0]\n"
-                        "addvl a_ptr3, a_ptr3, #1\n"
-                        "fmla z30.h, z10.h, z3.h[0]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "fmla z23.h, z11.h, z1.h[0]\n"
-                        "fmla z27.h, z11.h, z2.h[0]\n"
-                        "fmla z31.h, z11.h, z3.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "fmla z20.h, z12.h, z1.h[1]\n"
-                        "fmla z24.h, z12.h, z2.h[1]\n"
-                        "fmla z28.h, z12.h, z3.h[1]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "fmla z21.h, z13.h, z1.h[1]\n"
-                        "fmla z25.h, z13.h, z2.h[1]\n"
-                        "fmla z29.h, z13.h, z3.h[1]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "fmla z22.h, z14.h, z1.h[1]\n"
-                        "fmla z26.h, z14.h, z2.h[1]\n"
-                        "fmla z30.h, z14.h, z3.h[1]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "fmla z23.h, z15.h, z1.h[1]\n"
-                        "fmla z27.h, z15.h, z2.h[1]\n"
-                        "fmla z31.h, z15.h, z3.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.h, z8.h, z1.h[2]\n"
-                        "fmla z24.h, z8.h, z2.h[2]\n"
-                        "fmla z28.h, z8.h, z3.h[2]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "fmla z21.h, z9.h, z1.h[2]\n"
-                        "fmla z25.h, z9.h, z2.h[2]\n"
-                        "fmla z29.h, z9.h, z3.h[2]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "fmla z22.h, z10.h, z1.h[2]\n"
-                        "fmla z26.h, z10.h, z2.h[2]\n"
-                        "fmla z30.h, z10.h, z3.h[2]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "fmla z23.h, z11.h, z1.h[2]\n"
-                        "fmla z27.h, z11.h, z2.h[2]\n"
-                        "fmla z31.h, z11.h, z3.h[2]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "fmla z20.h, z12.h, z1.h[3]\n"
-                        "fmla z24.h, z12.h, z2.h[3]\n"
-                        "fmla z28.h, z12.h, z3.h[3]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "fmla z21.h, z13.h, z1.h[3]\n"
-                        "fmla z25.h, z13.h, z2.h[3]\n"
-                        "fmla z29.h, z13.h, z3.h[3]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "fmla z22.h, z14.h, z1.h[3]\n"
-                        "fmla z26.h, z14.h, z2.h[3]\n"
-                        "fmla z30.h, z14.h, z3.h[3]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "fmla z23.h, z15.h, z1.h[3]\n"
-                        "fmla z27.h, z15.h, z2.h[3]\n"
-                        "fmla z31.h, z15.h, z3.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "fmla z20.h, z8.h, z1.h[4]\n"
-                        "fmla z24.h, z8.h, z2.h[4]\n"
-                        "fmla z28.h, z8.h, z3.h[4]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "fmla z21.h, z9.h, z1.h[4]\n"
-                        "fmla z25.h, z9.h, z2.h[4]\n"
-                        "fmla z29.h, z9.h, z3.h[4]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "fmla z22.h, z10.h, z1.h[4]\n"
-                        "fmla z26.h, z10.h, z2.h[4]\n"
-                        "fmla z30.h, z10.h, z3.h[4]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "fmla z23.h, z11.h, z1.h[4]\n"
-                        "fmla z27.h, z11.h, z2.h[4]\n"
-                        "fmla z31.h, z11.h, z3.h[4]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "fmla z20.h, z12.h, z1.h[5]\n"
-                        "fmla z24.h, z12.h, z2.h[5]\n"
-                        "fmla z28.h, z12.h, z3.h[5]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "fmla z21.h, z13.h, z1.h[5]\n"
-                        "fmla z25.h, z13.h, z2.h[5]\n"
-                        "fmla z29.h, z13.h, z3.h[5]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "fmla z22.h, z14.h, z1.h[5]\n"
-                        "fmla z26.h, z14.h, z2.h[5]\n"
-                        "fmla z30.h, z14.h, z3.h[5]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "fmla z23.h, z15.h, z1.h[5]\n"
-                        "fmla z27.h, z15.h, z2.h[5]\n"
-                        "fmla z31.h, z15.h, z3.h[5]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "fmla z20.h, z8.h, z1.h[6]\n"
-                        "fmla z24.h, z8.h, z2.h[6]\n"
-                        "fmla z28.h, z8.h, z3.h[6]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "fmla z21.h, z9.h, z1.h[6]\n"
-                        "fmla z25.h, z9.h, z2.h[6]\n"
-                        "fmla z29.h, z9.h, z3.h[6]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "fmla z22.h, z10.h, z1.h[6]\n"
-                        "fmla z26.h, z10.h, z2.h[6]\n"
-                        "fmla z30.h, z10.h, z3.h[6]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "fmla z23.h, z11.h, z1.h[6]\n"
-                        "fmla z27.h, z11.h, z2.h[6]\n"
-                        "fmla z31.h, z11.h, z3.h[6]\n"
-                        "fmla z16.h, z12.h, z0.h[7]\n"
-                        "fmla z20.h, z12.h, z1.h[7]\n"
-                        "fmla z24.h, z12.h, z2.h[7]\n"
-                        "fmla z28.h, z12.h, z3.h[7]\n"
-                        "fmla z17.h, z13.h, z0.h[7]\n"
-                        "fmla z21.h, z13.h, z1.h[7]\n"
-                        "fmla z25.h, z13.h, z2.h[7]\n"
-                        "fmla z29.h, z13.h, z3.h[7]\n"
-                        "fmla z18.h, z14.h, z0.h[7]\n"
-                        "fmla z22.h, z14.h, z1.h[7]\n"
-                        "fmla z26.h, z14.h, z2.h[7]\n"
-                        "fmla z30.h, z14.h, z3.h[7]\n"
-                        "fmla z19.h, z15.h, z0.h[7]\n"
-                        "fmla z23.h, z15.h, z1.h[7]\n"
-                        "fmla z27.h, z15.h, z2.h[7]\n"
-                        "fmla z31.h, z15.h, z3.h[7]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[0]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z20.h, z8.h, z5.h[0]\n"
-                        "fmla z24.h, z8.h, z6.h[0]\n"
-                        "fmla z28.h, z8.h, z7.h[0]\n"
-                        "fmla z17.h, z9.h, z4.h[0]\n"
-                        "fmla z21.h, z9.h, z5.h[0]\n"
-                        "fmla z25.h, z9.h, z6.h[0]\n"
-                        "fmla z29.h, z9.h, z7.h[0]\n"
-                        "fmla z18.h, z10.h, z4.h[0]\n"
-                        "fmla z22.h, z10.h, z5.h[0]\n"
-                        "fmla z26.h, z10.h, z6.h[0]\n"
-                        "fmla z30.h, z10.h, z7.h[0]\n"
-                        "fmla z19.h, z11.h, z4.h[0]\n"
-                        "fmla z23.h, z11.h, z5.h[0]\n"
-                        "fmla z27.h, z11.h, z6.h[0]\n"
-                        "fmla z31.h, z11.h, z7.h[0]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[1]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z20.h, z12.h, z5.h[1]\n"
-                        "fmla z24.h, z12.h, z6.h[1]\n"
-                        "fmla z28.h, z12.h, z7.h[1]\n"
-                        "fmla z17.h, z13.h, z4.h[1]\n"
-                        "fmla z21.h, z13.h, z5.h[1]\n"
-                        "fmla z25.h, z13.h, z6.h[1]\n"
-                        "fmla z29.h, z13.h, z7.h[1]\n"
-                        "fmla z18.h, z14.h, z4.h[1]\n"
-                        "fmla z22.h, z14.h, z5.h[1]\n"
-                        "fmla z26.h, z14.h, z6.h[1]\n"
-                        "fmla z30.h, z14.h, z7.h[1]\n"
-                        "fmla z19.h, z15.h, z4.h[1]\n"
-                        "fmla z23.h, z15.h, z5.h[1]\n"
-                        "fmla z27.h, z15.h, z6.h[1]\n"
-                        "fmla z31.h, z15.h, z7.h[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[2]\n"
-                        "fmla z20.h, z8.h, z5.h[2]\n"
-                        "fmla z24.h, z8.h, z6.h[2]\n"
-                        "fmla z28.h, z8.h, z7.h[2]\n"
-                        "fmla z17.h, z9.h, z4.h[2]\n"
-                        "fmla z21.h, z9.h, z5.h[2]\n"
-                        "fmla z25.h, z9.h, z6.h[2]\n"
-                        "fmla z29.h, z9.h, z7.h[2]\n"
-                        "fmla z18.h, z10.h, z4.h[2]\n"
-                        "fmla z22.h, z10.h, z5.h[2]\n"
-                        "fmla z26.h, z10.h, z6.h[2]\n"
-                        "fmla z30.h, z10.h, z7.h[2]\n"
-                        "fmla z19.h, z11.h, z4.h[2]\n"
-                        "fmla z23.h, z11.h, z5.h[2]\n"
-                        "fmla z27.h, z11.h, z6.h[2]\n"
-                        "fmla z31.h, z11.h, z7.h[2]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[3]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z20.h, z12.h, z5.h[3]\n"
-                        "fmla z24.h, z12.h, z6.h[3]\n"
-                        "fmla z28.h, z12.h, z7.h[3]\n"
-                        "fmla z17.h, z13.h, z4.h[3]\n"
-                        "fmla z21.h, z13.h, z5.h[3]\n"
-                        "fmla z25.h, z13.h, z6.h[3]\n"
-                        "fmla z29.h, z13.h, z7.h[3]\n"
-                        "fmla z18.h, z14.h, z4.h[3]\n"
-                        "fmla z22.h, z14.h, z5.h[3]\n"
-                        "fmla z26.h, z14.h, z6.h[3]\n"
-                        "fmla z30.h, z14.h, z7.h[3]\n"
-                        "fmla z19.h, z15.h, z4.h[3]\n"
-                        "fmla z23.h, z15.h, z5.h[3]\n"
-                        "fmla z27.h, z15.h, z6.h[3]\n"
-                        "fmla z31.h, z15.h, z7.h[3]\n"
-                        "b.eq 5f\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[4]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z20.h, z8.h, z5.h[4]\n"
-                        "fmla z24.h, z8.h, z6.h[4]\n"
-                        "fmla z28.h, z8.h, z7.h[4]\n"
-                        "fmla z17.h, z9.h, z4.h[4]\n"
-                        "fmla z21.h, z9.h, z5.h[4]\n"
-                        "fmla z25.h, z9.h, z6.h[4]\n"
-                        "fmla z29.h, z9.h, z7.h[4]\n"
-                        "fmla z18.h, z10.h, z4.h[4]\n"
-                        "fmla z22.h, z10.h, z5.h[4]\n"
-                        "fmla z26.h, z10.h, z6.h[4]\n"
-                        "fmla z30.h, z10.h, z7.h[4]\n"
-                        "fmla z19.h, z11.h, z4.h[4]\n"
-                        "fmla z23.h, z11.h, z5.h[4]\n"
-                        "fmla z27.h, z11.h, z6.h[4]\n"
-                        "fmla z31.h, z11.h, z7.h[4]\n"
-                        "b.eq 5f\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[5]\n"
-                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z20.h, z12.h, z5.h[5]\n"
-                        "fmla z24.h, z12.h, z6.h[5]\n"
-                        "fmla z28.h, z12.h, z7.h[5]\n"
-                        "fmla z17.h, z13.h, z4.h[5]\n"
-                        "fmla z21.h, z13.h, z5.h[5]\n"
-                        "fmla z25.h, z13.h, z6.h[5]\n"
-                        "fmla z29.h, z13.h, z7.h[5]\n"
-                        "fmla z18.h, z14.h, z4.h[5]\n"
-                        "fmla z22.h, z14.h, z5.h[5]\n"
-                        "fmla z26.h, z14.h, z6.h[5]\n"
-                        "fmla z30.h, z14.h, z7.h[5]\n"
-                        "fmla z19.h, z15.h, z4.h[5]\n"
-                        "fmla z23.h, z15.h, z5.h[5]\n"
-                        "fmla z27.h, z15.h, z6.h[5]\n"
-                        "fmla z31.h, z15.h, z7.h[5]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[6]\n"
-                        "fmla z20.h, z8.h, z5.h[6]\n"
-                        "fmla z24.h, z8.h, z6.h[6]\n"
-                        "fmla z28.h, z8.h, z7.h[6]\n"
-                        "fmla z17.h, z9.h, z4.h[6]\n"
-                        "fmla z21.h, z9.h, z5.h[6]\n"
-                        "fmla z25.h, z9.h, z6.h[6]\n"
-                        "fmla z29.h, z9.h, z7.h[6]\n"
-                        "fmla z18.h, z10.h, z4.h[6]\n"
-                        "fmla z22.h, z10.h, z5.h[6]\n"
-                        "fmla z26.h, z10.h, z6.h[6]\n"
-                        "fmla z30.h, z10.h, z7.h[6]\n"
-                        "fmla z19.h, z11.h, z4.h[6]\n"
-                        "fmla z23.h, z11.h, z5.h[6]\n"
-                        "fmla z27.h, z11.h, z6.h[6]\n"
-                        "fmla z31.h, z11.h, z7.h[6]\n"
-                        "5:\n"
-                        "ld1rh z14.h, p7/z, [%[minptr]]\n"
-                        "ld1rh z15.h, p7/z, [%[maxptr]]\n"
-                        "fmax z16.h, p7/m, z16.h, z14.h\n"
-                        "fmax z17.h, p7/m, z17.h, z14.h\n"
-                        "fmax z18.h, p7/m, z18.h, z14.h\n"
-                        "fmax z19.h, p7/m, z19.h, z14.h\n"
-                        "fmin z16.h, p7/m, z16.h, z15.h\n"
-                        "fmin z17.h, p7/m, z17.h, z15.h\n"
-                        "fmin z18.h, p7/m, z18.h, z15.h\n"
-                        "fmin z19.h, p7/m, z19.h, z15.h\n"
-                        "st1h z16.h, p0, [%[c_ptr0]]\n"
-                        "fmax z20.h, p7/m, z20.h, z14.h\n"
-                        "fmax z21.h, p7/m, z21.h, z14.h\n"
-                        "fmax z22.h, p7/m, z22.h, z14.h\n"
-                        "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmax z23.h, p7/m, z23.h, z14.h\n"
-                        "fmin z20.h, p7/m, z20.h, z15.h\n"
-                        "fmin z21.h, p7/m, z21.h, z15.h\n"
-                        "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "fmin z22.h, p7/m, z22.h, z15.h\n"
-                        "fmin z23.h, p7/m, z23.h, z15.h\n"
-                        "fmax z24.h, p7/m, z24.h, z14.h\n"
-                        "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "fmax z25.h, p7/m, z25.h, z14.h\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "fmax z26.h, p7/m, z26.h, z14.h\n"
-                        "st1h z20.h, p0, [c_ptr1]\n"
-                        "fmin z24.h, p7/m, z24.h, z15.h\n"
-                        "fmin z25.h, p7/m, z25.h, z15.h\n"
-                        "fmax z27.h, p7/m, z27.h, z14.h\n"
-                        "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
-                        "fmin z26.h, p7/m, z26.h, z15.h\n"
-                        "fmax z28.h, p7/m, z28.h, z14.h\n"
-                        "fmax z29.h, p7/m, z29.h, z14.h\n"
-                        "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
-                        "fmin z27.h, p7/m, z27.h, z15.h\n"
-                        "fmax z30.h, p7/m, z30.h, z14.h\n"
-                        "fmin z28.h, p7/m, z28.h, z15.h\n"
-                        "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
-                        "fmin z29.h, p7/m, z29.h, z15.h\n"
-                        "fmax z31.h, p7/m, z31.h, z14.h\n"
-                        "fmin z30.h, p7/m, z30.h, z15.h\n"
-                        "st1h z24.h, p0, [c_ptr2]\n"
-                        "fmin z31.h, p7/m, z31.h, z15.h\n"
-                        "st1h z25.h, p1, [c_ptr2, #1, MUL VL]\n"
-                        "st1h z26.h, p2, [c_ptr2, #2, MUL VL]\n"
-                        "st1h z27.h, p3, [c_ptr2, #3, MUL VL]\n"
-                        "st1h z28.h, p0, [c_ptr3]\n"
-                        "st1h z29.h, p1, [c_ptr3, #1, MUL VL]\n"
-                        "st1h z30.h, p2, [c_ptr3, #2, MUL VL]\n"
-                        "st1h z31.h, p3, [c_ptr3, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
-                    );
-                    break;
-            }
-
-        }
-    }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp
index ebef413848..0260050f29 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,42 +10,48 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
  */
 #pragma once
-
 #ifdef __ARM_FEATURE_SVE
 
-
 #include "../std_transforms_sve.hpp"
 
+#define ARGLIST  /* parameter list shared by the kernel prototype and kern_type; #undef'd at end of header */ \
+   unsigned int, const unsigned int *, /* num_strings, string_lengths */ \
+   IndirectInputArg<__fp16>, /* A_arg: direct or indirect input descriptor */ \
+   size_t, size_t, /* M, N */ \
+   const __fp16 *, /* B_ptr */ \
+   IndirectOutputArg<__fp16>, /* output_arg: direct or indirect output descriptor */ \
+   const __fp16 *, Activation, bool /* bias, act, accumulate */
+
 namespace arm_gemm
 {
 
 // Actual kernel implementations
-void sve_hybrid_fp16_mla_4VLx4(const __fp16 *, int, const __fp16 *, __fp16 *, int, int, int, int, const __fp16 *, Activation, bool);
+void sve_hybrid_fp16_mla_6x4VL( ARGLIST );
 
-class hybrid_fp16_mla_4VLx4
+class cls_sve_hybrid_fp16_mla_6x4VL
 {
 public:
     typedef __fp16 operand_type;
     typedef __fp16 result_type;
 
-    typedef void (*kern_type)(const __fp16 *, int, const __fp16 *, __fp16 *, int, int, int, int, const __fp16 *, Activation, bool);
+    typedef void (*kern_type)( ARGLIST );
 
     /* Kernel blocking parameters */
     static constexpr unsigned int out_height()
     {
-        return 4;
+        return 6;
     }
 
     static unsigned int out_width()
@@ -63,27 +69,17 @@ public:
         return true;
     }
 
-    static constexpr bool supports_bias()
-    {
-        return true;
-    }
-
-    static constexpr bool supports_activation()
-    {
-        return true;
-    }
-
-    StdTransformsSVE<operand_type, result_type, 4, 4, 1> transforms = {};
+    StdTransformsSVE<operand_type, result_type, 6, 4, 1> transforms = {};
 
     // Default to the generic kernel
-    kern_type kernel=sve_hybrid_fp16_mla_4VLx4;
+    kern_type kernel=sve_hybrid_fp16_mla_6x4VL;
 
-    hybrid_fp16_mla_4VLx4(const CPUInfo *)
+    cls_sve_hybrid_fp16_mla_6x4VL(const CPUInfo *)
     {
-
     }
 };
 
 } // namespace arm_gemm
 
+#undef ARGLIST
 #endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp
new file mode 100644
index 0000000000..b19842b122
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp
@@ -0,0 +1,3178 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_hybrid_fp16_mla_6x4VL (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<__fp16> A_arg,
+    size_t M, size_t N, const __fp16 *B_ptr, IndirectOutputArg<__fp16> output_arg,
+    const __fp16 *bias, Activation act, bool accumulate
+)
+{
+    struct KernelArgs {
+        __fp16 maxval = static_cast<__fp16>(std::numeric_limits<float>::infinity());
+        __fp16 minval = - static_cast<__fp16>(std::numeric_limits<float>::infinity());
+        unsigned int num_strings = {};
+        const unsigned int *string_lengths = {};
+        size_t N = {};
+        const __fp16 *B_ptr = {};
+        size_t output_offset = {};
+        size_t input_initial_col = {};
+        size_t input_offset = {};
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    if (accumulate) {
+        flags |= 0x1;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    switch(act.type) {
+        default:
+        case Activation::Type::None:
+            break;
+        case Activation::Type::BoundedReLU:
+            ka.maxval = static_cast<__fp16>(act.param1);
+            /* fall through */
+        case Activation::Type::ReLU:
+            ka.minval = 0;
+            flags |= 0x2;
+            break;
+    }
+    __asm__ __volatile__(
+      "ptrue p5.b\n"
+      "1:"  // Row loop
+      "cmp %x[M], #0x6\n"
+      "bge 71f\n"
+      "cmp %x[M], #0x4\n"
+      "bgt 57f\n"
+      "beq 43f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 29f\n"
+      "beq 15f\n"
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 2f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #1\n"
+      "b 3f\n"
+      "2:"  // Height 1: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "3:"  // Height 1: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.h, x19, x16\n"
+      "inch x19\n"
+      "whilelt p3.h, x19, x16\n"
+      "inch x19\n"
+      "whilelt p2.h, x19, x16\n"
+      "inch x19\n"
+      "whilelt p1.h, x19, x16\n"
+      "cbz x14, 4f\n"
+      "ld1h { z8.h }, p5/Z, [x14]\n"
+      "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n"
+      "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n"
+      "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "b 6f\n"
+      "4:"  // Height 1: no bias
+      "tbz %x[flags], #0, 5f\n"
+      "ld1h { z8.h }, p4/Z, [x13]\n"
+      "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
+      "b 6f\n"
+      "5:"  // Height 1: no accumulate
+      "mov z8.b, #0x0\n"
+      "mov z9.b, #0x0\n"
+      "mov z10.b, #0x0\n"
+      "mov z11.b, #0x0\n"
+      "6:"  // Height 1: setup done
+      "mov x12, #0x0\n"
+      "7:"  // Height 1: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 8f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "cbnz x12, 9f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #1\n"
+      "b 9f\n"
+      "8:"  // Height 1: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "9:"  // Height 1: input setup done
+      "cmp x11, #0x8\n"
+      "ble 11f\n"
+      "10:"  // Height 1: Multiply loop: Main loop head
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "whilelt p0.h, XZR, x11\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "sub x11, x11, #0x8\n"
+      "ld1rqh { z0.h }, p0/Z, [x10]\n"
+      "fmla z8.h, z6.h, z0.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z9.h, z7.h, z0.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "cmp x11, #0x8\n"
+      "fmla z10.h, z6.h, z0.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla z11.h, z7.h, z0.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+      "addvl x15, x15, #16\n"
+      "fmla z10.h, z6.h, z0.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z11.h, z7.h, z0.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[4]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[4]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[4]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[4]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[5]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[5]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+      "addvl x15, x15, #16\n"
+      "fmla z10.h, z6.h, z0.h[5]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[5]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[6]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[6]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[6]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[6]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[7]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[7]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[7]\n"
+      "fmla z11.h, z7.h, z0.h[7]\n"
+      "bgt 10b\n"
+      "11:"  // Height 1: Multiply loop: Single iteration only
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "whilelt p0.h, XZR, x11\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "ld1rqh { z0.h }, p0/Z, [x10]\n"
+      "fmla z8.h, z6.h, z0.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z9.h, z7.h, z0.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[0]\n"
+      "fmla z11.h, z7.h, z0.h[0]\n"
+      "ble 12f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z9.h, z7.h, z0.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[1]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z11.h, z7.h, z0.h[1]\n"
+      "ble 12f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z9.h, z7.h, z0.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[2]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z11.h, z7.h, z0.h[2]\n"
+      "ble 12f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z9.h, z7.h, z0.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[3]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z11.h, z7.h, z0.h[3]\n"
+      "ble 12f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[4]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z9.h, z7.h, z0.h[4]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[4]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z11.h, z7.h, z0.h[4]\n"
+      "ble 12f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[5]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z9.h, z7.h, z0.h[5]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[5]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z11.h, z7.h, z0.h[5]\n"
+      "ble 12f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[6]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z9.h, z7.h, z0.h[6]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[6]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z11.h, z7.h, z0.h[6]\n"
+      "ble 12f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[7]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[7]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[7]\n"
+      "fmla z11.h, z7.h, z0.h[7]\n"
+      "12:"  // Height 1: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 7b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "tbz %x[flags], #1, 13f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rh { z1.h }, p5/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rh { z0.h }, p5/Z, [x19]\n"
+      "fmin z8.h, p5/M, z8.h, z0.h\n"
+      "fmin z9.h, p5/M, z9.h, z0.h\n"
+      "fmin z10.h, p5/M, z10.h, z0.h\n"
+      "fmin z11.h, p5/M, z11.h, z0.h\n"
+      "fmax z8.h, p5/M, z8.h, z1.h\n"
+      "fmax z9.h, p5/M, z9.h, z1.h\n"
+      "fmax z10.h, p5/M, z10.h, z1.h\n"
+      "fmax z11.h, p5/M, z11.h, z1.h\n"
+      "13:"  // Height 1: No activation
+      "st1h { z8.h }, p4, [x13]\n"
+      "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
+      "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
+      "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "14:"  // Height 1: Writeback done
+      "mov x19, #0x0\n"
+      "inch x19, ALL, MUL #4\n"
+      "subs x16, x16, x19\n"
+      "bgt 3b\n"
+      "b 86f\n"
+      "15:"  // Height 2
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 16f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #1\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "add x9, x9, x19, LSL #1\n"
+      "b 17f\n"
+      "16:"  // Height 2: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #1\n"
+      "17:"  // Height 2: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.h, x19, x16\n"
+      "inch x19\n"
+      "whilelt p3.h, x19, x16\n"
+      "inch x19\n"
+      "whilelt p2.h, x19, x16\n"
+      "inch x19\n"
+      "whilelt p1.h, x19, x16\n"
+      "cbz x14, 18f\n"
+      "ld1h { z8.h }, p5/Z, [x14]\n"
+      "mov z12.d, z8.d\n"
+      "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n"
+      "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n"
+      "mov z13.d, z9.d\n"
+      "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "mov z14.d, z10.d\n"
+      "mov z15.d, z11.d\n"
+      "b 20f\n"
+      "18:"  // Height 2: no bias
+      "tbz %x[flags], #0, 19f\n"
+      "ld1h { z8.h }, p4/Z, [x13]\n"
+      "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
+      "ld1h { z12.h }, p4/Z, [x9]\n"
+      "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n"
+      "b 20f\n"
+      "19:"  // Height 2: no accumulate
+      "mov z8.b, #0x0\n"
+      "mov z9.b, #0x0\n"
+      "mov z10.b, #0x0\n"
+      "mov z11.b, #0x0\n"
+      "mov z12.b, #0x0\n"
+      "mov z13.b, #0x0\n"
+      "mov z14.b, #0x0\n"
+      "mov z15.b, #0x0\n"
+      "20:"  // Height 2: setup done
+      "mov x12, #0x0\n"
+      "21:"  // Height 2: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 22f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "cbnz x12, 23f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #1\n"
+      "add x28, x28, x19, LSL #1\n"
+      "b 23f\n"
+      "22:"  // Height 2: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #1\n"
+      "23:"  // Height 2: input setup done
+      "cmp x11, #0x8\n"
+      "ble 25f\n"
+      "24:"  // Height 2: Multiply loop: Main loop head
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "whilelt p0.h, XZR, x11\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "sub x11, x11, #0x8\n"
+      "ld1rqh { z0.h }, p0/Z, [x10]\n"
+      "fmla z8.h, z6.h, z0.h[0]\n"
+      "ld1rqh { z1.h }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z9.h, z7.h, z0.h[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z12.h, z6.h, z1.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "cmp x11, #0x8\n"
+      "fmla z13.h, z7.h, z1.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "fmla z10.h, z6.h, z0.h[0]\n"
+      "fmla z14.h, z6.h, z1.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[0]\n"
+      "fmla z15.h, z7.h, z1.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[1]\n"
+      "fmla z12.h, z6.h, z1.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[1]\n"
+      "fmla z13.h, z7.h, z1.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+      "addvl x15, x15, #16\n"
+      "fmla z10.h, z6.h, z0.h[1]\n"
+      "fmla z14.h, z6.h, z1.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[1]\n"
+      "fmla z15.h, z7.h, z1.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[2]\n"
+      "fmla z12.h, z6.h, z1.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[2]\n"
+      "fmla z13.h, z7.h, z1.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[2]\n"
+      "fmla z14.h, z6.h, z1.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[2]\n"
+      "fmla z15.h, z7.h, z1.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[3]\n"
+      "fmla z12.h, z6.h, z1.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[3]\n"
+      "fmla z13.h, z7.h, z1.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[3]\n"
+      "fmla z14.h, z6.h, z1.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z11.h, z7.h, z0.h[3]\n"
+      "fmla z15.h, z7.h, z1.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[4]\n"
+      "fmla z12.h, z6.h, z1.h[4]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[4]\n"
+      "fmla z13.h, z7.h, z1.h[4]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[4]\n"
+      "fmla z14.h, z6.h, z1.h[4]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[4]\n"
+      "fmla z15.h, z7.h, z1.h[4]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[5]\n"
+      "fmla z12.h, z6.h, z1.h[5]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[5]\n"
+      "fmla z13.h, z7.h, z1.h[5]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+      "addvl x15, x15, #16\n"
+      "fmla z10.h, z6.h, z0.h[5]\n"
+      "fmla z14.h, z6.h, z1.h[5]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[5]\n"
+      "fmla z15.h, z7.h, z1.h[5]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[6]\n"
+      "fmla z12.h, z6.h, z1.h[6]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[6]\n"
+      "fmla z13.h, z7.h, z1.h[6]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[6]\n"
+      "fmla z14.h, z6.h, z1.h[6]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[6]\n"
+      "fmla z15.h, z7.h, z1.h[6]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[7]\n"
+      "fmla z12.h, z6.h, z1.h[7]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[7]\n"
+      "fmla z13.h, z7.h, z1.h[7]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[7]\n"
+      "fmla z14.h, z6.h, z1.h[7]\n"
+      "fmla z11.h, z7.h, z0.h[7]\n"
+      "fmla z15.h, z7.h, z1.h[7]\n"
+      "bgt 24b\n"
+      "25:"  // Height 2: Multiply loop: Single iteration only
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "whilelt p0.h, XZR, x11\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "ld1rqh { z0.h }, p0/Z, [x10]\n"
+      "fmla z8.h, z6.h, z0.h[0]\n"
+      "ld1rqh { z1.h }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z9.h, z7.h, z0.h[0]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z12.h, z6.h, z1.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z13.h, z7.h, z1.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[0]\n"
+      "fmla z14.h, z6.h, z1.h[0]\n"
+      "fmla z11.h, z7.h, z0.h[0]\n"
+      "fmla z15.h, z7.h, z1.h[0]\n"
+      "ble 26f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[1]\n"
+      "fmla z13.h, z7.h, z1.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[1]\n"
+      "fmla z14.h, z6.h, z1.h[1]\n"
+      "fmla z11.h, z7.h, z0.h[1]\n"
+      "fmla z15.h, z7.h, z1.h[1]\n"
+      "ble 26f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[2]\n"
+      "fmla z13.h, z7.h, z1.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[2]\n"
+      "fmla z14.h, z6.h, z1.h[2]\n"
+      "fmla z11.h, z7.h, z0.h[2]\n"
+      "fmla z15.h, z7.h, z1.h[2]\n"
+      "ble 26f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[3]\n"
+      "fmla z13.h, z7.h, z1.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[3]\n"
+      "fmla z14.h, z6.h, z1.h[3]\n"
+      "fmla z11.h, z7.h, z0.h[3]\n"
+      "fmla z15.h, z7.h, z1.h[3]\n"
+      "ble 26f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[4]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[4]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[4]\n"
+      "fmla z13.h, z7.h, z1.h[4]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[4]\n"
+      "fmla z14.h, z6.h, z1.h[4]\n"
+      "fmla z11.h, z7.h, z0.h[4]\n"
+      "fmla z15.h, z7.h, z1.h[4]\n"
+      "ble 26f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[5]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[5]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[5]\n"
+      "fmla z13.h, z7.h, z1.h[5]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[5]\n"
+      "fmla z14.h, z6.h, z1.h[5]\n"
+      "fmla z11.h, z7.h, z0.h[5]\n"
+      "fmla z15.h, z7.h, z1.h[5]\n"
+      "ble 26f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[6]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[6]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[6]\n"
+      "fmla z13.h, z7.h, z1.h[6]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[6]\n"
+      "fmla z14.h, z6.h, z1.h[6]\n"
+      "fmla z11.h, z7.h, z0.h[6]\n"
+      "fmla z15.h, z7.h, z1.h[6]\n"
+      "ble 26f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[7]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "fmla z12.h, z6.h, z1.h[7]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[7]\n"
+      "fmla z13.h, z7.h, z1.h[7]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[7]\n"
+      "fmla z14.h, z6.h, z1.h[7]\n"
+      "fmla z11.h, z7.h, z0.h[7]\n"
+      "fmla z15.h, z7.h, z1.h[7]\n"
+      "26:"  // Height 2: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 21b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "tbz %x[flags], #1, 27f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rh { z1.h }, p5/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rh { z0.h }, p5/Z, [x19]\n"
+      "fmin z8.h, p5/M, z8.h, z0.h\n"
+      "fmin z9.h, p5/M, z9.h, z0.h\n"
+      "fmin z10.h, p5/M, z10.h, z0.h\n"
+      "fmin z11.h, p5/M, z11.h, z0.h\n"
+      "fmin z12.h, p5/M, z12.h, z0.h\n"
+      "fmax z8.h, p5/M, z8.h, z1.h\n"
+      "fmax z9.h, p5/M, z9.h, z1.h\n"
+      "fmax z10.h, p5/M, z10.h, z1.h\n"
+      "fmax z11.h, p5/M, z11.h, z1.h\n"
+      "fmax z12.h, p5/M, z12.h, z1.h\n"
+      "fmin z13.h, p5/M, z13.h, z0.h\n"
+      "fmin z14.h, p5/M, z14.h, z0.h\n"
+      "fmin z15.h, p5/M, z15.h, z0.h\n"
+      "fmax z13.h, p5/M, z13.h, z1.h\n"
+      "fmax z14.h, p5/M, z14.h, z1.h\n"
+      "fmax z15.h, p5/M, z15.h, z1.h\n"
+      "27:"  // Height 2: No activation
+      "st1h { z8.h }, p4, [x13]\n"
+      "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
+      "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
+      "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "st1h { z12.h }, p4, [x9]\n"
+      "st1h { z13.h }, p3, [x9, #1, MUL VL]\n"
+      "st1h { z14.h }, p2, [x9, #2, MUL VL]\n"
+      "st1h { z15.h }, p1, [x9, #3, MUL VL]\n"
+      "addvl x9, x9, #4\n"
+      "28:"  // Height 2: Writeback done
+      "mov x19, #0x0\n"
+      "inch x19, ALL, MUL #4\n"
+      "subs x16, x16, x19\n"
+      "bgt 17b\n"
+      "b 86f\n"
+      "29:"  // Height 3
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 30f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #1\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #1\n"
+      "add x27, x27, x19, LSL #1\n"
+      "b 31f\n"
+      "30:"  // Height 3: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #1\n"
+      "add x27, x9, x19, LSL #1\n"
+      "31:"  // Height 3: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.h, x19, x16\n"
+      "inch x19\n"
+      "whilelt p3.h, x19, x16\n"
+      "inch x19\n"
+      "whilelt p2.h, x19, x16\n"
+      "inch x19\n"
+      "whilelt p1.h, x19, x16\n"
+      "cbz x14, 32f\n"
+      "ld1h { z8.h }, p5/Z, [x14]\n"
+      "mov z12.d, z8.d\n"
+      "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n"
+      "mov z16.d, z8.d\n"
+      "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n"
+      "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n"
+      "mov z13.d, z9.d\n"
+      "addvl x14, x14, #4\n"
+      "mov z17.d, z9.d\n"
+      "mov z14.d, z10.d\n"
+      "mov z15.d, z11.d\n"
+      "mov z18.d, z10.d\n"
+      "mov z19.d, z11.d\n"
+      "b 34f\n"
+      "32:"  // Height 3: no bias
+      "tbz %x[flags], #0, 33f\n"
+      "ld1h { z8.h }, p4/Z, [x13]\n"
+      "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
+      "ld1h { z12.h }, p4/Z, [x9]\n"
+      "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n"
+      "ld1h { z16.h }, p4/Z, [x27]\n"
+      "ld1h { z17.h }, p3/Z, [x27, #1, MUL VL]\n"
+      "ld1h { z18.h }, p2/Z, [x27, #2, MUL VL]\n"
+      "ld1h { z19.h }, p1/Z, [x27, #3, MUL VL]\n"
+      "b 34f\n"
+      "33:"  // Height 3: no accumulate
+      "mov z8.b, #0x0\n"
+      "mov z9.b, #0x0\n"
+      "mov z10.b, #0x0\n"
+      "mov z11.b, #0x0\n"
+      "mov z12.b, #0x0\n"
+      "mov z13.b, #0x0\n"
+      "mov z14.b, #0x0\n"
+      "mov z15.b, #0x0\n"
+      "mov z16.b, #0x0\n"
+      "mov z17.b, #0x0\n"
+      "mov z18.b, #0x0\n"
+      "mov z19.b, #0x0\n"
+      "34:"  // Height 3: setup done
+      "mov x12, #0x0\n"
+      "35:"  // Height 3: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 36f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "cbnz x12, 37f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #1\n"
+      "add x28, x28, x19, LSL #1\n"
+      "add x26, x26, x19, LSL #1\n"
+      "b 37f\n"
+      "36:"  // Height 3: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #1\n"
+      "add x26, x28, x19, LSL #1\n"
+      "37:"  // Height 3: input setup done
+      "cmp x11, #0x8\n"
+      "ble 39f\n"
+      "38:"  // Height 3: Multiply loop: Main loop head
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "whilelt p0.h, XZR, x11\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "sub x11, x11, #0x8\n"
+      "ld1rqh { z0.h }, p0/Z, [x10]\n"
+      "fmla z8.h, z6.h, z0.h[0]\n"
+      "ld1rqh { z1.h }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z9.h, z7.h, z0.h[0]\n"
+      "ld1rqh { z2.h }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z12.h, z6.h, z1.h[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      "fmla z16.h, z6.h, z2.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "cmp x11, #0x8\n"
+      "fmla z13.h, z7.h, z1.h[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "fmla z17.h, z7.h, z2.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "fmla z10.h, z6.h, z0.h[0]\n"
+      "fmla z14.h, z6.h, z1.h[0]\n"
+      "fmla z18.h, z6.h, z2.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[0]\n"
+      "fmla z15.h, z7.h, z1.h[0]\n"
+      "fmla z19.h, z7.h, z2.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[1]\n"
+      "fmla z12.h, z6.h, z1.h[1]\n"
+      "fmla z16.h, z6.h, z2.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[1]\n"
+      "fmla z13.h, z7.h, z1.h[1]\n"
+      "fmla z17.h, z7.h, z2.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+      "addvl x15, x15, #16\n"
+      "fmla z10.h, z6.h, z0.h[1]\n"
+      "fmla z14.h, z6.h, z1.h[1]\n"
+      "fmla z18.h, z6.h, z2.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[1]\n"
+      "fmla z15.h, z7.h, z1.h[1]\n"
+      "fmla z19.h, z7.h, z2.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[2]\n"
+      "fmla z12.h, z6.h, z1.h[2]\n"
+      "fmla z16.h, z6.h, z2.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[2]\n"
+      "fmla z13.h, z7.h, z1.h[2]\n"
+      "fmla z17.h, z7.h, z2.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[2]\n"
+      "fmla z14.h, z6.h, z1.h[2]\n"
+      "fmla z18.h, z6.h, z2.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[2]\n"
+      "fmla z15.h, z7.h, z1.h[2]\n"
+      "fmla z19.h, z7.h, z2.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[3]\n"
+      "fmla z12.h, z6.h, z1.h[3]\n"
+      "fmla z16.h, z6.h, z2.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[3]\n"
+      "fmla z13.h, z7.h, z1.h[3]\n"
+      "fmla z17.h, z7.h, z2.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[3]\n"
+      "fmla z14.h, z6.h, z1.h[3]\n"
+      "fmla z18.h, z6.h, z2.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z11.h, z7.h, z0.h[3]\n"
+      "fmla z15.h, z7.h, z1.h[3]\n"
+      "fmla z19.h, z7.h, z2.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[4]\n"
+      "fmla z12.h, z6.h, z1.h[4]\n"
+      "fmla z16.h, z6.h, z2.h[4]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[4]\n"
+      "fmla z13.h, z7.h, z1.h[4]\n"
+      "fmla z17.h, z7.h, z2.h[4]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[4]\n"
+      "fmla z14.h, z6.h, z1.h[4]\n"
+      "fmla z18.h, z6.h, z2.h[4]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[4]\n"
+      "fmla z15.h, z7.h, z1.h[4]\n"
+      "fmla z19.h, z7.h, z2.h[4]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[5]\n"
+      "fmla z12.h, z6.h, z1.h[5]\n"
+      "fmla z16.h, z6.h, z2.h[5]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[5]\n"
+      "fmla z13.h, z7.h, z1.h[5]\n"
+      "fmla z17.h, z7.h, z2.h[5]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+      "addvl x15, x15, #16\n"
+      "fmla z10.h, z6.h, z0.h[5]\n"
+      "fmla z14.h, z6.h, z1.h[5]\n"
+      "fmla z18.h, z6.h, z2.h[5]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[5]\n"
+      "fmla z15.h, z7.h, z1.h[5]\n"
+      "fmla z19.h, z7.h, z2.h[5]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[6]\n"
+      "fmla z12.h, z6.h, z1.h[6]\n"
+      "fmla z16.h, z6.h, z2.h[6]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[6]\n"
+      "fmla z13.h, z7.h, z1.h[6]\n"
+      "fmla z17.h, z7.h, z2.h[6]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[6]\n"
+      "fmla z14.h, z6.h, z1.h[6]\n"
+      "fmla z18.h, z6.h, z2.h[6]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[6]\n"
+      "fmla z15.h, z7.h, z1.h[6]\n"
+      "fmla z19.h, z7.h, z2.h[6]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[7]\n"
+      "fmla z12.h, z6.h, z1.h[7]\n"
+      "fmla z16.h, z6.h, z2.h[7]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[7]\n"
+      "fmla z13.h, z7.h, z1.h[7]\n"
+      "fmla z17.h, z7.h, z2.h[7]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[7]\n"
+      "fmla z14.h, z6.h, z1.h[7]\n"
+      "fmla z18.h, z6.h, z2.h[7]\n"
+      "fmla z11.h, z7.h, z0.h[7]\n"
+      "fmla z15.h, z7.h, z1.h[7]\n"
+      "fmla z19.h, z7.h, z2.h[7]\n"
+      "bgt 38b\n"
+      "39:"  // Height 3: Multiply loop: Single iteration only
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "whilelt p0.h, XZR, x11\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "ld1rqh { z0.h }, p0/Z, [x10]\n"
+      "fmla z8.h, z6.h, z0.h[0]\n"
+      "ld1rqh { z1.h }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z9.h, z7.h, z0.h[0]\n"
+      "ld1rqh { z2.h }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z12.h, z6.h, z1.h[0]\n"
+      "add x26, x26, #0x10\n"
+      "fmla z13.h, z7.h, z1.h[0]\n"
+      "fmla z16.h, z6.h, z2.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z17.h, z7.h, z2.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[0]\n"
+      "fmla z14.h, z6.h, z1.h[0]\n"
+      "fmla z18.h, z6.h, z2.h[0]\n"
+      "fmla z11.h, z7.h, z0.h[0]\n"
+      "fmla z15.h, z7.h, z1.h[0]\n"
+      "fmla z19.h, z7.h, z2.h[0]\n"
+      "ble 40f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[1]\n"
+      "fmla z16.h, z6.h, z2.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[1]\n"
+      "fmla z13.h, z7.h, z1.h[1]\n"
+      "fmla z17.h, z7.h, z2.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[1]\n"
+      "fmla z14.h, z6.h, z1.h[1]\n"
+      "fmla z18.h, z6.h, z2.h[1]\n"
+      "fmla z11.h, z7.h, z0.h[1]\n"
+      "fmla z15.h, z7.h, z1.h[1]\n"
+      "fmla z19.h, z7.h, z2.h[1]\n"
+      "ble 40f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[2]\n"
+      "fmla z16.h, z6.h, z2.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[2]\n"
+      "fmla z13.h, z7.h, z1.h[2]\n"
+      "fmla z17.h, z7.h, z2.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[2]\n"
+      "fmla z14.h, z6.h, z1.h[2]\n"
+      "fmla z18.h, z6.h, z2.h[2]\n"
+      "fmla z11.h, z7.h, z0.h[2]\n"
+      "fmla z15.h, z7.h, z1.h[2]\n"
+      "fmla z19.h, z7.h, z2.h[2]\n"
+      "ble 40f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[3]\n"
+      "fmla z16.h, z6.h, z2.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[3]\n"
+      "fmla z13.h, z7.h, z1.h[3]\n"
+      "fmla z17.h, z7.h, z2.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[3]\n"
+      "fmla z14.h, z6.h, z1.h[3]\n"
+      "fmla z18.h, z6.h, z2.h[3]\n"
+      "fmla z11.h, z7.h, z0.h[3]\n"
+      "fmla z15.h, z7.h, z1.h[3]\n"
+      "fmla z19.h, z7.h, z2.h[3]\n"
+      "ble 40f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[4]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[4]\n"
+      "fmla z16.h, z6.h, z2.h[4]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[4]\n"
+      "fmla z13.h, z7.h, z1.h[4]\n"
+      "fmla z17.h, z7.h, z2.h[4]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[4]\n"
+      "fmla z14.h, z6.h, z1.h[4]\n"
+      "fmla z18.h, z6.h, z2.h[4]\n"
+      "fmla z11.h, z7.h, z0.h[4]\n"
+      "fmla z15.h, z7.h, z1.h[4]\n"
+      "fmla z19.h, z7.h, z2.h[4]\n"
+      "ble 40f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[5]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[5]\n"
+      "fmla z16.h, z6.h, z2.h[5]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[5]\n"
+      "fmla z13.h, z7.h, z1.h[5]\n"
+      "fmla z17.h, z7.h, z2.h[5]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[5]\n"
+      "fmla z14.h, z6.h, z1.h[5]\n"
+      "fmla z18.h, z6.h, z2.h[5]\n"
+      "fmla z11.h, z7.h, z0.h[5]\n"
+      "fmla z15.h, z7.h, z1.h[5]\n"
+      "fmla z19.h, z7.h, z2.h[5]\n"
+      "ble 40f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[6]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[6]\n"
+      "fmla z16.h, z6.h, z2.h[6]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[6]\n"
+      "fmla z13.h, z7.h, z1.h[6]\n"
+      "fmla z17.h, z7.h, z2.h[6]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[6]\n"
+      "fmla z14.h, z6.h, z1.h[6]\n"
+      "fmla z18.h, z6.h, z2.h[6]\n"
+      "fmla z11.h, z7.h, z0.h[6]\n"
+      "fmla z15.h, z7.h, z1.h[6]\n"
+      "fmla z19.h, z7.h, z2.h[6]\n"
+      "ble 40f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[7]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "fmla z12.h, z6.h, z1.h[7]\n"
+      "fmla z16.h, z6.h, z2.h[7]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[7]\n"
+      "fmla z13.h, z7.h, z1.h[7]\n"
+      "fmla z17.h, z7.h, z2.h[7]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[7]\n"
+      "fmla z14.h, z6.h, z1.h[7]\n"
+      "fmla z18.h, z6.h, z2.h[7]\n"
+      "fmla z11.h, z7.h, z0.h[7]\n"
+      "fmla z15.h, z7.h, z1.h[7]\n"
+      "fmla z19.h, z7.h, z2.h[7]\n"
+      "40:"  // Height 3: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 35b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "tbz %x[flags], #1, 41f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rh { z1.h }, p5/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rh { z0.h }, p5/Z, [x19]\n"
+      "fmin z8.h, p5/M, z8.h, z0.h\n"
+      "fmin z9.h, p5/M, z9.h, z0.h\n"
+      "fmin z10.h, p5/M, z10.h, z0.h\n"
+      "fmin z11.h, p5/M, z11.h, z0.h\n"
+      "fmin z12.h, p5/M, z12.h, z0.h\n"
+      "fmax z8.h, p5/M, z8.h, z1.h\n"
+      "fmax z9.h, p5/M, z9.h, z1.h\n"
+      "fmax z10.h, p5/M, z10.h, z1.h\n"
+      "fmax z11.h, p5/M, z11.h, z1.h\n"
+      "fmax z12.h, p5/M, z12.h, z1.h\n"
+      "fmin z13.h, p5/M, z13.h, z0.h\n"
+      "fmin z14.h, p5/M, z14.h, z0.h\n"
+      "fmin z15.h, p5/M, z15.h, z0.h\n"
+      "fmin z16.h, p5/M, z16.h, z0.h\n"
+      "fmax z13.h, p5/M, z13.h, z1.h\n"
+      "fmax z14.h, p5/M, z14.h, z1.h\n"
+      "fmax z15.h, p5/M, z15.h, z1.h\n"
+      "fmax z16.h, p5/M, z16.h, z1.h\n"
+      "fmin z17.h, p5/M, z17.h, z0.h\n"
+      "fmin z18.h, p5/M, z18.h, z0.h\n"
+      "fmin z19.h, p5/M, z19.h, z0.h\n"
+      "fmax z17.h, p5/M, z17.h, z1.h\n"
+      "fmax z18.h, p5/M, z18.h, z1.h\n"
+      "fmax z19.h, p5/M, z19.h, z1.h\n"
+      "41:"  // Height 3: No activation
+      "st1h { z8.h }, p4, [x13]\n"
+      "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
+      "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
+      "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "st1h { z12.h }, p4, [x9]\n"
+      "st1h { z13.h }, p3, [x9, #1, MUL VL]\n"
+      "st1h { z14.h }, p2, [x9, #2, MUL VL]\n"
+      "st1h { z15.h }, p1, [x9, #3, MUL VL]\n"
+      "addvl x9, x9, #4\n"
+      "st1h { z16.h }, p4, [x27]\n"
+      "st1h { z17.h }, p3, [x27, #1, MUL VL]\n"
+      "st1h { z18.h }, p2, [x27, #2, MUL VL]\n"
+      "st1h { z19.h }, p1, [x27, #3, MUL VL]\n"
+      "addvl x27, x27, #4\n"
+      "42:"  // Height 3: Writeback done
+      "mov x19, #0x0\n"
+      "inch x19, ALL, MUL #4\n"
+      "subs x16, x16, x19\n"
+      "bgt 31b\n"
+      "b 86f\n"
+      "43:"  // Height 4
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 44f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #1\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #1\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "add x27, x27, x19, LSL #1\n"
+      "add x25, x25, x19, LSL #1\n"
+      "b 45f\n"
+      "44:"  // Height 4: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #1\n"
+      "add x27, x9, x19, LSL #1\n"
+      "add x25, x27, x19, LSL #1\n"
+      "45:"  // Height 4: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.h, x19, x16\n"
+      "inch x19\n"
+      "whilelt p3.h, x19, x16\n"
+      "inch x19\n"
+      "whilelt p2.h, x19, x16\n"
+      "inch x19\n"
+      "whilelt p1.h, x19, x16\n"
+      "cbz x14, 46f\n"
+      "ld1h { z8.h }, p5/Z, [x14]\n"
+      "mov z12.d, z8.d\n"
+      "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n"
+      "mov z16.d, z8.d\n"
+      "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n"
+      "mov z20.d, z8.d\n"
+      "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "mov z13.d, z9.d\n"
+      "mov z17.d, z9.d\n"
+      "mov z14.d, z10.d\n"
+      "mov z15.d, z11.d\n"
+      "mov z18.d, z10.d\n"
+      "mov z19.d, z11.d\n"
+      "mov z21.d, z9.d\n"
+      "mov z22.d, z10.d\n"
+      "mov z23.d, z11.d\n"
+      "b 48f\n"
+      "46:"  // Height 4: no bias
+      "tbz %x[flags], #0, 47f\n"
+      "ld1h { z8.h }, p4/Z, [x13]\n"
+      "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
+      "ld1h { z12.h }, p4/Z, [x9]\n"
+      "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n"
+      "ld1h { z16.h }, p4/Z, [x27]\n"
+      "ld1h { z17.h }, p3/Z, [x27, #1, MUL VL]\n"
+      "ld1h { z18.h }, p2/Z, [x27, #2, MUL VL]\n"
+      "ld1h { z19.h }, p1/Z, [x27, #3, MUL VL]\n"
+      "ld1h { z20.h }, p4/Z, [x25]\n"
+      "ld1h { z21.h }, p3/Z, [x25, #1, MUL VL]\n"
+      "ld1h { z22.h }, p2/Z, [x25, #2, MUL VL]\n"
+      "ld1h { z23.h }, p1/Z, [x25, #3, MUL VL]\n"
+      "b 48f\n"
+      "47:"  // Height 4: no accumulate
+      "mov z8.b, #0x0\n"
+      "mov z9.b, #0x0\n"
+      "mov z10.b, #0x0\n"
+      "mov z11.b, #0x0\n"
+      "mov z12.b, #0x0\n"
+      "mov z13.b, #0x0\n"
+      "mov z14.b, #0x0\n"
+      "mov z15.b, #0x0\n"
+      "mov z16.b, #0x0\n"
+      "mov z17.b, #0x0\n"
+      "mov z18.b, #0x0\n"
+      "mov z19.b, #0x0\n"
+      "mov z20.b, #0x0\n"
+      "mov z21.b, #0x0\n"
+      "mov z22.b, #0x0\n"
+      "mov z23.b, #0x0\n"
+      "48:"  // Height 4: setup done
+      "mov x12, #0x0\n"
+      "49:"  // Height 4: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 50f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "cbnz x12, 51f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #1\n"
+      "add x28, x28, x19, LSL #1\n"
+      "add x26, x26, x19, LSL #1\n"
+      "add x24, x24, x19, LSL #1\n"
+      "b 51f\n"
+      "50:"  // Height 4: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #1\n"
+      "add x26, x28, x19, LSL #1\n"
+      "add x24, x26, x19, LSL #1\n"
+      "51:"  // Height 4: input setup done
+      "cmp x11, #0x8\n"
+      "ble 53f\n"
+      "52:"  // Height 4: Multiply loop: Main loop head
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "whilelt p0.h, XZR, x11\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "sub x11, x11, #0x8\n"
+      "ld1rqh { z0.h }, p0/Z, [x10]\n"
+      "fmla z8.h, z6.h, z0.h[0]\n"
+      "ld1rqh { z1.h }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z9.h, z7.h, z0.h[0]\n"
+      "ld1rqh { z2.h }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z12.h, z6.h, z1.h[0]\n"
+      "ld1rqh { z3.h }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "fmla z16.h, z6.h, z2.h[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x24, x24, #0x10\n"
+      "fmla z13.h, z7.h, z1.h[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "cmp x11, #0x8\n"
+      "fmla z20.h, z6.h, z3.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z17.h, z7.h, z2.h[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "fmla z21.h, z7.h, z3.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "fmla z10.h, z6.h, z0.h[0]\n"
+      "fmla z14.h, z6.h, z1.h[0]\n"
+      "fmla z18.h, z6.h, z2.h[0]\n"
+      "fmla z22.h, z6.h, z3.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[0]\n"
+      "fmla z15.h, z7.h, z1.h[0]\n"
+      "fmla z19.h, z7.h, z2.h[0]\n"
+      "fmla z23.h, z7.h, z3.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[1]\n"
+      "fmla z12.h, z6.h, z1.h[1]\n"
+      "fmla z16.h, z6.h, z2.h[1]\n"
+      "fmla z20.h, z6.h, z3.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[1]\n"
+      "fmla z13.h, z7.h, z1.h[1]\n"
+      "fmla z17.h, z7.h, z2.h[1]\n"
+      "fmla z21.h, z7.h, z3.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+      "addvl x15, x15, #16\n"
+      "fmla z10.h, z6.h, z0.h[1]\n"
+      "fmla z14.h, z6.h, z1.h[1]\n"
+      "fmla z18.h, z6.h, z2.h[1]\n"
+      "fmla z22.h, z6.h, z3.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[1]\n"
+      "fmla z15.h, z7.h, z1.h[1]\n"
+      "fmla z19.h, z7.h, z2.h[1]\n"
+      "fmla z23.h, z7.h, z3.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[2]\n"
+      "fmla z12.h, z6.h, z1.h[2]\n"
+      "fmla z16.h, z6.h, z2.h[2]\n"
+      "fmla z20.h, z6.h, z3.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[2]\n"
+      "fmla z13.h, z7.h, z1.h[2]\n"
+      "fmla z17.h, z7.h, z2.h[2]\n"
+      "fmla z21.h, z7.h, z3.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[2]\n"
+      "fmla z14.h, z6.h, z1.h[2]\n"
+      "fmla z18.h, z6.h, z2.h[2]\n"
+      "fmla z22.h, z6.h, z3.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[2]\n"
+      "fmla z15.h, z7.h, z1.h[2]\n"
+      "fmla z19.h, z7.h, z2.h[2]\n"
+      "fmla z23.h, z7.h, z3.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[3]\n"
+      "fmla z12.h, z6.h, z1.h[3]\n"
+      "fmla z16.h, z6.h, z2.h[3]\n"
+      "fmla z20.h, z6.h, z3.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[3]\n"
+      "fmla z13.h, z7.h, z1.h[3]\n"
+      "fmla z17.h, z7.h, z2.h[3]\n"
+      "fmla z21.h, z7.h, z3.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[3]\n"
+      "fmla z14.h, z6.h, z1.h[3]\n"
+      "fmla z18.h, z6.h, z2.h[3]\n"
+      "fmla z22.h, z6.h, z3.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z11.h, z7.h, z0.h[3]\n"
+      "fmla z15.h, z7.h, z1.h[3]\n"
+      "fmla z19.h, z7.h, z2.h[3]\n"
+      "fmla z23.h, z7.h, z3.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[4]\n"
+      "fmla z12.h, z6.h, z1.h[4]\n"
+      "fmla z16.h, z6.h, z2.h[4]\n"
+      "fmla z20.h, z6.h, z3.h[4]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[4]\n"
+      "fmla z13.h, z7.h, z1.h[4]\n"
+      "fmla z17.h, z7.h, z2.h[4]\n"
+      "fmla z21.h, z7.h, z3.h[4]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[4]\n"
+      "fmla z14.h, z6.h, z1.h[4]\n"
+      "fmla z18.h, z6.h, z2.h[4]\n"
+      "fmla z22.h, z6.h, z3.h[4]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[4]\n"
+      "fmla z15.h, z7.h, z1.h[4]\n"
+      "fmla z19.h, z7.h, z2.h[4]\n"
+      "fmla z23.h, z7.h, z3.h[4]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[5]\n"
+      "fmla z12.h, z6.h, z1.h[5]\n"
+      "fmla z16.h, z6.h, z2.h[5]\n"
+      "fmla z20.h, z6.h, z3.h[5]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[5]\n"
+      "fmla z13.h, z7.h, z1.h[5]\n"
+      "fmla z17.h, z7.h, z2.h[5]\n"
+      "fmla z21.h, z7.h, z3.h[5]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+      "addvl x15, x15, #16\n"
+      "fmla z10.h, z6.h, z0.h[5]\n"
+      "fmla z14.h, z6.h, z1.h[5]\n"
+      "fmla z18.h, z6.h, z2.h[5]\n"
+      "fmla z22.h, z6.h, z3.h[5]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[5]\n"
+      "fmla z15.h, z7.h, z1.h[5]\n"
+      "fmla z19.h, z7.h, z2.h[5]\n"
+      "fmla z23.h, z7.h, z3.h[5]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[6]\n"
+      "fmla z12.h, z6.h, z1.h[6]\n"
+      "fmla z16.h, z6.h, z2.h[6]\n"
+      "fmla z20.h, z6.h, z3.h[6]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[6]\n"
+      "fmla z13.h, z7.h, z1.h[6]\n"
+      "fmla z17.h, z7.h, z2.h[6]\n"
+      "fmla z21.h, z7.h, z3.h[6]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[6]\n"
+      "fmla z14.h, z6.h, z1.h[6]\n"
+      "fmla z18.h, z6.h, z2.h[6]\n"
+      "fmla z22.h, z6.h, z3.h[6]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[6]\n"
+      "fmla z15.h, z7.h, z1.h[6]\n"
+      "fmla z19.h, z7.h, z2.h[6]\n"
+      "fmla z23.h, z7.h, z3.h[6]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[7]\n"
+      "fmla z12.h, z6.h, z1.h[7]\n"
+      "fmla z16.h, z6.h, z2.h[7]\n"
+      "fmla z20.h, z6.h, z3.h[7]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[7]\n"
+      "fmla z13.h, z7.h, z1.h[7]\n"
+      "fmla z17.h, z7.h, z2.h[7]\n"
+      "fmla z21.h, z7.h, z3.h[7]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[7]\n"
+      "fmla z14.h, z6.h, z1.h[7]\n"
+      "fmla z18.h, z6.h, z2.h[7]\n"
+      "fmla z22.h, z6.h, z3.h[7]\n"
+      "fmla z11.h, z7.h, z0.h[7]\n"
+      "fmla z15.h, z7.h, z1.h[7]\n"
+      "fmla z19.h, z7.h, z2.h[7]\n"
+      "fmla z23.h, z7.h, z3.h[7]\n"
+      "bgt 52b\n"
+      "53:"  // Height 4: Multiply loop: Single iteration only
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "whilelt p0.h, XZR, x11\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "ld1rqh { z0.h }, p0/Z, [x10]\n"
+      "fmla z8.h, z6.h, z0.h[0]\n"
+      "ld1rqh { z1.h }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z9.h, z7.h, z0.h[0]\n"
+      "ld1rqh { z2.h }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z12.h, z6.h, z1.h[0]\n"
+      "ld1rqh { z3.h }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "fmla z16.h, z6.h, z2.h[0]\n"
+      "add x24, x24, #0x10\n"
+      "fmla z13.h, z7.h, z1.h[0]\n"
+      "fmla z17.h, z7.h, z2.h[0]\n"
+      "fmla z20.h, z6.h, z3.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z21.h, z7.h, z3.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[0]\n"
+      "fmla z14.h, z6.h, z1.h[0]\n"
+      "fmla z18.h, z6.h, z2.h[0]\n"
+      "fmla z22.h, z6.h, z3.h[0]\n"
+      "fmla z11.h, z7.h, z0.h[0]\n"
+      "fmla z15.h, z7.h, z1.h[0]\n"
+      "fmla z19.h, z7.h, z2.h[0]\n"
+      "fmla z23.h, z7.h, z3.h[0]\n"
+      "ble 54f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[1]\n"
+      "fmla z16.h, z6.h, z2.h[1]\n"
+      "fmla z20.h, z6.h, z3.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[1]\n"
+      "fmla z13.h, z7.h, z1.h[1]\n"
+      "fmla z17.h, z7.h, z2.h[1]\n"
+      "fmla z21.h, z7.h, z3.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[1]\n"
+      "fmla z14.h, z6.h, z1.h[1]\n"
+      "fmla z18.h, z6.h, z2.h[1]\n"
+      "fmla z22.h, z6.h, z3.h[1]\n"
+      "fmla z11.h, z7.h, z0.h[1]\n"
+      "fmla z15.h, z7.h, z1.h[1]\n"
+      "fmla z19.h, z7.h, z2.h[1]\n"
+      "fmla z23.h, z7.h, z3.h[1]\n"
+      "ble 54f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[2]\n"
+      "fmla z16.h, z6.h, z2.h[2]\n"
+      "fmla z20.h, z6.h, z3.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[2]\n"
+      "fmla z13.h, z7.h, z1.h[2]\n"
+      "fmla z17.h, z7.h, z2.h[2]\n"
+      "fmla z21.h, z7.h, z3.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[2]\n"
+      "fmla z14.h, z6.h, z1.h[2]\n"
+      "fmla z18.h, z6.h, z2.h[2]\n"
+      "fmla z22.h, z6.h, z3.h[2]\n"
+      "fmla z11.h, z7.h, z0.h[2]\n"
+      "fmla z15.h, z7.h, z1.h[2]\n"
+      "fmla z19.h, z7.h, z2.h[2]\n"
+      "fmla z23.h, z7.h, z3.h[2]\n"
+      "ble 54f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[3]\n"
+      "fmla z16.h, z6.h, z2.h[3]\n"
+      "fmla z20.h, z6.h, z3.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[3]\n"
+      "fmla z13.h, z7.h, z1.h[3]\n"
+      "fmla z17.h, z7.h, z2.h[3]\n"
+      "fmla z21.h, z7.h, z3.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[3]\n"
+      "fmla z14.h, z6.h, z1.h[3]\n"
+      "fmla z18.h, z6.h, z2.h[3]\n"
+      "fmla z22.h, z6.h, z3.h[3]\n"
+      "fmla z11.h, z7.h, z0.h[3]\n"
+      "fmla z15.h, z7.h, z1.h[3]\n"
+      "fmla z19.h, z7.h, z2.h[3]\n"
+      "fmla z23.h, z7.h, z3.h[3]\n"
+      "ble 54f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[4]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[4]\n"
+      "fmla z16.h, z6.h, z2.h[4]\n"
+      "fmla z20.h, z6.h, z3.h[4]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[4]\n"
+      "fmla z13.h, z7.h, z1.h[4]\n"
+      "fmla z17.h, z7.h, z2.h[4]\n"
+      "fmla z21.h, z7.h, z3.h[4]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[4]\n"
+      "fmla z14.h, z6.h, z1.h[4]\n"
+      "fmla z18.h, z6.h, z2.h[4]\n"
+      "fmla z22.h, z6.h, z3.h[4]\n"
+      "fmla z11.h, z7.h, z0.h[4]\n"
+      "fmla z15.h, z7.h, z1.h[4]\n"
+      "fmla z19.h, z7.h, z2.h[4]\n"
+      "fmla z23.h, z7.h, z3.h[4]\n"
+      "ble 54f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[5]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[5]\n"
+      "fmla z16.h, z6.h, z2.h[5]\n"
+      "fmla z20.h, z6.h, z3.h[5]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[5]\n"
+      "fmla z13.h, z7.h, z1.h[5]\n"
+      "fmla z17.h, z7.h, z2.h[5]\n"
+      "fmla z21.h, z7.h, z3.h[5]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[5]\n"
+      "fmla z14.h, z6.h, z1.h[5]\n"
+      "fmla z18.h, z6.h, z2.h[5]\n"
+      "fmla z22.h, z6.h, z3.h[5]\n"
+      "fmla z11.h, z7.h, z0.h[5]\n"
+      "fmla z15.h, z7.h, z1.h[5]\n"
+      "fmla z19.h, z7.h, z2.h[5]\n"
+      "fmla z23.h, z7.h, z3.h[5]\n"
+      "ble 54f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[6]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[6]\n"
+      "fmla z16.h, z6.h, z2.h[6]\n"
+      "fmla z20.h, z6.h, z3.h[6]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[6]\n"
+      "fmla z13.h, z7.h, z1.h[6]\n"
+      "fmla z17.h, z7.h, z2.h[6]\n"
+      "fmla z21.h, z7.h, z3.h[6]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[6]\n"
+      "fmla z14.h, z6.h, z1.h[6]\n"
+      "fmla z18.h, z6.h, z2.h[6]\n"
+      "fmla z22.h, z6.h, z3.h[6]\n"
+      "fmla z11.h, z7.h, z0.h[6]\n"
+      "fmla z15.h, z7.h, z1.h[6]\n"
+      "fmla z19.h, z7.h, z2.h[6]\n"
+      "fmla z23.h, z7.h, z3.h[6]\n"
+      "ble 54f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[7]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "fmla z12.h, z6.h, z1.h[7]\n"
+      "fmla z16.h, z6.h, z2.h[7]\n"
+      "fmla z20.h, z6.h, z3.h[7]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[7]\n"
+      "fmla z13.h, z7.h, z1.h[7]\n"
+      "fmla z17.h, z7.h, z2.h[7]\n"
+      "fmla z21.h, z7.h, z3.h[7]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[7]\n"
+      "fmla z14.h, z6.h, z1.h[7]\n"
+      "fmla z18.h, z6.h, z2.h[7]\n"
+      "fmla z22.h, z6.h, z3.h[7]\n"
+      "fmla z11.h, z7.h, z0.h[7]\n"
+      "fmla z15.h, z7.h, z1.h[7]\n"
+      "fmla z19.h, z7.h, z2.h[7]\n"
+      "fmla z23.h, z7.h, z3.h[7]\n"
+      "54:"  // Height 4: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 49b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "tbz %x[flags], #1, 55f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rh { z1.h }, p5/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rh { z0.h }, p5/Z, [x19]\n"
+      "fmin z8.h, p5/M, z8.h, z0.h\n"
+      "fmin z9.h, p5/M, z9.h, z0.h\n"
+      "fmin z10.h, p5/M, z10.h, z0.h\n"
+      "fmin z11.h, p5/M, z11.h, z0.h\n"
+      "fmin z12.h, p5/M, z12.h, z0.h\n"
+      "fmax z8.h, p5/M, z8.h, z1.h\n"
+      "fmax z9.h, p5/M, z9.h, z1.h\n"
+      "fmax z10.h, p5/M, z10.h, z1.h\n"
+      "fmax z11.h, p5/M, z11.h, z1.h\n"
+      "fmax z12.h, p5/M, z12.h, z1.h\n"
+      "fmin z13.h, p5/M, z13.h, z0.h\n"
+      "fmin z14.h, p5/M, z14.h, z0.h\n"
+      "fmin z15.h, p5/M, z15.h, z0.h\n"
+      "fmin z16.h, p5/M, z16.h, z0.h\n"
+      "fmax z13.h, p5/M, z13.h, z1.h\n"
+      "fmax z14.h, p5/M, z14.h, z1.h\n"
+      "fmax z15.h, p5/M, z15.h, z1.h\n"
+      "fmax z16.h, p5/M, z16.h, z1.h\n"
+      "fmin z17.h, p5/M, z17.h, z0.h\n"
+      "fmin z18.h, p5/M, z18.h, z0.h\n"
+      "fmin z19.h, p5/M, z19.h, z0.h\n"
+      "fmin z20.h, p5/M, z20.h, z0.h\n"
+      "fmax z17.h, p5/M, z17.h, z1.h\n"
+      "fmax z18.h, p5/M, z18.h, z1.h\n"
+      "fmax z19.h, p5/M, z19.h, z1.h\n"
+      "fmax z20.h, p5/M, z20.h, z1.h\n"
+      "fmin z21.h, p5/M, z21.h, z0.h\n"
+      "fmin z22.h, p5/M, z22.h, z0.h\n"
+      "fmin z23.h, p5/M, z23.h, z0.h\n"
+      "fmax z21.h, p5/M, z21.h, z1.h\n"
+      "fmax z22.h, p5/M, z22.h, z1.h\n"
+      "fmax z23.h, p5/M, z23.h, z1.h\n"
+      "55:"  // Height 4: No activation
+      "st1h { z8.h }, p4, [x13]\n"
+      "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
+      "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
+      "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "st1h { z12.h }, p4, [x9]\n"
+      "st1h { z13.h }, p3, [x9, #1, MUL VL]\n"
+      "st1h { z14.h }, p2, [x9, #2, MUL VL]\n"
+      "st1h { z15.h }, p1, [x9, #3, MUL VL]\n"
+      "addvl x9, x9, #4\n"
+      "st1h { z16.h }, p4, [x27]\n"
+      "st1h { z17.h }, p3, [x27, #1, MUL VL]\n"
+      "st1h { z18.h }, p2, [x27, #2, MUL VL]\n"
+      "st1h { z19.h }, p1, [x27, #3, MUL VL]\n"
+      "addvl x27, x27, #4\n"
+      "st1h { z20.h }, p4, [x25]\n"
+      "st1h { z21.h }, p3, [x25, #1, MUL VL]\n"
+      "st1h { z22.h }, p2, [x25, #2, MUL VL]\n"
+      "st1h { z23.h }, p1, [x25, #3, MUL VL]\n"
+      "addvl x25, x25, #4\n"
+      "56:"  // Height 4: Writeback done
+      "mov x19, #0x0\n"
+      "inch x19, ALL, MUL #4\n"
+      "subs x16, x16, x19\n"
+      "bgt 45b\n"
+      "b 86f\n"
+      "57:"  // Height 5
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 58f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #1\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #1\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "ldr x23, [%x[output_ptr], #0x20]\n"
+      "add x27, x27, x19, LSL #1\n"
+      "add x25, x25, x19, LSL #1\n"
+      "add x23, x23, x19, LSL #1\n"
+      "b 59f\n"
+      "58:"  // Height 5: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #1\n"
+      "add x27, x9, x19, LSL #1\n"
+      "add x25, x27, x19, LSL #1\n"
+      "add x23, x25, x19, LSL #1\n"
+      "59:"  // Height 5: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.h, x19, x16\n"
+      "inch x19\n"
+      "whilelt p3.h, x19, x16\n"
+      "inch x19\n"
+      "whilelt p2.h, x19, x16\n"
+      "inch x19\n"
+      "whilelt p1.h, x19, x16\n"
+      "cbz x14, 60f\n"
+      "ld1h { z8.h }, p5/Z, [x14]\n"
+      "mov z12.d, z8.d\n"
+      "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n"
+      "mov z16.d, z8.d\n"
+      "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n"
+      "mov z20.d, z8.d\n"
+      "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "mov z13.d, z9.d\n"
+      "mov z17.d, z9.d\n"
+      "mov z14.d, z10.d\n"
+      "mov z15.d, z11.d\n"
+      "mov z18.d, z10.d\n"
+      "mov z19.d, z11.d\n"
+      "mov z21.d, z9.d\n"
+      "mov z22.d, z10.d\n"
+      "mov z23.d, z11.d\n"
+      "mov z24.d, z8.d\n"
+      "mov z25.d, z9.d\n"
+      "mov z26.d, z10.d\n"
+      "mov z27.d, z11.d\n"
+      "b 62f\n"
+      "60:"  // Height 5: no bias
+      "tbz %x[flags], #0, 61f\n"
+      "ld1h { z8.h }, p4/Z, [x13]\n"
+      "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
+      "ld1h { z12.h }, p4/Z, [x9]\n"
+      "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n"
+      "ld1h { z16.h }, p4/Z, [x27]\n"
+      "ld1h { z17.h }, p3/Z, [x27, #1, MUL VL]\n"
+      "ld1h { z18.h }, p2/Z, [x27, #2, MUL VL]\n"
+      "ld1h { z19.h }, p1/Z, [x27, #3, MUL VL]\n"
+      "ld1h { z20.h }, p4/Z, [x25]\n"
+      "ld1h { z21.h }, p3/Z, [x25, #1, MUL VL]\n"
+      "ld1h { z22.h }, p2/Z, [x25, #2, MUL VL]\n"
+      "ld1h { z23.h }, p1/Z, [x25, #3, MUL VL]\n"
+      "ld1h { z24.h }, p4/Z, [x23]\n"
+      "ld1h { z25.h }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1h { z26.h }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1h { z27.h }, p1/Z, [x23, #3, MUL VL]\n"
+      "b 62f\n"
+      "61:"  // Height 5: no accumulate
+      "mov z8.b, #0x0\n"
+      "mov z9.b, #0x0\n"
+      "mov z10.b, #0x0\n"
+      "mov z11.b, #0x0\n"
+      "mov z12.b, #0x0\n"
+      "mov z13.b, #0x0\n"
+      "mov z14.b, #0x0\n"
+      "mov z15.b, #0x0\n"
+      "mov z16.b, #0x0\n"
+      "mov z17.b, #0x0\n"
+      "mov z18.b, #0x0\n"
+      "mov z19.b, #0x0\n"
+      "mov z20.b, #0x0\n"
+      "mov z21.b, #0x0\n"
+      "mov z22.b, #0x0\n"
+      "mov z23.b, #0x0\n"
+      "mov z24.b, #0x0\n"
+      "mov z25.b, #0x0\n"
+      "mov z26.b, #0x0\n"
+      "mov z27.b, #0x0\n"
+      "62:"  // Height 5: setup done
+      "mov x12, #0x0\n"
+      "63:"  // Height 5: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 64f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "cbnz x12, 65f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #1\n"
+      "add x28, x28, x19, LSL #1\n"
+      "add x26, x26, x19, LSL #1\n"
+      "add x24, x24, x19, LSL #1\n"
+      "add x22, x22, x19, LSL #1\n"
+      "b 65f\n"
+      "64:"  // Height 5: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #1\n"
+      "add x26, x28, x19, LSL #1\n"
+      "add x24, x26, x19, LSL #1\n"
+      "add x22, x24, x19, LSL #1\n"
+      "65:"  // Height 5: input setup done
+      "cmp x11, #0x8\n"
+      "ble 67f\n"
+      "66:"  // Height 5: Multiply loop: Main loop head
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "whilelt p0.h, XZR, x11\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "sub x11, x11, #0x8\n"
+      "ld1rqh { z0.h }, p0/Z, [x10]\n"
+      "fmla z8.h, z6.h, z0.h[0]\n"
+      "ld1rqh { z1.h }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z9.h, z7.h, z0.h[0]\n"
+      "ld1rqh { z2.h }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z12.h, z6.h, z1.h[0]\n"
+      "ld1rqh { z3.h }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "fmla z16.h, z6.h, z2.h[0]\n"
+      "ld1rqh { z4.h }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "fmla z13.h, z7.h, z1.h[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x22, x22, #0x10\n"
+      "fmla z20.h, z6.h, z3.h[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "cmp x11, #0x8\n"
+      "fmla z24.h, z6.h, z4.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z17.h, z7.h, z2.h[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "fmla z21.h, z7.h, z3.h[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "fmla z25.h, z7.h, z4.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[0]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "fmla z14.h, z6.h, z1.h[0]\n"
+      "fmla z18.h, z6.h, z2.h[0]\n"
+      "fmla z22.h, z6.h, z3.h[0]\n"
+      "fmla z26.h, z6.h, z4.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[0]\n"
+      "fmla z15.h, z7.h, z1.h[0]\n"
+      "fmla z19.h, z7.h, z2.h[0]\n"
+      "fmla z23.h, z7.h, z3.h[0]\n"
+      "fmla z27.h, z7.h, z4.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[1]\n"
+      "fmla z12.h, z6.h, z1.h[1]\n"
+      "fmla z16.h, z6.h, z2.h[1]\n"
+      "fmla z20.h, z6.h, z3.h[1]\n"
+      "fmla z24.h, z6.h, z4.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[1]\n"
+      "fmla z13.h, z7.h, z1.h[1]\n"
+      "fmla z17.h, z7.h, z2.h[1]\n"
+      "fmla z21.h, z7.h, z3.h[1]\n"
+      "fmla z25.h, z7.h, z4.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+      "addvl x15, x15, #16\n"
+      "fmla z10.h, z6.h, z0.h[1]\n"
+      "fmla z14.h, z6.h, z1.h[1]\n"
+      "fmla z18.h, z6.h, z2.h[1]\n"
+      "fmla z22.h, z6.h, z3.h[1]\n"
+      "fmla z26.h, z6.h, z4.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[1]\n"
+      "fmla z15.h, z7.h, z1.h[1]\n"
+      "fmla z19.h, z7.h, z2.h[1]\n"
+      "fmla z23.h, z7.h, z3.h[1]\n"
+      "fmla z27.h, z7.h, z4.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[2]\n"
+      "fmla z12.h, z6.h, z1.h[2]\n"
+      "fmla z16.h, z6.h, z2.h[2]\n"
+      "fmla z20.h, z6.h, z3.h[2]\n"
+      "fmla z24.h, z6.h, z4.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[2]\n"
+      "fmla z13.h, z7.h, z1.h[2]\n"
+      "fmla z17.h, z7.h, z2.h[2]\n"
+      "fmla z21.h, z7.h, z3.h[2]\n"
+      "fmla z25.h, z7.h, z4.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[2]\n"
+      "fmla z14.h, z6.h, z1.h[2]\n"
+      "fmla z18.h, z6.h, z2.h[2]\n"
+      "fmla z22.h, z6.h, z3.h[2]\n"
+      "fmla z26.h, z6.h, z4.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[2]\n"
+      "fmla z15.h, z7.h, z1.h[2]\n"
+      "fmla z19.h, z7.h, z2.h[2]\n"
+      "fmla z23.h, z7.h, z3.h[2]\n"
+      "fmla z27.h, z7.h, z4.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[3]\n"
+      "fmla z12.h, z6.h, z1.h[3]\n"
+      "fmla z16.h, z6.h, z2.h[3]\n"
+      "fmla z20.h, z6.h, z3.h[3]\n"
+      "fmla z24.h, z6.h, z4.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[3]\n"
+      "fmla z13.h, z7.h, z1.h[3]\n"
+      "fmla z17.h, z7.h, z2.h[3]\n"
+      "fmla z21.h, z7.h, z3.h[3]\n"
+      "fmla z25.h, z7.h, z4.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[3]\n"
+      "fmla z14.h, z6.h, z1.h[3]\n"
+      "fmla z18.h, z6.h, z2.h[3]\n"
+      "fmla z22.h, z6.h, z3.h[3]\n"
+      "fmla z26.h, z6.h, z4.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z11.h, z7.h, z0.h[3]\n"
+      "fmla z15.h, z7.h, z1.h[3]\n"
+      "fmla z19.h, z7.h, z2.h[3]\n"
+      "fmla z23.h, z7.h, z3.h[3]\n"
+      "fmla z27.h, z7.h, z4.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[4]\n"
+      "fmla z12.h, z6.h, z1.h[4]\n"
+      "fmla z16.h, z6.h, z2.h[4]\n"
+      "fmla z20.h, z6.h, z3.h[4]\n"
+      "fmla z24.h, z6.h, z4.h[4]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[4]\n"
+      "fmla z13.h, z7.h, z1.h[4]\n"
+      "fmla z17.h, z7.h, z2.h[4]\n"
+      "fmla z21.h, z7.h, z3.h[4]\n"
+      "fmla z25.h, z7.h, z4.h[4]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[4]\n"
+      "fmla z14.h, z6.h, z1.h[4]\n"
+      "fmla z18.h, z6.h, z2.h[4]\n"
+      "fmla z22.h, z6.h, z3.h[4]\n"
+      "fmla z26.h, z6.h, z4.h[4]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[4]\n"
+      "fmla z15.h, z7.h, z1.h[4]\n"
+      "fmla z19.h, z7.h, z2.h[4]\n"
+      "fmla z23.h, z7.h, z3.h[4]\n"
+      "fmla z27.h, z7.h, z4.h[4]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[5]\n"
+      "fmla z12.h, z6.h, z1.h[5]\n"
+      "fmla z16.h, z6.h, z2.h[5]\n"
+      "fmla z20.h, z6.h, z3.h[5]\n"
+      "fmla z24.h, z6.h, z4.h[5]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[5]\n"
+      "fmla z13.h, z7.h, z1.h[5]\n"
+      "fmla z17.h, z7.h, z2.h[5]\n"
+      "fmla z21.h, z7.h, z3.h[5]\n"
+      "fmla z25.h, z7.h, z4.h[5]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+      "addvl x15, x15, #16\n"
+      "fmla z10.h, z6.h, z0.h[5]\n"
+      "fmla z14.h, z6.h, z1.h[5]\n"
+      "fmla z18.h, z6.h, z2.h[5]\n"
+      "fmla z22.h, z6.h, z3.h[5]\n"
+      "fmla z26.h, z6.h, z4.h[5]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[5]\n"
+      "fmla z15.h, z7.h, z1.h[5]\n"
+      "fmla z19.h, z7.h, z2.h[5]\n"
+      "fmla z23.h, z7.h, z3.h[5]\n"
+      "fmla z27.h, z7.h, z4.h[5]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[6]\n"
+      "fmla z12.h, z6.h, z1.h[6]\n"
+      "fmla z16.h, z6.h, z2.h[6]\n"
+      "fmla z20.h, z6.h, z3.h[6]\n"
+      "fmla z24.h, z6.h, z4.h[6]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[6]\n"
+      "fmla z13.h, z7.h, z1.h[6]\n"
+      "fmla z17.h, z7.h, z2.h[6]\n"
+      "fmla z21.h, z7.h, z3.h[6]\n"
+      "fmla z25.h, z7.h, z4.h[6]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[6]\n"
+      "fmla z14.h, z6.h, z1.h[6]\n"
+      "fmla z18.h, z6.h, z2.h[6]\n"
+      "fmla z22.h, z6.h, z3.h[6]\n"
+      "fmla z26.h, z6.h, z4.h[6]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[6]\n"
+      "fmla z15.h, z7.h, z1.h[6]\n"
+      "fmla z19.h, z7.h, z2.h[6]\n"
+      "fmla z23.h, z7.h, z3.h[6]\n"
+      "fmla z27.h, z7.h, z4.h[6]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[7]\n"
+      "fmla z12.h, z6.h, z1.h[7]\n"
+      "fmla z16.h, z6.h, z2.h[7]\n"
+      "fmla z20.h, z6.h, z3.h[7]\n"
+      "fmla z24.h, z6.h, z4.h[7]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[7]\n"
+      "fmla z13.h, z7.h, z1.h[7]\n"
+      "fmla z17.h, z7.h, z2.h[7]\n"
+      "fmla z21.h, z7.h, z3.h[7]\n"
+      "fmla z25.h, z7.h, z4.h[7]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[7]\n"
+      "fmla z14.h, z6.h, z1.h[7]\n"
+      "fmla z18.h, z6.h, z2.h[7]\n"
+      "fmla z22.h, z6.h, z3.h[7]\n"
+      "fmla z26.h, z6.h, z4.h[7]\n"
+      "fmla z11.h, z7.h, z0.h[7]\n"
+      "fmla z15.h, z7.h, z1.h[7]\n"
+      "fmla z19.h, z7.h, z2.h[7]\n"
+      "fmla z23.h, z7.h, z3.h[7]\n"
+      "fmla z27.h, z7.h, z4.h[7]\n"
+      "bgt 66b\n"
+      "67:"  // Height 5: Multiply loop: Single iteration only
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "whilelt p0.h, XZR, x11\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "ld1rqh { z0.h }, p0/Z, [x10]\n"
+      "fmla z8.h, z6.h, z0.h[0]\n"
+      "ld1rqh { z1.h }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z9.h, z7.h, z0.h[0]\n"
+      "ld1rqh { z2.h }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z12.h, z6.h, z1.h[0]\n"
+      "ld1rqh { z3.h }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "fmla z16.h, z6.h, z2.h[0]\n"
+      "ld1rqh { z4.h }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "fmla z13.h, z7.h, z1.h[0]\n"
+      "add x22, x22, #0x10\n"
+      "fmla z17.h, z7.h, z2.h[0]\n"
+      "fmla z20.h, z6.h, z3.h[0]\n"
+      "fmla z24.h, z6.h, z4.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z21.h, z7.h, z3.h[0]\n"
+      "fmla z25.h, z7.h, z4.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[0]\n"
+      "fmla z14.h, z6.h, z1.h[0]\n"
+      "fmla z18.h, z6.h, z2.h[0]\n"
+      "fmla z22.h, z6.h, z3.h[0]\n"
+      "fmla z26.h, z6.h, z4.h[0]\n"
+      "fmla z11.h, z7.h, z0.h[0]\n"
+      "fmla z15.h, z7.h, z1.h[0]\n"
+      "fmla z19.h, z7.h, z2.h[0]\n"
+      "fmla z23.h, z7.h, z3.h[0]\n"
+      "fmla z27.h, z7.h, z4.h[0]\n"
+      "ble 68f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[1]\n"
+      "fmla z16.h, z6.h, z2.h[1]\n"
+      "fmla z20.h, z6.h, z3.h[1]\n"
+      "fmla z24.h, z6.h, z4.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[1]\n"
+      "fmla z13.h, z7.h, z1.h[1]\n"
+      "fmla z17.h, z7.h, z2.h[1]\n"
+      "fmla z21.h, z7.h, z3.h[1]\n"
+      "fmla z25.h, z7.h, z4.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[1]\n"
+      "fmla z14.h, z6.h, z1.h[1]\n"
+      "fmla z18.h, z6.h, z2.h[1]\n"
+      "fmla z22.h, z6.h, z3.h[1]\n"
+      "fmla z26.h, z6.h, z4.h[1]\n"
+      "fmla z11.h, z7.h, z0.h[1]\n"
+      "fmla z15.h, z7.h, z1.h[1]\n"
+      "fmla z19.h, z7.h, z2.h[1]\n"
+      "fmla z23.h, z7.h, z3.h[1]\n"
+      "fmla z27.h, z7.h, z4.h[1]\n"
+      "ble 68f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[2]\n"
+      "fmla z16.h, z6.h, z2.h[2]\n"
+      "fmla z20.h, z6.h, z3.h[2]\n"
+      "fmla z24.h, z6.h, z4.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[2]\n"
+      "fmla z13.h, z7.h, z1.h[2]\n"
+      "fmla z17.h, z7.h, z2.h[2]\n"
+      "fmla z21.h, z7.h, z3.h[2]\n"
+      "fmla z25.h, z7.h, z4.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[2]\n"
+      "fmla z14.h, z6.h, z1.h[2]\n"
+      "fmla z18.h, z6.h, z2.h[2]\n"
+      "fmla z22.h, z6.h, z3.h[2]\n"
+      "fmla z26.h, z6.h, z4.h[2]\n"
+      "fmla z11.h, z7.h, z0.h[2]\n"
+      "fmla z15.h, z7.h, z1.h[2]\n"
+      "fmla z19.h, z7.h, z2.h[2]\n"
+      "fmla z23.h, z7.h, z3.h[2]\n"
+      "fmla z27.h, z7.h, z4.h[2]\n"
+      "ble 68f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[3]\n"
+      "fmla z16.h, z6.h, z2.h[3]\n"
+      "fmla z20.h, z6.h, z3.h[3]\n"
+      "fmla z24.h, z6.h, z4.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[3]\n"
+      "fmla z13.h, z7.h, z1.h[3]\n"
+      "fmla z17.h, z7.h, z2.h[3]\n"
+      "fmla z21.h, z7.h, z3.h[3]\n"
+      "fmla z25.h, z7.h, z4.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[3]\n"
+      "fmla z14.h, z6.h, z1.h[3]\n"
+      "fmla z18.h, z6.h, z2.h[3]\n"
+      "fmla z22.h, z6.h, z3.h[3]\n"
+      "fmla z26.h, z6.h, z4.h[3]\n"
+      "fmla z11.h, z7.h, z0.h[3]\n"
+      "fmla z15.h, z7.h, z1.h[3]\n"
+      "fmla z19.h, z7.h, z2.h[3]\n"
+      "fmla z23.h, z7.h, z3.h[3]\n"
+      "fmla z27.h, z7.h, z4.h[3]\n"
+      "ble 68f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[4]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[4]\n"
+      "fmla z16.h, z6.h, z2.h[4]\n"
+      "fmla z20.h, z6.h, z3.h[4]\n"
+      "fmla z24.h, z6.h, z4.h[4]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[4]\n"
+      "fmla z13.h, z7.h, z1.h[4]\n"
+      "fmla z17.h, z7.h, z2.h[4]\n"
+      "fmla z21.h, z7.h, z3.h[4]\n"
+      "fmla z25.h, z7.h, z4.h[4]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[4]\n"
+      "fmla z14.h, z6.h, z1.h[4]\n"
+      "fmla z18.h, z6.h, z2.h[4]\n"
+      "fmla z22.h, z6.h, z3.h[4]\n"
+      "fmla z26.h, z6.h, z4.h[4]\n"
+      "fmla z11.h, z7.h, z0.h[4]\n"
+      "fmla z15.h, z7.h, z1.h[4]\n"
+      "fmla z19.h, z7.h, z2.h[4]\n"
+      "fmla z23.h, z7.h, z3.h[4]\n"
+      "fmla z27.h, z7.h, z4.h[4]\n"
+      "ble 68f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[5]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[5]\n"
+      "fmla z16.h, z6.h, z2.h[5]\n"
+      "fmla z20.h, z6.h, z3.h[5]\n"
+      "fmla z24.h, z6.h, z4.h[5]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[5]\n"
+      "fmla z13.h, z7.h, z1.h[5]\n"
+      "fmla z17.h, z7.h, z2.h[5]\n"
+      "fmla z21.h, z7.h, z3.h[5]\n"
+      "fmla z25.h, z7.h, z4.h[5]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[5]\n"
+      "fmla z14.h, z6.h, z1.h[5]\n"
+      "fmla z18.h, z6.h, z2.h[5]\n"
+      "fmla z22.h, z6.h, z3.h[5]\n"
+      "fmla z26.h, z6.h, z4.h[5]\n"
+      "fmla z11.h, z7.h, z0.h[5]\n"
+      "fmla z15.h, z7.h, z1.h[5]\n"
+      "fmla z19.h, z7.h, z2.h[5]\n"
+      "fmla z23.h, z7.h, z3.h[5]\n"
+      "fmla z27.h, z7.h, z4.h[5]\n"
+      "ble 68f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[6]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[6]\n"
+      "fmla z16.h, z6.h, z2.h[6]\n"
+      "fmla z20.h, z6.h, z3.h[6]\n"
+      "fmla z24.h, z6.h, z4.h[6]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[6]\n"
+      "fmla z13.h, z7.h, z1.h[6]\n"
+      "fmla z17.h, z7.h, z2.h[6]\n"
+      "fmla z21.h, z7.h, z3.h[6]\n"
+      "fmla z25.h, z7.h, z4.h[6]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[6]\n"
+      "fmla z14.h, z6.h, z1.h[6]\n"
+      "fmla z18.h, z6.h, z2.h[6]\n"
+      "fmla z22.h, z6.h, z3.h[6]\n"
+      "fmla z26.h, z6.h, z4.h[6]\n"
+      "fmla z11.h, z7.h, z0.h[6]\n"
+      "fmla z15.h, z7.h, z1.h[6]\n"
+      "fmla z19.h, z7.h, z2.h[6]\n"
+      "fmla z23.h, z7.h, z3.h[6]\n"
+      "fmla z27.h, z7.h, z4.h[6]\n"
+      "ble 68f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[7]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "fmla z12.h, z6.h, z1.h[7]\n"
+      "fmla z16.h, z6.h, z2.h[7]\n"
+      "fmla z20.h, z6.h, z3.h[7]\n"
+      "fmla z24.h, z6.h, z4.h[7]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[7]\n"
+      "fmla z13.h, z7.h, z1.h[7]\n"
+      "fmla z17.h, z7.h, z2.h[7]\n"
+      "fmla z21.h, z7.h, z3.h[7]\n"
+      "fmla z25.h, z7.h, z4.h[7]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[7]\n"
+      "fmla z14.h, z6.h, z1.h[7]\n"
+      "fmla z18.h, z6.h, z2.h[7]\n"
+      "fmla z22.h, z6.h, z3.h[7]\n"
+      "fmla z26.h, z6.h, z4.h[7]\n"
+      "fmla z11.h, z7.h, z0.h[7]\n"
+      "fmla z15.h, z7.h, z1.h[7]\n"
+      "fmla z19.h, z7.h, z2.h[7]\n"
+      "fmla z23.h, z7.h, z3.h[7]\n"
+      "fmla z27.h, z7.h, z4.h[7]\n"
+      "68:"  // Height 5: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 63b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "tbz %x[flags], #1, 69f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rh { z1.h }, p5/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rh { z0.h }, p5/Z, [x19]\n"
+      "fmin z8.h, p5/M, z8.h, z0.h\n"
+      "fmin z9.h, p5/M, z9.h, z0.h\n"
+      "fmin z10.h, p5/M, z10.h, z0.h\n"
+      "fmin z11.h, p5/M, z11.h, z0.h\n"
+      "fmin z12.h, p5/M, z12.h, z0.h\n"
+      "fmax z8.h, p5/M, z8.h, z1.h\n"
+      "fmax z9.h, p5/M, z9.h, z1.h\n"
+      "fmax z10.h, p5/M, z10.h, z1.h\n"
+      "fmax z11.h, p5/M, z11.h, z1.h\n"
+      "fmax z12.h, p5/M, z12.h, z1.h\n"
+      "fmin z13.h, p5/M, z13.h, z0.h\n"
+      "fmin z14.h, p5/M, z14.h, z0.h\n"
+      "fmin z15.h, p5/M, z15.h, z0.h\n"
+      "fmin z16.h, p5/M, z16.h, z0.h\n"
+      "fmax z13.h, p5/M, z13.h, z1.h\n"
+      "fmax z14.h, p5/M, z14.h, z1.h\n"
+      "fmax z15.h, p5/M, z15.h, z1.h\n"
+      "fmax z16.h, p5/M, z16.h, z1.h\n"
+      "fmin z17.h, p5/M, z17.h, z0.h\n"
+      "fmin z18.h, p5/M, z18.h, z0.h\n"
+      "fmin z19.h, p5/M, z19.h, z0.h\n"
+      "fmin z20.h, p5/M, z20.h, z0.h\n"
+      "fmax z17.h, p5/M, z17.h, z1.h\n"
+      "fmax z18.h, p5/M, z18.h, z1.h\n"
+      "fmax z19.h, p5/M, z19.h, z1.h\n"
+      "fmax z20.h, p5/M, z20.h, z1.h\n"
+      "fmin z21.h, p5/M, z21.h, z0.h\n"
+      "fmin z22.h, p5/M, z22.h, z0.h\n"
+      "fmin z23.h, p5/M, z23.h, z0.h\n"
+      "fmin z24.h, p5/M, z24.h, z0.h\n"
+      "fmax z21.h, p5/M, z21.h, z1.h\n"
+      "fmax z22.h, p5/M, z22.h, z1.h\n"
+      "fmax z23.h, p5/M, z23.h, z1.h\n"
+      "fmax z24.h, p5/M, z24.h, z1.h\n"
+      "fmin z25.h, p5/M, z25.h, z0.h\n"
+      "fmin z26.h, p5/M, z26.h, z0.h\n"
+      "fmin z27.h, p5/M, z27.h, z0.h\n"
+      "fmax z25.h, p5/M, z25.h, z1.h\n"
+      "fmax z26.h, p5/M, z26.h, z1.h\n"
+      "fmax z27.h, p5/M, z27.h, z1.h\n"
+      "69:"  // Height 5: No activation
+      "st1h { z8.h }, p4, [x13]\n"
+      "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
+      "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
+      "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "st1h { z12.h }, p4, [x9]\n"
+      "st1h { z13.h }, p3, [x9, #1, MUL VL]\n"
+      "st1h { z14.h }, p2, [x9, #2, MUL VL]\n"
+      "st1h { z15.h }, p1, [x9, #3, MUL VL]\n"
+      "addvl x9, x9, #4\n"
+      "st1h { z16.h }, p4, [x27]\n"
+      "st1h { z17.h }, p3, [x27, #1, MUL VL]\n"
+      "st1h { z18.h }, p2, [x27, #2, MUL VL]\n"
+      "st1h { z19.h }, p1, [x27, #3, MUL VL]\n"
+      "addvl x27, x27, #4\n"
+      "st1h { z20.h }, p4, [x25]\n"
+      "st1h { z21.h }, p3, [x25, #1, MUL VL]\n"
+      "st1h { z22.h }, p2, [x25, #2, MUL VL]\n"
+      "st1h { z23.h }, p1, [x25, #3, MUL VL]\n"
+      "addvl x25, x25, #4\n"
+      "st1h { z24.h }, p4, [x23]\n"
+      "st1h { z25.h }, p3, [x23, #1, MUL VL]\n"
+      "st1h { z26.h }, p2, [x23, #2, MUL VL]\n"
+      "st1h { z27.h }, p1, [x23, #3, MUL VL]\n"
+      "addvl x23, x23, #4\n"
+      "70:"  // Height 5: Writeback done
+      "mov x19, #0x0\n"
+      "inch x19, ALL, MUL #4\n"
+      "subs x16, x16, x19\n"
+      "bgt 59b\n"
+      "b 86f\n"
+      "71:"  // Height 6
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 72f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #1\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #1\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "ldr x23, [%x[output_ptr], #0x20]\n"
+      "add x27, x27, x19, LSL #1\n"
+      "ldr x21, [%x[output_ptr], #0x28]\n"
+      "add %x[output_ptr], %x[output_ptr], #0x30\n"
+      "add x25, x25, x19, LSL #1\n"
+      "add x23, x23, x19, LSL #1\n"
+      "add x21, x21, x19, LSL #1\n"
+      "b 73f\n"
+      "72:"  // Height 6: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #1\n"
+      "add x27, x9, x19, LSL #1\n"
+      "add x25, x27, x19, LSL #1\n"
+      "add x23, x25, x19, LSL #1\n"
+      "add x21, x23, x19, LSL #1\n"
+      "add %x[output_ptr], x21, x19, LSL #1\n"
+      "73:"  // Height 6: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.h, x19, x16\n"
+      "inch x19\n"
+      "whilelt p3.h, x19, x16\n"
+      "inch x19\n"
+      "whilelt p2.h, x19, x16\n"
+      "inch x19\n"
+      "whilelt p1.h, x19, x16\n"
+      "cbz x14, 74f\n"
+      "ld1h { z8.h }, p5/Z, [x14]\n"
+      "mov z12.d, z8.d\n"
+      "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n"
+      "mov z16.d, z8.d\n"
+      "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n"
+      "mov z20.d, z8.d\n"
+      "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "mov z13.d, z9.d\n"
+      "mov z17.d, z9.d\n"
+      "mov z14.d, z10.d\n"
+      "mov z15.d, z11.d\n"
+      "mov z18.d, z10.d\n"
+      "mov z19.d, z11.d\n"
+      "mov z21.d, z9.d\n"
+      "mov z22.d, z10.d\n"
+      "mov z23.d, z11.d\n"
+      "mov z24.d, z8.d\n"
+      "mov z25.d, z9.d\n"
+      "mov z26.d, z10.d\n"
+      "mov z27.d, z11.d\n"
+      "mov z28.d, z8.d\n"
+      "mov z29.d, z9.d\n"
+      "mov z30.d, z10.d\n"
+      "mov z31.d, z11.d\n"
+      "b 76f\n"
+      "74:"  // Height 6: no bias
+      "tbz %x[flags], #0, 75f\n"
+      "ld1h { z8.h }, p4/Z, [x13]\n"
+      "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
+      "ld1h { z12.h }, p4/Z, [x9]\n"
+      "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n"
+      "ld1h { z16.h }, p4/Z, [x27]\n"
+      "ld1h { z17.h }, p3/Z, [x27, #1, MUL VL]\n"
+      "ld1h { z18.h }, p2/Z, [x27, #2, MUL VL]\n"
+      "ld1h { z19.h }, p1/Z, [x27, #3, MUL VL]\n"
+      "ld1h { z20.h }, p4/Z, [x25]\n"
+      "ld1h { z21.h }, p3/Z, [x25, #1, MUL VL]\n"
+      "ld1h { z22.h }, p2/Z, [x25, #2, MUL VL]\n"
+      "ld1h { z23.h }, p1/Z, [x25, #3, MUL VL]\n"
+      "ld1h { z24.h }, p4/Z, [x23]\n"
+      "ld1h { z25.h }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1h { z26.h }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1h { z27.h }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1h { z28.h }, p4/Z, [x21]\n"
+      "ld1h { z29.h }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1h { z30.h }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1h { z31.h }, p1/Z, [x21, #3, MUL VL]\n"
+      "b 76f\n"
+      "75:"  // Height 6: no accumulate
+      "mov z8.b, #0x0\n"
+      "mov z9.b, #0x0\n"
+      "mov z10.b, #0x0\n"
+      "mov z11.b, #0x0\n"
+      "mov z12.b, #0x0\n"
+      "mov z13.b, #0x0\n"
+      "mov z14.b, #0x0\n"
+      "mov z15.b, #0x0\n"
+      "mov z16.b, #0x0\n"
+      "mov z17.b, #0x0\n"
+      "mov z18.b, #0x0\n"
+      "mov z19.b, #0x0\n"
+      "mov z20.b, #0x0\n"
+      "mov z21.b, #0x0\n"
+      "mov z22.b, #0x0\n"
+      "mov z23.b, #0x0\n"
+      "mov z24.b, #0x0\n"
+      "mov z25.b, #0x0\n"
+      "mov z26.b, #0x0\n"
+      "mov z27.b, #0x0\n"
+      "mov z28.b, #0x0\n"
+      "mov z29.b, #0x0\n"
+      "mov z30.b, #0x0\n"
+      "mov z31.b, #0x0\n"
+      "76:"  // Height 6: setup done
+      "mov x12, #0x0\n"
+      "77:"  // Height 6: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 78f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x20, [x20, #0x28]\n"
+      "cbnz x12, 79f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #1\n"
+      "add x28, x28, x19, LSL #1\n"
+      "add x26, x26, x19, LSL #1\n"
+      "add x24, x24, x19, LSL #1\n"
+      "add x22, x22, x19, LSL #1\n"
+      "add x20, x20, x19, LSL #1\n"
+      "b 79f\n"
+      "78:"  // Height 6: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #1\n"
+      "add x26, x28, x19, LSL #1\n"
+      "add x24, x26, x19, LSL #1\n"
+      "add x22, x24, x19, LSL #1\n"
+      "add x20, x22, x19, LSL #1\n"
+      "79:"  // Height 6: input setup done
+      "cmp x11, #0x8\n"
+      "ble 81f\n"
+      "80:"  // Height 6: Multiply loop: Main loop head
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "whilelt p0.h, XZR, x11\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "sub x11, x11, #0x8\n"
+      "ld1rqh { z0.h }, p0/Z, [x10]\n"
+      "fmla z8.h, z6.h, z0.h[0]\n"
+      "ld1rqh { z1.h }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z9.h, z7.h, z0.h[0]\n"
+      "ld1rqh { z2.h }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z12.h, z6.h, z1.h[0]\n"
+      "ld1rqh { z3.h }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "fmla z16.h, z6.h, z2.h[0]\n"
+      "ld1rqh { z4.h }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "fmla z13.h, z7.h, z1.h[0]\n"
+      "ld1rqh { z5.h }, p0/Z, [x20]\n"
+      "add x22, x22, #0x10\n"
+      "fmla z20.h, z6.h, z3.h[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x20, x20, #0x10\n"
+      "fmla z24.h, z6.h, z4.h[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "cmp x11, #0x8\n"
+      "fmla z28.h, z6.h, z5.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z17.h, z7.h, z2.h[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "fmla z21.h, z7.h, z3.h[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "fmla z25.h, z7.h, z4.h[0]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "fmla z29.h, z7.h, z5.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[0]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "fmla z14.h, z6.h, z1.h[0]\n"
+      "fmla z18.h, z6.h, z2.h[0]\n"
+      "fmla z22.h, z6.h, z3.h[0]\n"
+      "fmla z26.h, z6.h, z4.h[0]\n"
+      "fmla z30.h, z6.h, z5.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[0]\n"
+      "fmla z15.h, z7.h, z1.h[0]\n"
+      "fmla z19.h, z7.h, z2.h[0]\n"
+      "fmla z23.h, z7.h, z3.h[0]\n"
+      "fmla z27.h, z7.h, z4.h[0]\n"
+      "fmla z31.h, z7.h, z5.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[1]\n"
+      "fmla z12.h, z6.h, z1.h[1]\n"
+      "fmla z16.h, z6.h, z2.h[1]\n"
+      "fmla z20.h, z6.h, z3.h[1]\n"
+      "fmla z24.h, z6.h, z4.h[1]\n"
+      "fmla z28.h, z6.h, z5.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[1]\n"
+      "fmla z13.h, z7.h, z1.h[1]\n"
+      "fmla z17.h, z7.h, z2.h[1]\n"
+      "fmla z21.h, z7.h, z3.h[1]\n"
+      "fmla z25.h, z7.h, z4.h[1]\n"
+      "fmla z29.h, z7.h, z5.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+      "addvl x15, x15, #16\n"
+      "fmla z10.h, z6.h, z0.h[1]\n"
+      "fmla z14.h, z6.h, z1.h[1]\n"
+      "fmla z18.h, z6.h, z2.h[1]\n"
+      "fmla z22.h, z6.h, z3.h[1]\n"
+      "fmla z26.h, z6.h, z4.h[1]\n"
+      "fmla z30.h, z6.h, z5.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[1]\n"
+      "fmla z15.h, z7.h, z1.h[1]\n"
+      "fmla z19.h, z7.h, z2.h[1]\n"
+      "fmla z23.h, z7.h, z3.h[1]\n"
+      "fmla z27.h, z7.h, z4.h[1]\n"
+      "fmla z31.h, z7.h, z5.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[2]\n"
+      "fmla z12.h, z6.h, z1.h[2]\n"
+      "fmla z16.h, z6.h, z2.h[2]\n"
+      "fmla z20.h, z6.h, z3.h[2]\n"
+      "fmla z24.h, z6.h, z4.h[2]\n"
+      "fmla z28.h, z6.h, z5.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[2]\n"
+      "fmla z13.h, z7.h, z1.h[2]\n"
+      "fmla z17.h, z7.h, z2.h[2]\n"
+      "fmla z21.h, z7.h, z3.h[2]\n"
+      "fmla z25.h, z7.h, z4.h[2]\n"
+      "fmla z29.h, z7.h, z5.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[2]\n"
+      "fmla z14.h, z6.h, z1.h[2]\n"
+      "fmla z18.h, z6.h, z2.h[2]\n"
+      "fmla z22.h, z6.h, z3.h[2]\n"
+      "fmla z26.h, z6.h, z4.h[2]\n"
+      "fmla z30.h, z6.h, z5.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[2]\n"
+      "fmla z15.h, z7.h, z1.h[2]\n"
+      "fmla z19.h, z7.h, z2.h[2]\n"
+      "fmla z23.h, z7.h, z3.h[2]\n"
+      "fmla z27.h, z7.h, z4.h[2]\n"
+      "fmla z31.h, z7.h, z5.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[3]\n"
+      "fmla z12.h, z6.h, z1.h[3]\n"
+      "fmla z16.h, z6.h, z2.h[3]\n"
+      "fmla z20.h, z6.h, z3.h[3]\n"
+      "fmla z24.h, z6.h, z4.h[3]\n"
+      "fmla z28.h, z6.h, z5.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[3]\n"
+      "fmla z13.h, z7.h, z1.h[3]\n"
+      "fmla z17.h, z7.h, z2.h[3]\n"
+      "fmla z21.h, z7.h, z3.h[3]\n"
+      "fmla z25.h, z7.h, z4.h[3]\n"
+      "fmla z29.h, z7.h, z5.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[3]\n"
+      "fmla z14.h, z6.h, z1.h[3]\n"
+      "fmla z18.h, z6.h, z2.h[3]\n"
+      "fmla z22.h, z6.h, z3.h[3]\n"
+      "fmla z26.h, z6.h, z4.h[3]\n"
+      "fmla z30.h, z6.h, z5.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z11.h, z7.h, z0.h[3]\n"
+      "fmla z15.h, z7.h, z1.h[3]\n"
+      "fmla z19.h, z7.h, z2.h[3]\n"
+      "fmla z23.h, z7.h, z3.h[3]\n"
+      "fmla z27.h, z7.h, z4.h[3]\n"
+      "fmla z31.h, z7.h, z5.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[4]\n"
+      "fmla z12.h, z6.h, z1.h[4]\n"
+      "fmla z16.h, z6.h, z2.h[4]\n"
+      "fmla z20.h, z6.h, z3.h[4]\n"
+      "fmla z24.h, z6.h, z4.h[4]\n"
+      "fmla z28.h, z6.h, z5.h[4]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[4]\n"
+      "fmla z13.h, z7.h, z1.h[4]\n"
+      "fmla z17.h, z7.h, z2.h[4]\n"
+      "fmla z21.h, z7.h, z3.h[4]\n"
+      "fmla z25.h, z7.h, z4.h[4]\n"
+      "fmla z29.h, z7.h, z5.h[4]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[4]\n"
+      "fmla z14.h, z6.h, z1.h[4]\n"
+      "fmla z18.h, z6.h, z2.h[4]\n"
+      "fmla z22.h, z6.h, z3.h[4]\n"
+      "fmla z26.h, z6.h, z4.h[4]\n"
+      "fmla z30.h, z6.h, z5.h[4]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[4]\n"
+      "fmla z15.h, z7.h, z1.h[4]\n"
+      "fmla z19.h, z7.h, z2.h[4]\n"
+      "fmla z23.h, z7.h, z3.h[4]\n"
+      "fmla z27.h, z7.h, z4.h[4]\n"
+      "fmla z31.h, z7.h, z5.h[4]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[5]\n"
+      "fmla z12.h, z6.h, z1.h[5]\n"
+      "fmla z16.h, z6.h, z2.h[5]\n"
+      "fmla z20.h, z6.h, z3.h[5]\n"
+      "fmla z24.h, z6.h, z4.h[5]\n"
+      "fmla z28.h, z6.h, z5.h[5]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[5]\n"
+      "fmla z13.h, z7.h, z1.h[5]\n"
+      "fmla z17.h, z7.h, z2.h[5]\n"
+      "fmla z21.h, z7.h, z3.h[5]\n"
+      "fmla z25.h, z7.h, z4.h[5]\n"
+      "fmla z29.h, z7.h, z5.h[5]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+      "addvl x15, x15, #16\n"
+      "fmla z10.h, z6.h, z0.h[5]\n"
+      "fmla z14.h, z6.h, z1.h[5]\n"
+      "fmla z18.h, z6.h, z2.h[5]\n"
+      "fmla z22.h, z6.h, z3.h[5]\n"
+      "fmla z26.h, z6.h, z4.h[5]\n"
+      "fmla z30.h, z6.h, z5.h[5]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[5]\n"
+      "fmla z15.h, z7.h, z1.h[5]\n"
+      "fmla z19.h, z7.h, z2.h[5]\n"
+      "fmla z23.h, z7.h, z3.h[5]\n"
+      "fmla z27.h, z7.h, z4.h[5]\n"
+      "fmla z31.h, z7.h, z5.h[5]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[6]\n"
+      "fmla z12.h, z6.h, z1.h[6]\n"
+      "fmla z16.h, z6.h, z2.h[6]\n"
+      "fmla z20.h, z6.h, z3.h[6]\n"
+      "fmla z24.h, z6.h, z4.h[6]\n"
+      "fmla z28.h, z6.h, z5.h[6]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[6]\n"
+      "fmla z13.h, z7.h, z1.h[6]\n"
+      "fmla z17.h, z7.h, z2.h[6]\n"
+      "fmla z21.h, z7.h, z3.h[6]\n"
+      "fmla z25.h, z7.h, z4.h[6]\n"
+      "fmla z29.h, z7.h, z5.h[6]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[6]\n"
+      "fmla z14.h, z6.h, z1.h[6]\n"
+      "fmla z18.h, z6.h, z2.h[6]\n"
+      "fmla z22.h, z6.h, z3.h[6]\n"
+      "fmla z26.h, z6.h, z4.h[6]\n"
+      "fmla z30.h, z6.h, z5.h[6]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+      "fmla z11.h, z7.h, z0.h[6]\n"
+      "fmla z15.h, z7.h, z1.h[6]\n"
+      "fmla z19.h, z7.h, z2.h[6]\n"
+      "fmla z23.h, z7.h, z3.h[6]\n"
+      "fmla z27.h, z7.h, z4.h[6]\n"
+      "fmla z31.h, z7.h, z5.h[6]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+      "fmla z8.h, z6.h, z0.h[7]\n"
+      "fmla z12.h, z6.h, z1.h[7]\n"
+      "fmla z16.h, z6.h, z2.h[7]\n"
+      "fmla z20.h, z6.h, z3.h[7]\n"
+      "fmla z24.h, z6.h, z4.h[7]\n"
+      "fmla z28.h, z6.h, z5.h[7]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[7]\n"
+      "fmla z13.h, z7.h, z1.h[7]\n"
+      "fmla z17.h, z7.h, z2.h[7]\n"
+      "fmla z21.h, z7.h, z3.h[7]\n"
+      "fmla z25.h, z7.h, z4.h[7]\n"
+      "fmla z29.h, z7.h, z5.h[7]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+      "fmla z10.h, z6.h, z0.h[7]\n"
+      "fmla z14.h, z6.h, z1.h[7]\n"
+      "fmla z18.h, z6.h, z2.h[7]\n"
+      "fmla z22.h, z6.h, z3.h[7]\n"
+      "fmla z26.h, z6.h, z4.h[7]\n"
+      "fmla z30.h, z6.h, z5.h[7]\n"
+      "fmla z11.h, z7.h, z0.h[7]\n"
+      "fmla z15.h, z7.h, z1.h[7]\n"
+      "fmla z19.h, z7.h, z2.h[7]\n"
+      "fmla z23.h, z7.h, z3.h[7]\n"
+      "fmla z27.h, z7.h, z4.h[7]\n"
+      "fmla z31.h, z7.h, z5.h[7]\n"
+      "bgt 80b\n"
+      "81:"  // Height 6: Multiply loop: Single iteration only
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "whilelt p0.h, XZR, x11\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "ld1rqh { z0.h }, p0/Z, [x10]\n"
+      "fmla z8.h, z6.h, z0.h[0]\n"
+      "ld1rqh { z1.h }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z9.h, z7.h, z0.h[0]\n"
+      "ld1rqh { z2.h }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z12.h, z6.h, z1.h[0]\n"
+      "ld1rqh { z3.h }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "fmla z16.h, z6.h, z2.h[0]\n"
+      "ld1rqh { z4.h }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "fmla z13.h, z7.h, z1.h[0]\n"
+      "ld1rqh { z5.h }, p0/Z, [x20]\n"
+      "add x22, x22, #0x10\n"
+      "fmla z20.h, z6.h, z3.h[0]\n"
+      "add x20, x20, #0x10\n"
+      "fmla z17.h, z7.h, z2.h[0]\n"
+      "fmla z24.h, z6.h, z4.h[0]\n"
+      "fmla z28.h, z6.h, z5.h[0]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z21.h, z7.h, z3.h[0]\n"
+      "fmla z25.h, z7.h, z4.h[0]\n"
+      "fmla z29.h, z7.h, z5.h[0]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[0]\n"
+      "fmla z14.h, z6.h, z1.h[0]\n"
+      "fmla z18.h, z6.h, z2.h[0]\n"
+      "fmla z22.h, z6.h, z3.h[0]\n"
+      "fmla z26.h, z6.h, z4.h[0]\n"
+      "fmla z30.h, z6.h, z5.h[0]\n"
+      "fmla z11.h, z7.h, z0.h[0]\n"
+      "fmla z15.h, z7.h, z1.h[0]\n"
+      "fmla z19.h, z7.h, z2.h[0]\n"
+      "fmla z23.h, z7.h, z3.h[0]\n"
+      "fmla z27.h, z7.h, z4.h[0]\n"
+      "fmla z31.h, z7.h, z5.h[0]\n"
+      "ble 82f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[1]\n"
+      "fmla z16.h, z6.h, z2.h[1]\n"
+      "fmla z20.h, z6.h, z3.h[1]\n"
+      "fmla z24.h, z6.h, z4.h[1]\n"
+      "fmla z28.h, z6.h, z5.h[1]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[1]\n"
+      "fmla z13.h, z7.h, z1.h[1]\n"
+      "fmla z17.h, z7.h, z2.h[1]\n"
+      "fmla z21.h, z7.h, z3.h[1]\n"
+      "fmla z25.h, z7.h, z4.h[1]\n"
+      "fmla z29.h, z7.h, z5.h[1]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[1]\n"
+      "fmla z14.h, z6.h, z1.h[1]\n"
+      "fmla z18.h, z6.h, z2.h[1]\n"
+      "fmla z22.h, z6.h, z3.h[1]\n"
+      "fmla z26.h, z6.h, z4.h[1]\n"
+      "fmla z30.h, z6.h, z5.h[1]\n"
+      "fmla z11.h, z7.h, z0.h[1]\n"
+      "fmla z15.h, z7.h, z1.h[1]\n"
+      "fmla z19.h, z7.h, z2.h[1]\n"
+      "fmla z23.h, z7.h, z3.h[1]\n"
+      "fmla z27.h, z7.h, z4.h[1]\n"
+      "fmla z31.h, z7.h, z5.h[1]\n"
+      "ble 82f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[2]\n"
+      "fmla z16.h, z6.h, z2.h[2]\n"
+      "fmla z20.h, z6.h, z3.h[2]\n"
+      "fmla z24.h, z6.h, z4.h[2]\n"
+      "fmla z28.h, z6.h, z5.h[2]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[2]\n"
+      "fmla z13.h, z7.h, z1.h[2]\n"
+      "fmla z17.h, z7.h, z2.h[2]\n"
+      "fmla z21.h, z7.h, z3.h[2]\n"
+      "fmla z25.h, z7.h, z4.h[2]\n"
+      "fmla z29.h, z7.h, z5.h[2]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[2]\n"
+      "fmla z14.h, z6.h, z1.h[2]\n"
+      "fmla z18.h, z6.h, z2.h[2]\n"
+      "fmla z22.h, z6.h, z3.h[2]\n"
+      "fmla z26.h, z6.h, z4.h[2]\n"
+      "fmla z30.h, z6.h, z5.h[2]\n"
+      "fmla z11.h, z7.h, z0.h[2]\n"
+      "fmla z15.h, z7.h, z1.h[2]\n"
+      "fmla z19.h, z7.h, z2.h[2]\n"
+      "fmla z23.h, z7.h, z3.h[2]\n"
+      "fmla z27.h, z7.h, z4.h[2]\n"
+      "fmla z31.h, z7.h, z5.h[2]\n"
+      "ble 82f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[3]\n"
+      "fmla z16.h, z6.h, z2.h[3]\n"
+      "fmla z20.h, z6.h, z3.h[3]\n"
+      "fmla z24.h, z6.h, z4.h[3]\n"
+      "fmla z28.h, z6.h, z5.h[3]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[3]\n"
+      "fmla z13.h, z7.h, z1.h[3]\n"
+      "fmla z17.h, z7.h, z2.h[3]\n"
+      "fmla z21.h, z7.h, z3.h[3]\n"
+      "fmla z25.h, z7.h, z4.h[3]\n"
+      "fmla z29.h, z7.h, z5.h[3]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[3]\n"
+      "fmla z14.h, z6.h, z1.h[3]\n"
+      "fmla z18.h, z6.h, z2.h[3]\n"
+      "fmla z22.h, z6.h, z3.h[3]\n"
+      "fmla z26.h, z6.h, z4.h[3]\n"
+      "fmla z30.h, z6.h, z5.h[3]\n"
+      "fmla z11.h, z7.h, z0.h[3]\n"
+      "fmla z15.h, z7.h, z1.h[3]\n"
+      "fmla z19.h, z7.h, z2.h[3]\n"
+      "fmla z23.h, z7.h, z3.h[3]\n"
+      "fmla z27.h, z7.h, z4.h[3]\n"
+      "fmla z31.h, z7.h, z5.h[3]\n"
+      "ble 82f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[4]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[4]\n"
+      "fmla z16.h, z6.h, z2.h[4]\n"
+      "fmla z20.h, z6.h, z3.h[4]\n"
+      "fmla z24.h, z6.h, z4.h[4]\n"
+      "fmla z28.h, z6.h, z5.h[4]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[4]\n"
+      "fmla z13.h, z7.h, z1.h[4]\n"
+      "fmla z17.h, z7.h, z2.h[4]\n"
+      "fmla z21.h, z7.h, z3.h[4]\n"
+      "fmla z25.h, z7.h, z4.h[4]\n"
+      "fmla z29.h, z7.h, z5.h[4]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[4]\n"
+      "fmla z14.h, z6.h, z1.h[4]\n"
+      "fmla z18.h, z6.h, z2.h[4]\n"
+      "fmla z22.h, z6.h, z3.h[4]\n"
+      "fmla z26.h, z6.h, z4.h[4]\n"
+      "fmla z30.h, z6.h, z5.h[4]\n"
+      "fmla z11.h, z7.h, z0.h[4]\n"
+      "fmla z15.h, z7.h, z1.h[4]\n"
+      "fmla z19.h, z7.h, z2.h[4]\n"
+      "fmla z23.h, z7.h, z3.h[4]\n"
+      "fmla z27.h, z7.h, z4.h[4]\n"
+      "fmla z31.h, z7.h, z5.h[4]\n"
+      "ble 82f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[5]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[5]\n"
+      "fmla z16.h, z6.h, z2.h[5]\n"
+      "fmla z20.h, z6.h, z3.h[5]\n"
+      "fmla z24.h, z6.h, z4.h[5]\n"
+      "fmla z28.h, z6.h, z5.h[5]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[5]\n"
+      "fmla z13.h, z7.h, z1.h[5]\n"
+      "fmla z17.h, z7.h, z2.h[5]\n"
+      "fmla z21.h, z7.h, z3.h[5]\n"
+      "fmla z25.h, z7.h, z4.h[5]\n"
+      "fmla z29.h, z7.h, z5.h[5]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[5]\n"
+      "fmla z14.h, z6.h, z1.h[5]\n"
+      "fmla z18.h, z6.h, z2.h[5]\n"
+      "fmla z22.h, z6.h, z3.h[5]\n"
+      "fmla z26.h, z6.h, z4.h[5]\n"
+      "fmla z30.h, z6.h, z5.h[5]\n"
+      "fmla z11.h, z7.h, z0.h[5]\n"
+      "fmla z15.h, z7.h, z1.h[5]\n"
+      "fmla z19.h, z7.h, z2.h[5]\n"
+      "fmla z23.h, z7.h, z3.h[5]\n"
+      "fmla z27.h, z7.h, z4.h[5]\n"
+      "fmla z31.h, z7.h, z5.h[5]\n"
+      "ble 82f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[6]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.h, z6.h, z1.h[6]\n"
+      "fmla z16.h, z6.h, z2.h[6]\n"
+      "fmla z20.h, z6.h, z3.h[6]\n"
+      "fmla z24.h, z6.h, z4.h[6]\n"
+      "fmla z28.h, z6.h, z5.h[6]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[6]\n"
+      "fmla z13.h, z7.h, z1.h[6]\n"
+      "fmla z17.h, z7.h, z2.h[6]\n"
+      "fmla z21.h, z7.h, z3.h[6]\n"
+      "fmla z25.h, z7.h, z4.h[6]\n"
+      "fmla z29.h, z7.h, z5.h[6]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[6]\n"
+      "fmla z14.h, z6.h, z1.h[6]\n"
+      "fmla z18.h, z6.h, z2.h[6]\n"
+      "fmla z22.h, z6.h, z3.h[6]\n"
+      "fmla z26.h, z6.h, z4.h[6]\n"
+      "fmla z30.h, z6.h, z5.h[6]\n"
+      "fmla z11.h, z7.h, z0.h[6]\n"
+      "fmla z15.h, z7.h, z1.h[6]\n"
+      "fmla z19.h, z7.h, z2.h[6]\n"
+      "fmla z23.h, z7.h, z3.h[6]\n"
+      "fmla z27.h, z7.h, z4.h[6]\n"
+      "fmla z31.h, z7.h, z5.h[6]\n"
+      "ble 82f\n"
+      "ld1h { z6.h }, p5/Z, [x15]\n"
+      "fmla z8.h, z6.h, z0.h[7]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+      "fmla z12.h, z6.h, z1.h[7]\n"
+      "fmla z16.h, z6.h, z2.h[7]\n"
+      "fmla z20.h, z6.h, z3.h[7]\n"
+      "fmla z24.h, z6.h, z4.h[7]\n"
+      "fmla z28.h, z6.h, z5.h[7]\n"
+      "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.h, z7.h, z0.h[7]\n"
+      "fmla z13.h, z7.h, z1.h[7]\n"
+      "fmla z17.h, z7.h, z2.h[7]\n"
+      "fmla z21.h, z7.h, z3.h[7]\n"
+      "fmla z25.h, z7.h, z4.h[7]\n"
+      "fmla z29.h, z7.h, z5.h[7]\n"
+      "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.h, z6.h, z0.h[7]\n"
+      "fmla z14.h, z6.h, z1.h[7]\n"
+      "fmla z18.h, z6.h, z2.h[7]\n"
+      "fmla z22.h, z6.h, z3.h[7]\n"
+      "fmla z26.h, z6.h, z4.h[7]\n"
+      "fmla z30.h, z6.h, z5.h[7]\n"
+      "fmla z11.h, z7.h, z0.h[7]\n"
+      "fmla z15.h, z7.h, z1.h[7]\n"
+      "fmla z19.h, z7.h, z2.h[7]\n"
+      "fmla z23.h, z7.h, z3.h[7]\n"
+      "fmla z27.h, z7.h, z4.h[7]\n"
+      "fmla z31.h, z7.h, z5.h[7]\n"
+      "82:"  // Height 6: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 77b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "prfm pstl1keep, [x21, #0x0]\n"
+      "tbz %x[flags], #1, 83f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rh { z1.h }, p5/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rh { z0.h }, p5/Z, [x19]\n"
+      "fmin z8.h, p5/M, z8.h, z0.h\n"
+      "fmin z9.h, p5/M, z9.h, z0.h\n"
+      "fmin z10.h, p5/M, z10.h, z0.h\n"
+      "fmin z11.h, p5/M, z11.h, z0.h\n"
+      "fmin z12.h, p5/M, z12.h, z0.h\n"
+      "fmax z8.h, p5/M, z8.h, z1.h\n"
+      "fmax z9.h, p5/M, z9.h, z1.h\n"
+      "fmax z10.h, p5/M, z10.h, z1.h\n"
+      "fmax z11.h, p5/M, z11.h, z1.h\n"
+      "fmax z12.h, p5/M, z12.h, z1.h\n"
+      "fmin z13.h, p5/M, z13.h, z0.h\n"
+      "fmin z14.h, p5/M, z14.h, z0.h\n"
+      "fmin z15.h, p5/M, z15.h, z0.h\n"
+      "fmin z16.h, p5/M, z16.h, z0.h\n"
+      "fmax z13.h, p5/M, z13.h, z1.h\n"
+      "fmax z14.h, p5/M, z14.h, z1.h\n"
+      "fmax z15.h, p5/M, z15.h, z1.h\n"
+      "fmax z16.h, p5/M, z16.h, z1.h\n"
+      "fmin z17.h, p5/M, z17.h, z0.h\n"
+      "fmin z18.h, p5/M, z18.h, z0.h\n"
+      "fmin z19.h, p5/M, z19.h, z0.h\n"
+      "fmin z20.h, p5/M, z20.h, z0.h\n"
+      "fmax z17.h, p5/M, z17.h, z1.h\n"
+      "fmax z18.h, p5/M, z18.h, z1.h\n"
+      "fmax z19.h, p5/M, z19.h, z1.h\n"
+      "fmax z20.h, p5/M, z20.h, z1.h\n"
+      "fmin z21.h, p5/M, z21.h, z0.h\n"
+      "fmin z22.h, p5/M, z22.h, z0.h\n"
+      "fmin z23.h, p5/M, z23.h, z0.h\n"
+      "fmin z24.h, p5/M, z24.h, z0.h\n"
+      "fmax z21.h, p5/M, z21.h, z1.h\n"
+      "fmax z22.h, p5/M, z22.h, z1.h\n"
+      "fmax z23.h, p5/M, z23.h, z1.h\n"
+      "fmax z24.h, p5/M, z24.h, z1.h\n"
+      "fmin z25.h, p5/M, z25.h, z0.h\n"
+      "fmin z26.h, p5/M, z26.h, z0.h\n"
+      "fmin z27.h, p5/M, z27.h, z0.h\n"
+      "fmin z28.h, p5/M, z28.h, z0.h\n"
+      "fmax z25.h, p5/M, z25.h, z1.h\n"
+      "fmax z26.h, p5/M, z26.h, z1.h\n"
+      "fmax z27.h, p5/M, z27.h, z1.h\n"
+      "fmax z28.h, p5/M, z28.h, z1.h\n"
+      "fmin z29.h, p5/M, z29.h, z0.h\n"
+      "fmin z30.h, p5/M, z30.h, z0.h\n"
+      "fmin z31.h, p5/M, z31.h, z0.h\n"
+      "fmax z29.h, p5/M, z29.h, z1.h\n"
+      "fmax z30.h, p5/M, z30.h, z1.h\n"
+      "fmax z31.h, p5/M, z31.h, z1.h\n"
+      "83:"  // Height 6: No activation
+      "st1h { z8.h }, p4, [x13]\n"
+      "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
+      "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
+      "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "st1h { z12.h }, p4, [x9]\n"
+      "st1h { z13.h }, p3, [x9, #1, MUL VL]\n"
+      "st1h { z14.h }, p2, [x9, #2, MUL VL]\n"
+      "st1h { z15.h }, p1, [x9, #3, MUL VL]\n"
+      "addvl x9, x9, #4\n"
+      "st1h { z16.h }, p4, [x27]\n"
+      "st1h { z17.h }, p3, [x27, #1, MUL VL]\n"
+      "st1h { z18.h }, p2, [x27, #2, MUL VL]\n"
+      "st1h { z19.h }, p1, [x27, #3, MUL VL]\n"
+      "addvl x27, x27, #4\n"
+      "st1h { z20.h }, p4, [x25]\n"
+      "st1h { z21.h }, p3, [x25, #1, MUL VL]\n"
+      "st1h { z22.h }, p2, [x25, #2, MUL VL]\n"
+      "st1h { z23.h }, p1, [x25, #3, MUL VL]\n"
+      "addvl x25, x25, #4\n"
+      "st1h { z24.h }, p4, [x23]\n"
+      "st1h { z25.h }, p3, [x23, #1, MUL VL]\n"
+      "st1h { z26.h }, p2, [x23, #2, MUL VL]\n"
+      "st1h { z27.h }, p1, [x23, #3, MUL VL]\n"
+      "addvl x23, x23, #4\n"
+      "st1h { z28.h }, p4, [x21]\n"
+      "st1h { z29.h }, p3, [x21, #1, MUL VL]\n"
+      "st1h { z30.h }, p2, [x21, #2, MUL VL]\n"
+      "st1h { z31.h }, p1, [x21, #3, MUL VL]\n"
+      "addvl x21, x21, #4\n"
+      "84:"  // Height 6: Writeback done
+      "mov x19, #0x0\n"
+      "inch x19, ALL, MUL #4\n"
+      "subs x16, x16, x19\n"
+      "bgt 73b\n"
+      "subs %x[M], %x[M], #0x6\n"
+      "beq 86f\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "tbz %x[flags], #3, 85f\n"
+      "add x20, x20, #0x6\n"
+      "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "b 1b\n"
+      "85:"  // Update direct input
+      "mov x19, #0xc\n"
+      "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+      "b 1b\n"
+      "86:"  // Exit
+
+      : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+      : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+      : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+    );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp
deleted file mode 100644
index ce3624340e..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp
+++ /dev/null
@@ -1,2118 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_fp32_mla_4VLx4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
-    const int K_stride = K;
-    const long loops_count = ((K + 4) / 8) - 1;
-    K -= loops_count * 8;
-    const long regs_count = (K / 4) - 1;
-    K -= (regs_count + 1) * 4;
-    const long leftovers = K;
-    float nullbias[256];
-    if (!accumulate && !bias) {
-        memset(nullbias, 0, (4 * get_vector_length<float>() * sizeof(float)));
-    }
-    float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
-    float maxval =   static_cast<float>(std::numeric_limits<float>::infinity());
-    const float * const minptr = &minval;
-    const float * const maxptr = &maxval;
-
-    switch(act.type)
-    {
-        default:
-        case Activation::Type::None:
-            break;
-        case Activation::Type::BoundedReLU:
-            maxval = static_cast<float>(act.param1);
-            /* fall through */
-        case Activation::Type::ReLU:
-            minval = 0.0f;
-            break;
-    }
-
-    int rows_to_compute;
-
-    for (int y=0; y<M; y+=rows_to_compute) {
-        const float * const a_ptr0_base = A + (y * lda);
-        const unsigned long ldab = lda * sizeof(float);
-
-        float *c_ptr0 = C + (y * ldc);
-
-        rows_to_compute = M-y;
-        if (rows_to_compute > 4) {
-            if (rows_to_compute % 4) {
-                rows_to_compute = 4 - 1;
-            } else {
-                rows_to_compute = 4;
-            }
-        }
-
-        for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) {
-            const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>()));
-            long loops = loops_count;
-            long regs = regs_count;
-            long temp = 0;
-            long blocks = leftovers;
-            const float *a_ptr0 = a_ptr0_base;
-            const float *b_ptr0 = B + (K_stride * x0);
-            const unsigned long ldcb = ldc * sizeof(float);
-            const float *biasptr = bias ? bias+x0 : nullbias;
-
-            switch(rows_to_compute) {
-                case 1:
-                    __asm __volatile (
-                        "whilelt p6.s, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.s\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ld1w z16.s, p0/z, [%[biasptr]]\n"
-                        "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z16.s, z12.s, z0.s[3]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla z17.s, z13.s, z0.s[3]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[3]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[3]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[0]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.s, z9.s, z4.s[0]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.s, z10.s, z4.s[0]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z4.s[0]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[1]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "fmla z17.s, z13.s, z4.s[1]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "fmla z18.s, z14.s, z4.s[1]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z19.s, z15.s, z4.s[1]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[2]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.s, z9.s, z4.s[2]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.s, z10.s, z4.s[2]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.s, z11.s, z4.s[2]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[3]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z17.s, z13.s, z4.s[3]\n"
-                        "fmla z18.s, z14.s, z4.s[3]\n"
-                        "fmla z19.s, z15.s, z4.s[3]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "fmla z16.s, z12.s, z0.s[3]\n"
-                        "fmla z17.s, z13.s, z0.s[3]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[3]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[3]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[0]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.s, z9.s, z4.s[0]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.s, z10.s, z4.s[0]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z4.s[0]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[1]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z13.s, z4.s[1]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z14.s, z4.s[1]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z4.s[1]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[2]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.s, z9.s, z4.s[2]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.s, z10.s, z4.s[2]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.s, z11.s, z4.s[2]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[3]\n"
-                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
-                        "fmla z17.s, z13.s, z4.s[3]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "fmla z18.s, z14.s, z4.s[3]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "fmla z19.s, z15.s, z4.s[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "b.eq 5f\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "fmla z16.s, z12.s, z0.s[3]\n"
-                        "fmla z17.s, z13.s, z0.s[3]\n"
-                        "fmla z18.s, z14.s, z0.s[3]\n"
-                        "fmla z19.s, z15.s, z0.s[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[0]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z17.s, z9.s, z4.s[0]\n"
-                        "fmla z18.s, z10.s, z4.s[0]\n"
-                        "fmla z19.s, z11.s, z4.s[0]\n"
-                        "b.eq 5f\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[1]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z17.s, z13.s, z4.s[1]\n"
-                        "fmla z18.s, z14.s, z4.s[1]\n"
-                        "fmla z19.s, z15.s, z4.s[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[2]\n"
-                        "fmla z17.s, z9.s, z4.s[2]\n"
-                        "fmla z18.s, z10.s, z4.s[2]\n"
-                        "fmla z19.s, z11.s, z4.s[2]\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
-                    );
-                    break;
-                case 2:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "whilelt p6.s, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.s\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ld1w z16.s, p0/z, [%[biasptr]]\n"
-                        "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "mov z20.d, z16.d\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
-                        "mov z21.d, z17.d\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
-                        "mov z22.d, z18.d\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "mov z23.d, z19.d\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1w z20.s, p0/z, [c_ptr1]\n"
-                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
-                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z20.s, z8.s, z1.s[0]\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
-                        "fmla z21.s, z9.s, z1.s[0]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z22.s, z10.s, z1.s[0]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla z23.s, z11.s, z1.s[0]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla z20.s, z12.s, z1.s[1]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla z21.s, z13.s, z1.s[1]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "fmla z22.s, z14.s, z1.s[1]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "fmla z23.s, z15.s, z1.s[1]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.s, z8.s, z1.s[2]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "fmla z21.s, z9.s, z1.s[2]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z22.s, z10.s, z1.s[2]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "fmla z23.s, z11.s, z1.s[2]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[3]\n"
-                        "fmla z20.s, z12.s, z1.s[3]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.s, z13.s, z0.s[3]\n"
-                        "fmla z21.s, z13.s, z1.s[3]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[3]\n"
-                        "fmla z22.s, z14.s, z1.s[3]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[3]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "fmla z23.s, z15.s, z1.s[3]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[0]\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
-                        "fmla z20.s, z8.s, z5.s[0]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z9.s, z4.s[0]\n"
-                        "fmla z21.s, z9.s, z5.s[0]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z4.s[0]\n"
-                        "fmla z22.s, z10.s, z5.s[0]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z4.s[0]\n"
-                        "fmla z23.s, z11.s, z5.s[0]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[1]\n"
-                        "fmla z20.s, z12.s, z5.s[1]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.s, z13.s, z4.s[1]\n"
-                        "fmla z21.s, z13.s, z5.s[1]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.s, z14.s, z4.s[1]\n"
-                        "fmla z22.s, z14.s, z5.s[1]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.s, z15.s, z4.s[1]\n"
-                        "fmla z23.s, z15.s, z5.s[1]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.s, z8.s, z5.s[2]\n"
-                        "fmla z17.s, z9.s, z4.s[2]\n"
-                        "fmla z21.s, z9.s, z5.s[2]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z18.s, z10.s, z4.s[2]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z22.s, z10.s, z5.s[2]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.s, z11.s, z4.s[2]\n"
-                        "fmla z23.s, z11.s, z5.s[2]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[3]\n"
-                        "fmla z20.s, z12.s, z5.s[3]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.s, z13.s, z4.s[3]\n"
-                        "fmla z21.s, z13.s, z5.s[3]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.s, z14.s, z4.s[3]\n"
-                        "fmla z22.s, z14.s, z5.s[3]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z4.s[3]\n"
-                        "fmla z23.s, z15.s, z5.s[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z20.s, z8.s, z1.s[0]\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
-                        "fmla z21.s, z9.s, z1.s[0]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z22.s, z10.s, z1.s[0]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "fmla z23.s, z11.s, z1.s[0]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "fmla z20.s, z12.s, z1.s[1]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "fmla z21.s, z13.s, z1.s[1]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "fmla z22.s, z14.s, z1.s[1]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "fmla z23.s, z15.s, z1.s[1]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.s, z8.s, z1.s[2]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "fmla z21.s, z9.s, z1.s[2]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z22.s, z10.s, z1.s[2]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "fmla z23.s, z11.s, z1.s[2]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[3]\n"
-                        "fmla z20.s, z12.s, z1.s[3]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.s, z13.s, z0.s[3]\n"
-                        "fmla z21.s, z13.s, z1.s[3]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[3]\n"
-                        "fmla z22.s, z14.s, z1.s[3]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[3]\n"
-                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
-                        "fmla z23.s, z15.s, z1.s[3]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[0]\n"
-                        "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
-                        "fmla z20.s, z8.s, z5.s[0]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z9.s, z4.s[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "fmla z21.s, z9.s, z5.s[0]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z4.s[0]\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        "fmla z22.s, z10.s, z5.s[0]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z4.s[0]\n"
-                        "fmla z23.s, z11.s, z5.s[0]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[1]\n"
-                        "fmla z20.s, z12.s, z5.s[1]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.s, z13.s, z4.s[1]\n"
-                        "fmla z21.s, z13.s, z5.s[1]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.s, z14.s, z4.s[1]\n"
-                        "fmla z22.s, z14.s, z5.s[1]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.s, z15.s, z4.s[1]\n"
-                        "fmla z23.s, z15.s, z5.s[1]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "fmla z20.s, z8.s, z5.s[2]\n"
-                        "fmla z17.s, z9.s, z4.s[2]\n"
-                        "fmla z21.s, z9.s, z5.s[2]\n"
-                        "fmla z18.s, z10.s, z4.s[2]\n"
-                        "fmla z22.s, z10.s, z5.s[2]\n"
-                        "fmla z19.s, z11.s, z4.s[2]\n"
-                        "fmla z23.s, z11.s, z5.s[2]\n"
-                        "fmla z16.s, z12.s, z4.s[3]\n"
-                        "fmla z20.s, z12.s, z5.s[3]\n"
-                        "fmla z17.s, z13.s, z4.s[3]\n"
-                        "fmla z21.s, z13.s, z5.s[3]\n"
-                        "fmla z18.s, z14.s, z4.s[3]\n"
-                        "fmla z22.s, z14.s, z5.s[3]\n"
-                        "fmla z19.s, z15.s, z4.s[3]\n"
-                        "fmla z23.s, z15.s, z5.s[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z20.s, z8.s, z1.s[0]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "fmla z21.s, z9.s, z1.s[0]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "fmla z22.s, z10.s, z1.s[0]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "fmla z23.s, z11.s, z1.s[0]\n"
-                        "b.eq 5f\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z20.s, z12.s, z1.s[1]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "fmla z21.s, z13.s, z1.s[1]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "fmla z22.s, z14.s, z1.s[1]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "fmla z23.s, z15.s, z1.s[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "fmla z20.s, z8.s, z1.s[2]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "fmla z21.s, z9.s, z1.s[2]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "fmla z22.s, z10.s, z1.s[2]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "fmla z23.s, z11.s, z1.s[2]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z20.s, z8.s, z1.s[0]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
-                        "fmla z21.s, z9.s, z1.s[0]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "ld1rqw z5.s, p6/z, [a_ptr1]\n"
-                        "fmla z22.s, z10.s, z1.s[0]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "fmla z23.s, z11.s, z1.s[0]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        "fmla z20.s, z12.s, z1.s[1]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "fmla z21.s, z13.s, z1.s[1]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "fmla z22.s, z14.s, z1.s[1]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "fmla z23.s, z15.s, z1.s[1]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "fmla z20.s, z8.s, z1.s[2]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "fmla z21.s, z9.s, z1.s[2]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "fmla z22.s, z10.s, z1.s[2]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "fmla z23.s, z11.s, z1.s[2]\n"
-                        "fmla z16.s, z12.s, z0.s[3]\n"
-                        "fmla z20.s, z12.s, z1.s[3]\n"
-                        "fmla z17.s, z13.s, z0.s[3]\n"
-                        "fmla z21.s, z13.s, z1.s[3]\n"
-                        "fmla z18.s, z14.s, z0.s[3]\n"
-                        "fmla z22.s, z14.s, z1.s[3]\n"
-                        "fmla z19.s, z15.s, z0.s[3]\n"
-                        "fmla z23.s, z15.s, z1.s[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[0]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z20.s, z8.s, z5.s[0]\n"
-                        "fmla z17.s, z9.s, z4.s[0]\n"
-                        "fmla z21.s, z9.s, z5.s[0]\n"
-                        "fmla z18.s, z10.s, z4.s[0]\n"
-                        "fmla z22.s, z10.s, z5.s[0]\n"
-                        "fmla z19.s, z11.s, z4.s[0]\n"
-                        "fmla z23.s, z11.s, z5.s[0]\n"
-                        "b.eq 5f\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[1]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z20.s, z12.s, z5.s[1]\n"
-                        "fmla z17.s, z13.s, z4.s[1]\n"
-                        "fmla z21.s, z13.s, z5.s[1]\n"
-                        "fmla z18.s, z14.s, z4.s[1]\n"
-                        "fmla z22.s, z14.s, z5.s[1]\n"
-                        "fmla z19.s, z15.s, z4.s[1]\n"
-                        "fmla z23.s, z15.s, z5.s[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[2]\n"
-                        "fmla z20.s, z8.s, z5.s[2]\n"
-                        "fmla z17.s, z9.s, z4.s[2]\n"
-                        "fmla z21.s, z9.s, z5.s[2]\n"
-                        "fmla z18.s, z10.s, z4.s[2]\n"
-                        "fmla z22.s, z10.s, z5.s[2]\n"
-                        "fmla z19.s, z11.s, z4.s[2]\n"
-                        "fmla z23.s, z11.s, z5.s[2]\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "st1w z20.s, p0, [c_ptr1]\n"
-                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
-                    );
-                    break;
-                case 3:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "whilelt p6.s, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.s\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ld1w z16.s, p0/z, [%[biasptr]]\n"
-                        "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "mov z20.d, z16.d\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
-                        "mov z21.d, z17.d\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
-                        "mov z22.d, z18.d\n"
-                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
-                        "mov z23.d, z19.d\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "mov z24.d, z16.d\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "mov z25.d, z17.d\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "mov z26.d, z18.d\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z27.d, z19.d\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1w z20.s, p0/z, [c_ptr1]\n"
-                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
-                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
-                        "ld1w z24.s, p0/z, [c_ptr2]\n"
-                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
-                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
-                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z20.s, z8.s, z1.s[0]\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        "fmla z24.s, z8.s, z2.s[0]\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
-                        "fmla z21.s, z9.s, z1.s[0]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "fmla z25.s, z9.s, z2.s[0]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla z22.s, z10.s, z1.s[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla z26.s, z10.s, z2.s[0]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla z23.s, z11.s, z1.s[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "fmla z27.s, z11.s, z2.s[0]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "fmla z20.s, z12.s, z1.s[1]\n"
-                        "fmla z24.s, z12.s, z2.s[1]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "fmla z21.s, z13.s, z1.s[1]\n"
-                        "fmla z25.s, z13.s, z2.s[1]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "fmla z22.s, z14.s, z1.s[1]\n"
-                        "fmla z26.s, z14.s, z2.s[1]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "fmla z23.s, z15.s, z1.s[1]\n"
-                        "fmla z27.s, z15.s, z2.s[1]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.s, z8.s, z1.s[2]\n"
-                        "fmla z24.s, z8.s, z2.s[2]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z21.s, z9.s, z1.s[2]\n"
-                        "fmla z25.s, z9.s, z2.s[2]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "fmla z22.s, z10.s, z1.s[2]\n"
-                        "fmla z26.s, z10.s, z2.s[2]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "fmla z23.s, z11.s, z1.s[2]\n"
-                        "fmla z27.s, z11.s, z2.s[2]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[3]\n"
-                        "fmla z20.s, z12.s, z1.s[3]\n"
-                        "fmla z24.s, z12.s, z2.s[3]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.s, z13.s, z0.s[3]\n"
-                        "fmla z21.s, z13.s, z1.s[3]\n"
-                        "fmla z25.s, z13.s, z2.s[3]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[3]\n"
-                        "fmla z22.s, z14.s, z1.s[3]\n"
-                        "fmla z26.s, z14.s, z2.s[3]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[3]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "fmla z23.s, z15.s, z1.s[3]\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
-                        "fmla z27.s, z15.s, z2.s[3]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[0]\n"
-                        "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
-                        "fmla z20.s, z8.s, z5.s[0]\n"
-                        "fmla z24.s, z8.s, z6.s[0]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z9.s, z4.s[0]\n"
-                        "fmla z21.s, z9.s, z5.s[0]\n"
-                        "fmla z25.s, z9.s, z6.s[0]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z4.s[0]\n"
-                        "fmla z22.s, z10.s, z5.s[0]\n"
-                        "fmla z26.s, z10.s, z6.s[0]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z4.s[0]\n"
-                        "fmla z23.s, z11.s, z5.s[0]\n"
-                        "fmla z27.s, z11.s, z6.s[0]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[1]\n"
-                        "fmla z20.s, z12.s, z5.s[1]\n"
-                        "fmla z24.s, z12.s, z6.s[1]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.s, z13.s, z4.s[1]\n"
-                        "fmla z21.s, z13.s, z5.s[1]\n"
-                        "fmla z25.s, z13.s, z6.s[1]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.s, z14.s, z4.s[1]\n"
-                        "fmla z22.s, z14.s, z5.s[1]\n"
-                        "fmla z26.s, z14.s, z6.s[1]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.s, z15.s, z4.s[1]\n"
-                        "fmla z23.s, z15.s, z5.s[1]\n"
-                        "fmla z27.s, z15.s, z6.s[1]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.s, z8.s, z5.s[2]\n"
-                        "fmla z24.s, z8.s, z6.s[2]\n"
-                        "fmla z17.s, z9.s, z4.s[2]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z21.s, z9.s, z5.s[2]\n"
-                        "fmla z25.s, z9.s, z6.s[2]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z18.s, z10.s, z4.s[2]\n"
-                        "fmla z22.s, z10.s, z5.s[2]\n"
-                        "fmla z26.s, z10.s, z6.s[2]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.s, z11.s, z4.s[2]\n"
-                        "fmla z23.s, z11.s, z5.s[2]\n"
-                        "fmla z27.s, z11.s, z6.s[2]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[3]\n"
-                        "fmla z20.s, z12.s, z5.s[3]\n"
-                        "fmla z24.s, z12.s, z6.s[3]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.s, z13.s, z4.s[3]\n"
-                        "fmla z21.s, z13.s, z5.s[3]\n"
-                        "fmla z25.s, z13.s, z6.s[3]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.s, z14.s, z4.s[3]\n"
-                        "fmla z22.s, z14.s, z5.s[3]\n"
-                        "fmla z26.s, z14.s, z6.s[3]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z4.s[3]\n"
-                        "fmla z23.s, z15.s, z5.s[3]\n"
-                        "fmla z27.s, z15.s, z6.s[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z20.s, z8.s, z1.s[0]\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        "fmla z24.s, z8.s, z2.s[0]\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
-                        "fmla z21.s, z9.s, z1.s[0]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "fmla z25.s, z9.s, z2.s[0]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "fmla z22.s, z10.s, z1.s[0]\n"
-                        "fmla z26.s, z10.s, z2.s[0]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "fmla z23.s, z11.s, z1.s[0]\n"
-                        "fmla z27.s, z11.s, z2.s[0]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "fmla z20.s, z12.s, z1.s[1]\n"
-                        "fmla z24.s, z12.s, z2.s[1]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "fmla z21.s, z13.s, z1.s[1]\n"
-                        "fmla z25.s, z13.s, z2.s[1]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "fmla z22.s, z14.s, z1.s[1]\n"
-                        "fmla z26.s, z14.s, z2.s[1]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "fmla z23.s, z15.s, z1.s[1]\n"
-                        "fmla z27.s, z15.s, z2.s[1]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.s, z8.s, z1.s[2]\n"
-                        "fmla z24.s, z8.s, z2.s[2]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z21.s, z9.s, z1.s[2]\n"
-                        "fmla z25.s, z9.s, z2.s[2]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "fmla z22.s, z10.s, z1.s[2]\n"
-                        "fmla z26.s, z10.s, z2.s[2]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "fmla z23.s, z11.s, z1.s[2]\n"
-                        "fmla z27.s, z11.s, z2.s[2]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[3]\n"
-                        "fmla z20.s, z12.s, z1.s[3]\n"
-                        "fmla z24.s, z12.s, z2.s[3]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.s, z13.s, z0.s[3]\n"
-                        "fmla z21.s, z13.s, z1.s[3]\n"
-                        "fmla z25.s, z13.s, z2.s[3]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[3]\n"
-                        "fmla z22.s, z14.s, z1.s[3]\n"
-                        "fmla z26.s, z14.s, z2.s[3]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[3]\n"
-                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
-                        "fmla z23.s, z15.s, z1.s[3]\n"
-                        "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
-                        "fmla z27.s, z15.s, z2.s[3]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[0]\n"
-                        "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
-                        "fmla z20.s, z8.s, z5.s[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "fmla z24.s, z8.s, z6.s[0]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z9.s, z4.s[0]\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        "fmla z21.s, z9.s, z5.s[0]\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        "fmla z25.s, z9.s, z6.s[0]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z4.s[0]\n"
-                        "fmla z22.s, z10.s, z5.s[0]\n"
-                        "fmla z26.s, z10.s, z6.s[0]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z4.s[0]\n"
-                        "fmla z23.s, z11.s, z5.s[0]\n"
-                        "fmla z27.s, z11.s, z6.s[0]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[1]\n"
-                        "fmla z20.s, z12.s, z5.s[1]\n"
-                        "fmla z24.s, z12.s, z6.s[1]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.s, z13.s, z4.s[1]\n"
-                        "fmla z21.s, z13.s, z5.s[1]\n"
-                        "fmla z25.s, z13.s, z6.s[1]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.s, z14.s, z4.s[1]\n"
-                        "fmla z22.s, z14.s, z5.s[1]\n"
-                        "fmla z26.s, z14.s, z6.s[1]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.s, z15.s, z4.s[1]\n"
-                        "fmla z23.s, z15.s, z5.s[1]\n"
-                        "fmla z27.s, z15.s, z6.s[1]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "fmla z20.s, z8.s, z5.s[2]\n"
-                        "fmla z24.s, z8.s, z6.s[2]\n"
-                        "fmla z17.s, z9.s, z4.s[2]\n"
-                        "fmla z21.s, z9.s, z5.s[2]\n"
-                        "fmla z25.s, z9.s, z6.s[2]\n"
-                        "fmla z18.s, z10.s, z4.s[2]\n"
-                        "fmla z22.s, z10.s, z5.s[2]\n"
-                        "fmla z26.s, z10.s, z6.s[2]\n"
-                        "fmla z19.s, z11.s, z4.s[2]\n"
-                        "fmla z23.s, z11.s, z5.s[2]\n"
-                        "fmla z27.s, z11.s, z6.s[2]\n"
-                        "fmla z16.s, z12.s, z4.s[3]\n"
-                        "fmla z20.s, z12.s, z5.s[3]\n"
-                        "fmla z24.s, z12.s, z6.s[3]\n"
-                        "fmla z17.s, z13.s, z4.s[3]\n"
-                        "fmla z21.s, z13.s, z5.s[3]\n"
-                        "fmla z25.s, z13.s, z6.s[3]\n"
-                        "fmla z18.s, z14.s, z4.s[3]\n"
-                        "fmla z22.s, z14.s, z5.s[3]\n"
-                        "fmla z26.s, z14.s, z6.s[3]\n"
-                        "fmla z19.s, z15.s, z4.s[3]\n"
-                        "fmla z23.s, z15.s, z5.s[3]\n"
-                        "fmla z27.s, z15.s, z6.s[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z20.s, z8.s, z1.s[0]\n"
-                        "fmla z24.s, z8.s, z2.s[0]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "fmla z21.s, z9.s, z1.s[0]\n"
-                        "fmla z25.s, z9.s, z2.s[0]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "fmla z22.s, z10.s, z1.s[0]\n"
-                        "fmla z26.s, z10.s, z2.s[0]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "fmla z23.s, z11.s, z1.s[0]\n"
-                        "fmla z27.s, z11.s, z2.s[0]\n"
-                        "b.eq 5f\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z20.s, z12.s, z1.s[1]\n"
-                        "fmla z24.s, z12.s, z2.s[1]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "fmla z21.s, z13.s, z1.s[1]\n"
-                        "fmla z25.s, z13.s, z2.s[1]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "fmla z22.s, z14.s, z1.s[1]\n"
-                        "fmla z26.s, z14.s, z2.s[1]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "fmla z23.s, z15.s, z1.s[1]\n"
-                        "fmla z27.s, z15.s, z2.s[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "fmla z20.s, z8.s, z1.s[2]\n"
-                        "fmla z24.s, z8.s, z2.s[2]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "fmla z21.s, z9.s, z1.s[2]\n"
-                        "fmla z25.s, z9.s, z2.s[2]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "fmla z22.s, z10.s, z1.s[2]\n"
-                        "fmla z26.s, z10.s, z2.s[2]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "fmla z23.s, z11.s, z1.s[2]\n"
-                        "fmla z27.s, z11.s, z2.s[2]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z20.s, z8.s, z1.s[0]\n"
-                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
-                        "fmla z24.s, z8.s, z2.s[0]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "ld1rqw z5.s, p6/z, [a_ptr1]\n"
-                        "fmla z21.s, z9.s, z1.s[0]\n"
-                        "ld1rqw z6.s, p6/z, [a_ptr2]\n"
-                        "fmla z25.s, z9.s, z2.s[0]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "fmla z22.s, z10.s, z1.s[0]\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        "fmla z26.s, z10.s, z2.s[0]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        "fmla z23.s, z11.s, z1.s[0]\n"
-                        "fmla z27.s, z11.s, z2.s[0]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "fmla z20.s, z12.s, z1.s[1]\n"
-                        "fmla z24.s, z12.s, z2.s[1]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "fmla z21.s, z13.s, z1.s[1]\n"
-                        "fmla z25.s, z13.s, z2.s[1]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "fmla z22.s, z14.s, z1.s[1]\n"
-                        "fmla z26.s, z14.s, z2.s[1]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "fmla z23.s, z15.s, z1.s[1]\n"
-                        "fmla z27.s, z15.s, z2.s[1]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "fmla z20.s, z8.s, z1.s[2]\n"
-                        "fmla z24.s, z8.s, z2.s[2]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "fmla z21.s, z9.s, z1.s[2]\n"
-                        "fmla z25.s, z9.s, z2.s[2]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "fmla z22.s, z10.s, z1.s[2]\n"
-                        "fmla z26.s, z10.s, z2.s[2]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "fmla z23.s, z11.s, z1.s[2]\n"
-                        "fmla z27.s, z11.s, z2.s[2]\n"
-                        "fmla z16.s, z12.s, z0.s[3]\n"
-                        "fmla z20.s, z12.s, z1.s[3]\n"
-                        "fmla z24.s, z12.s, z2.s[3]\n"
-                        "fmla z17.s, z13.s, z0.s[3]\n"
-                        "fmla z21.s, z13.s, z1.s[3]\n"
-                        "fmla z25.s, z13.s, z2.s[3]\n"
-                        "fmla z18.s, z14.s, z0.s[3]\n"
-                        "fmla z22.s, z14.s, z1.s[3]\n"
-                        "fmla z26.s, z14.s, z2.s[3]\n"
-                        "fmla z19.s, z15.s, z0.s[3]\n"
-                        "fmla z23.s, z15.s, z1.s[3]\n"
-                        "fmla z27.s, z15.s, z2.s[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[0]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z20.s, z8.s, z5.s[0]\n"
-                        "fmla z24.s, z8.s, z6.s[0]\n"
-                        "fmla z17.s, z9.s, z4.s[0]\n"
-                        "fmla z21.s, z9.s, z5.s[0]\n"
-                        "fmla z25.s, z9.s, z6.s[0]\n"
-                        "fmla z18.s, z10.s, z4.s[0]\n"
-                        "fmla z22.s, z10.s, z5.s[0]\n"
-                        "fmla z26.s, z10.s, z6.s[0]\n"
-                        "fmla z19.s, z11.s, z4.s[0]\n"
-                        "fmla z23.s, z11.s, z5.s[0]\n"
-                        "fmla z27.s, z11.s, z6.s[0]\n"
-                        "b.eq 5f\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[1]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z20.s, z12.s, z5.s[1]\n"
-                        "fmla z24.s, z12.s, z6.s[1]\n"
-                        "fmla z17.s, z13.s, z4.s[1]\n"
-                        "fmla z21.s, z13.s, z5.s[1]\n"
-                        "fmla z25.s, z13.s, z6.s[1]\n"
-                        "fmla z18.s, z14.s, z4.s[1]\n"
-                        "fmla z22.s, z14.s, z5.s[1]\n"
-                        "fmla z26.s, z14.s, z6.s[1]\n"
-                        "fmla z19.s, z15.s, z4.s[1]\n"
-                        "fmla z23.s, z15.s, z5.s[1]\n"
-                        "fmla z27.s, z15.s, z6.s[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[2]\n"
-                        "fmla z20.s, z8.s, z5.s[2]\n"
-                        "fmla z24.s, z8.s, z6.s[2]\n"
-                        "fmla z17.s, z9.s, z4.s[2]\n"
-                        "fmla z21.s, z9.s, z5.s[2]\n"
-                        "fmla z25.s, z9.s, z6.s[2]\n"
-                        "fmla z18.s, z10.s, z4.s[2]\n"
-                        "fmla z22.s, z10.s, z5.s[2]\n"
-                        "fmla z26.s, z10.s, z6.s[2]\n"
-                        "fmla z19.s, z11.s, z4.s[2]\n"
-                        "fmla z23.s, z11.s, z5.s[2]\n"
-                        "fmla z27.s, z11.s, z6.s[2]\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "fmax z24.s, p7/m, z24.s, z14.s\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "fmax z25.s, p7/m, z25.s, z14.s\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "fmax z26.s, p7/m, z26.s, z14.s\n"
-                        "st1w z20.s, p0, [c_ptr1]\n"
-                        "fmin z24.s, p7/m, z24.s, z15.s\n"
-                        "fmin z25.s, p7/m, z25.s, z15.s\n"
-                        "fmax z27.s, p7/m, z27.s, z14.s\n"
-                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "fmin z26.s, p7/m, z26.s, z15.s\n"
-                        "fmin z27.s, p7/m, z27.s, z15.s\n"
-                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
-                        "st1w z24.s, p0, [c_ptr2]\n"
-                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
-                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
-                    );
-                    break;
-                default:
-                case 4:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "whilelt p6.s, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.s\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ld1w z16.s, p0/z, [%[biasptr]]\n"
-                        "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "mov z20.d, z16.d\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
-                        "mov z21.d, z17.d\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
-                        "mov z22.d, z18.d\n"
-                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
-                        "mov z23.d, z19.d\n"
-                        "ld1rqw z3.s, p7/z, [a_ptr3]\n"
-                        "mov z24.d, z16.d\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "mov z25.d, z17.d\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "mov z26.d, z18.d\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "mov z27.d, z19.d\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z28.d, z16.d\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "mov z29.d, z17.d\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "mov z30.d, z18.d\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "mov z31.d, z19.d\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1w z20.s, p0/z, [c_ptr1]\n"
-                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
-                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
-                        "ld1w z24.s, p0/z, [c_ptr2]\n"
-                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
-                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
-                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
-                        "ld1w z28.s, p0/z, [c_ptr3]\n"
-                        "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
-                        "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
-                        "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ld1rqw z3.s, p7/z, [a_ptr3]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z20.s, z8.s, z1.s[0]\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        "fmla z24.s, z8.s, z2.s[0]\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
-                        "fmla z28.s, z8.s, z3.s[0]\n"
-                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "ld1rqw z7.s, p7/z, [a_ptr3]\n"
-                        "fmla z21.s, z9.s, z1.s[0]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "fmla z25.s, z9.s, z2.s[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla z29.s, z9.s, z3.s[0]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla z22.s, z10.s, z1.s[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla z26.s, z10.s, z2.s[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "fmla z30.s, z10.s, z3.s[0]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        "fmla z23.s, z11.s, z1.s[0]\n"
-                        "fmla z27.s, z11.s, z2.s[0]\n"
-                        "fmla z31.s, z11.s, z3.s[0]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "fmla z20.s, z12.s, z1.s[1]\n"
-                        "fmla z24.s, z12.s, z2.s[1]\n"
-                        "fmla z28.s, z12.s, z3.s[1]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "fmla z21.s, z13.s, z1.s[1]\n"
-                        "fmla z25.s, z13.s, z2.s[1]\n"
-                        "fmla z29.s, z13.s, z3.s[1]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "fmla z22.s, z14.s, z1.s[1]\n"
-                        "fmla z26.s, z14.s, z2.s[1]\n"
-                        "fmla z30.s, z14.s, z3.s[1]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "fmla z23.s, z15.s, z1.s[1]\n"
-                        "fmla z27.s, z15.s, z2.s[1]\n"
-                        "fmla z31.s, z15.s, z3.s[1]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.s, z8.s, z1.s[2]\n"
-                        "fmla z24.s, z8.s, z2.s[2]\n"
-                        "fmla z28.s, z8.s, z3.s[2]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "fmla z21.s, z9.s, z1.s[2]\n"
-                        "fmla z25.s, z9.s, z2.s[2]\n"
-                        "fmla z29.s, z9.s, z3.s[2]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "fmla z22.s, z10.s, z1.s[2]\n"
-                        "fmla z26.s, z10.s, z2.s[2]\n"
-                        "fmla z30.s, z10.s, z3.s[2]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "fmla z23.s, z11.s, z1.s[2]\n"
-                        "fmla z27.s, z11.s, z2.s[2]\n"
-                        "fmla z31.s, z11.s, z3.s[2]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[3]\n"
-                        "fmla z20.s, z12.s, z1.s[3]\n"
-                        "fmla z24.s, z12.s, z2.s[3]\n"
-                        "fmla z28.s, z12.s, z3.s[3]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.s, z13.s, z0.s[3]\n"
-                        "fmla z21.s, z13.s, z1.s[3]\n"
-                        "fmla z25.s, z13.s, z2.s[3]\n"
-                        "fmla z29.s, z13.s, z3.s[3]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[3]\n"
-                        "fmla z22.s, z14.s, z1.s[3]\n"
-                        "fmla z26.s, z14.s, z2.s[3]\n"
-                        "fmla z30.s, z14.s, z3.s[3]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[3]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "fmla z23.s, z15.s, z1.s[3]\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
-                        "fmla z27.s, z15.s, z2.s[3]\n"
-                        "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
-                        "fmla z31.s, z15.s, z3.s[3]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[0]\n"
-                        "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
-                        "fmla z20.s, z8.s, z5.s[0]\n"
-                        "fmla z24.s, z8.s, z6.s[0]\n"
-                        "fmla z28.s, z8.s, z7.s[0]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z9.s, z4.s[0]\n"
-                        "fmla z21.s, z9.s, z5.s[0]\n"
-                        "fmla z25.s, z9.s, z6.s[0]\n"
-                        "fmla z29.s, z9.s, z7.s[0]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z4.s[0]\n"
-                        "fmla z22.s, z10.s, z5.s[0]\n"
-                        "fmla z26.s, z10.s, z6.s[0]\n"
-                        "fmla z30.s, z10.s, z7.s[0]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z4.s[0]\n"
-                        "fmla z23.s, z11.s, z5.s[0]\n"
-                        "fmla z27.s, z11.s, z6.s[0]\n"
-                        "fmla z31.s, z11.s, z7.s[0]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[1]\n"
-                        "fmla z20.s, z12.s, z5.s[1]\n"
-                        "fmla z24.s, z12.s, z6.s[1]\n"
-                        "fmla z28.s, z12.s, z7.s[1]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.s, z13.s, z4.s[1]\n"
-                        "fmla z21.s, z13.s, z5.s[1]\n"
-                        "fmla z25.s, z13.s, z6.s[1]\n"
-                        "fmla z29.s, z13.s, z7.s[1]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.s, z14.s, z4.s[1]\n"
-                        "fmla z22.s, z14.s, z5.s[1]\n"
-                        "fmla z26.s, z14.s, z6.s[1]\n"
-                        "fmla z30.s, z14.s, z7.s[1]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.s, z15.s, z4.s[1]\n"
-                        "fmla z23.s, z15.s, z5.s[1]\n"
-                        "fmla z27.s, z15.s, z6.s[1]\n"
-                        "fmla z31.s, z15.s, z7.s[1]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.s, z8.s, z5.s[2]\n"
-                        "fmla z24.s, z8.s, z6.s[2]\n"
-                        "fmla z28.s, z8.s, z7.s[2]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z17.s, z9.s, z4.s[2]\n"
-                        "fmla z21.s, z9.s, z5.s[2]\n"
-                        "fmla z25.s, z9.s, z6.s[2]\n"
-                        "fmla z29.s, z9.s, z7.s[2]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z18.s, z10.s, z4.s[2]\n"
-                        "fmla z22.s, z10.s, z5.s[2]\n"
-                        "fmla z26.s, z10.s, z6.s[2]\n"
-                        "fmla z30.s, z10.s, z7.s[2]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.s, z11.s, z4.s[2]\n"
-                        "fmla z23.s, z11.s, z5.s[2]\n"
-                        "fmla z27.s, z11.s, z6.s[2]\n"
-                        "fmla z31.s, z11.s, z7.s[2]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[3]\n"
-                        "fmla z20.s, z12.s, z5.s[3]\n"
-                        "fmla z24.s, z12.s, z6.s[3]\n"
-                        "fmla z28.s, z12.s, z7.s[3]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.s, z13.s, z4.s[3]\n"
-                        "fmla z21.s, z13.s, z5.s[3]\n"
-                        "fmla z25.s, z13.s, z6.s[3]\n"
-                        "fmla z29.s, z13.s, z7.s[3]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.s, z14.s, z4.s[3]\n"
-                        "fmla z22.s, z14.s, z5.s[3]\n"
-                        "fmla z26.s, z14.s, z6.s[3]\n"
-                        "fmla z30.s, z14.s, z7.s[3]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z4.s[3]\n"
-                        "fmla z23.s, z15.s, z5.s[3]\n"
-                        "fmla z27.s, z15.s, z6.s[3]\n"
-                        "fmla z31.s, z15.s, z7.s[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z20.s, z8.s, z1.s[0]\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        "fmla z24.s, z8.s, z2.s[0]\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
-                        "fmla z28.s, z8.s, z3.s[0]\n"
-                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "ld1rqw z7.s, p7/z, [a_ptr3]\n"
-                        "fmla z21.s, z9.s, z1.s[0]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "fmla z25.s, z9.s, z2.s[0]\n"
-                        "fmla z29.s, z9.s, z3.s[0]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "fmla z22.s, z10.s, z1.s[0]\n"
-                        "fmla z26.s, z10.s, z2.s[0]\n"
-                        "fmla z30.s, z10.s, z3.s[0]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "fmla z23.s, z11.s, z1.s[0]\n"
-                        "fmla z27.s, z11.s, z2.s[0]\n"
-                        "fmla z31.s, z11.s, z3.s[0]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "fmla z20.s, z12.s, z1.s[1]\n"
-                        "fmla z24.s, z12.s, z2.s[1]\n"
-                        "fmla z28.s, z12.s, z3.s[1]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "fmla z21.s, z13.s, z1.s[1]\n"
-                        "fmla z25.s, z13.s, z2.s[1]\n"
-                        "fmla z29.s, z13.s, z3.s[1]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "fmla z22.s, z14.s, z1.s[1]\n"
-                        "fmla z26.s, z14.s, z2.s[1]\n"
-                        "fmla z30.s, z14.s, z3.s[1]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "fmla z23.s, z15.s, z1.s[1]\n"
-                        "fmla z27.s, z15.s, z2.s[1]\n"
-                        "fmla z31.s, z15.s, z3.s[1]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "fmla z20.s, z8.s, z1.s[2]\n"
-                        "fmla z24.s, z8.s, z2.s[2]\n"
-                        "fmla z28.s, z8.s, z3.s[2]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "fmla z21.s, z9.s, z1.s[2]\n"
-                        "fmla z25.s, z9.s, z2.s[2]\n"
-                        "fmla z29.s, z9.s, z3.s[2]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "fmla z22.s, z10.s, z1.s[2]\n"
-                        "fmla z26.s, z10.s, z2.s[2]\n"
-                        "fmla z30.s, z10.s, z3.s[2]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "fmla z23.s, z11.s, z1.s[2]\n"
-                        "fmla z27.s, z11.s, z2.s[2]\n"
-                        "fmla z31.s, z11.s, z3.s[2]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[3]\n"
-                        "fmla z20.s, z12.s, z1.s[3]\n"
-                        "fmla z24.s, z12.s, z2.s[3]\n"
-                        "fmla z28.s, z12.s, z3.s[3]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "fmla z17.s, z13.s, z0.s[3]\n"
-                        "fmla z21.s, z13.s, z1.s[3]\n"
-                        "fmla z25.s, z13.s, z2.s[3]\n"
-                        "fmla z29.s, z13.s, z3.s[3]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[3]\n"
-                        "fmla z22.s, z14.s, z1.s[3]\n"
-                        "fmla z26.s, z14.s, z2.s[3]\n"
-                        "fmla z30.s, z14.s, z3.s[3]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[3]\n"
-                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
-                        "fmla z23.s, z15.s, z1.s[3]\n"
-                        "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
-                        "fmla z27.s, z15.s, z2.s[3]\n"
-                        "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
-                        "fmla z31.s, z15.s, z3.s[3]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[0]\n"
-                        "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
-                        "fmla z20.s, z8.s, z5.s[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "fmla z24.s, z8.s, z6.s[0]\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        "fmla z28.s, z8.s, z7.s[0]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z9.s, z4.s[0]\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        "fmla z21.s, z9.s, z5.s[0]\n"
-                        "addvl a_ptr3, a_ptr3, #2\n"
-                        "fmla z25.s, z9.s, z6.s[0]\n"
-                        "fmla z29.s, z9.s, z7.s[0]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z4.s[0]\n"
-                        "fmla z22.s, z10.s, z5.s[0]\n"
-                        "fmla z26.s, z10.s, z6.s[0]\n"
-                        "fmla z30.s, z10.s, z7.s[0]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z4.s[0]\n"
-                        "fmla z23.s, z11.s, z5.s[0]\n"
-                        "fmla z27.s, z11.s, z6.s[0]\n"
-                        "fmla z31.s, z11.s, z7.s[0]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[1]\n"
-                        "fmla z20.s, z12.s, z5.s[1]\n"
-                        "fmla z24.s, z12.s, z6.s[1]\n"
-                        "fmla z28.s, z12.s, z7.s[1]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.s, z13.s, z4.s[1]\n"
-                        "fmla z21.s, z13.s, z5.s[1]\n"
-                        "fmla z25.s, z13.s, z6.s[1]\n"
-                        "fmla z29.s, z13.s, z7.s[1]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.s, z14.s, z4.s[1]\n"
-                        "fmla z22.s, z14.s, z5.s[1]\n"
-                        "fmla z26.s, z14.s, z6.s[1]\n"
-                        "fmla z30.s, z14.s, z7.s[1]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.s, z15.s, z4.s[1]\n"
-                        "fmla z23.s, z15.s, z5.s[1]\n"
-                        "fmla z27.s, z15.s, z6.s[1]\n"
-                        "fmla z31.s, z15.s, z7.s[1]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "fmla z20.s, z8.s, z5.s[2]\n"
-                        "fmla z24.s, z8.s, z6.s[2]\n"
-                        "fmla z28.s, z8.s, z7.s[2]\n"
-                        "fmla z17.s, z9.s, z4.s[2]\n"
-                        "fmla z21.s, z9.s, z5.s[2]\n"
-                        "fmla z25.s, z9.s, z6.s[2]\n"
-                        "fmla z29.s, z9.s, z7.s[2]\n"
-                        "fmla z18.s, z10.s, z4.s[2]\n"
-                        "fmla z22.s, z10.s, z5.s[2]\n"
-                        "fmla z26.s, z10.s, z6.s[2]\n"
-                        "fmla z30.s, z10.s, z7.s[2]\n"
-                        "fmla z19.s, z11.s, z4.s[2]\n"
-                        "fmla z23.s, z11.s, z5.s[2]\n"
-                        "fmla z27.s, z11.s, z6.s[2]\n"
-                        "fmla z31.s, z11.s, z7.s[2]\n"
-                        "fmla z16.s, z12.s, z4.s[3]\n"
-                        "fmla z20.s, z12.s, z5.s[3]\n"
-                        "fmla z24.s, z12.s, z6.s[3]\n"
-                        "fmla z28.s, z12.s, z7.s[3]\n"
-                        "fmla z17.s, z13.s, z4.s[3]\n"
-                        "fmla z21.s, z13.s, z5.s[3]\n"
-                        "fmla z25.s, z13.s, z6.s[3]\n"
-                        "fmla z29.s, z13.s, z7.s[3]\n"
-                        "fmla z18.s, z14.s, z4.s[3]\n"
-                        "fmla z22.s, z14.s, z5.s[3]\n"
-                        "fmla z26.s, z14.s, z6.s[3]\n"
-                        "fmla z30.s, z14.s, z7.s[3]\n"
-                        "fmla z19.s, z15.s, z4.s[3]\n"
-                        "fmla z23.s, z15.s, z5.s[3]\n"
-                        "fmla z27.s, z15.s, z6.s[3]\n"
-                        "fmla z31.s, z15.s, z7.s[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z20.s, z8.s, z1.s[0]\n"
-                        "fmla z24.s, z8.s, z2.s[0]\n"
-                        "fmla z28.s, z8.s, z3.s[0]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "fmla z21.s, z9.s, z1.s[0]\n"
-                        "fmla z25.s, z9.s, z2.s[0]\n"
-                        "fmla z29.s, z9.s, z3.s[0]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "fmla z22.s, z10.s, z1.s[0]\n"
-                        "fmla z26.s, z10.s, z2.s[0]\n"
-                        "fmla z30.s, z10.s, z3.s[0]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "fmla z23.s, z11.s, z1.s[0]\n"
-                        "fmla z27.s, z11.s, z2.s[0]\n"
-                        "fmla z31.s, z11.s, z3.s[0]\n"
-                        "b.eq 5f\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z20.s, z12.s, z1.s[1]\n"
-                        "fmla z24.s, z12.s, z2.s[1]\n"
-                        "fmla z28.s, z12.s, z3.s[1]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "fmla z21.s, z13.s, z1.s[1]\n"
-                        "fmla z25.s, z13.s, z2.s[1]\n"
-                        "fmla z29.s, z13.s, z3.s[1]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "fmla z22.s, z14.s, z1.s[1]\n"
-                        "fmla z26.s, z14.s, z2.s[1]\n"
-                        "fmla z30.s, z14.s, z3.s[1]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "fmla z23.s, z15.s, z1.s[1]\n"
-                        "fmla z27.s, z15.s, z2.s[1]\n"
-                        "fmla z31.s, z15.s, z3.s[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "fmla z20.s, z8.s, z1.s[2]\n"
-                        "fmla z24.s, z8.s, z2.s[2]\n"
-                        "fmla z28.s, z8.s, z3.s[2]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "fmla z21.s, z9.s, z1.s[2]\n"
-                        "fmla z25.s, z9.s, z2.s[2]\n"
-                        "fmla z29.s, z9.s, z3.s[2]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "fmla z22.s, z10.s, z1.s[2]\n"
-                        "fmla z26.s, z10.s, z2.s[2]\n"
-                        "fmla z30.s, z10.s, z3.s[2]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "fmla z23.s, z11.s, z1.s[2]\n"
-                        "fmla z27.s, z11.s, z2.s[2]\n"
-                        "fmla z31.s, z11.s, z3.s[2]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "fmla z20.s, z8.s, z1.s[0]\n"
-                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
-                        "fmla z24.s, z8.s, z2.s[0]\n"
-                        "ld1rqw z5.s, p6/z, [a_ptr1]\n"
-                        "fmla z28.s, z8.s, z3.s[0]\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "ld1rqw z6.s, p6/z, [a_ptr2]\n"
-                        "fmla z21.s, z9.s, z1.s[0]\n"
-                        "ld1rqw z7.s, p6/z, [a_ptr3]\n"
-                        "fmla z25.s, z9.s, z2.s[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "fmla z29.s, z9.s, z3.s[0]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        "fmla z22.s, z10.s, z1.s[0]\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        "fmla z26.s, z10.s, z2.s[0]\n"
-                        "addvl a_ptr3, a_ptr3, #1\n"
-                        "fmla z30.s, z10.s, z3.s[0]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "fmla z23.s, z11.s, z1.s[0]\n"
-                        "fmla z27.s, z11.s, z2.s[0]\n"
-                        "fmla z31.s, z11.s, z3.s[0]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "fmla z20.s, z12.s, z1.s[1]\n"
-                        "fmla z24.s, z12.s, z2.s[1]\n"
-                        "fmla z28.s, z12.s, z3.s[1]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "fmla z21.s, z13.s, z1.s[1]\n"
-                        "fmla z25.s, z13.s, z2.s[1]\n"
-                        "fmla z29.s, z13.s, z3.s[1]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "fmla z22.s, z14.s, z1.s[1]\n"
-                        "fmla z26.s, z14.s, z2.s[1]\n"
-                        "fmla z30.s, z14.s, z3.s[1]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "fmla z23.s, z15.s, z1.s[1]\n"
-                        "fmla z27.s, z15.s, z2.s[1]\n"
-                        "fmla z31.s, z15.s, z3.s[1]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "fmla z20.s, z8.s, z1.s[2]\n"
-                        "fmla z24.s, z8.s, z2.s[2]\n"
-                        "fmla z28.s, z8.s, z3.s[2]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "fmla z21.s, z9.s, z1.s[2]\n"
-                        "fmla z25.s, z9.s, z2.s[2]\n"
-                        "fmla z29.s, z9.s, z3.s[2]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "fmla z22.s, z10.s, z1.s[2]\n"
-                        "fmla z26.s, z10.s, z2.s[2]\n"
-                        "fmla z30.s, z10.s, z3.s[2]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "fmla z23.s, z11.s, z1.s[2]\n"
-                        "fmla z27.s, z11.s, z2.s[2]\n"
-                        "fmla z31.s, z11.s, z3.s[2]\n"
-                        "fmla z16.s, z12.s, z0.s[3]\n"
-                        "fmla z20.s, z12.s, z1.s[3]\n"
-                        "fmla z24.s, z12.s, z2.s[3]\n"
-                        "fmla z28.s, z12.s, z3.s[3]\n"
-                        "fmla z17.s, z13.s, z0.s[3]\n"
-                        "fmla z21.s, z13.s, z1.s[3]\n"
-                        "fmla z25.s, z13.s, z2.s[3]\n"
-                        "fmla z29.s, z13.s, z3.s[3]\n"
-                        "fmla z18.s, z14.s, z0.s[3]\n"
-                        "fmla z22.s, z14.s, z1.s[3]\n"
-                        "fmla z26.s, z14.s, z2.s[3]\n"
-                        "fmla z30.s, z14.s, z3.s[3]\n"
-                        "fmla z19.s, z15.s, z0.s[3]\n"
-                        "fmla z23.s, z15.s, z1.s[3]\n"
-                        "fmla z27.s, z15.s, z2.s[3]\n"
-                        "fmla z31.s, z15.s, z3.s[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[0]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z20.s, z8.s, z5.s[0]\n"
-                        "fmla z24.s, z8.s, z6.s[0]\n"
-                        "fmla z28.s, z8.s, z7.s[0]\n"
-                        "fmla z17.s, z9.s, z4.s[0]\n"
-                        "fmla z21.s, z9.s, z5.s[0]\n"
-                        "fmla z25.s, z9.s, z6.s[0]\n"
-                        "fmla z29.s, z9.s, z7.s[0]\n"
-                        "fmla z18.s, z10.s, z4.s[0]\n"
-                        "fmla z22.s, z10.s, z5.s[0]\n"
-                        "fmla z26.s, z10.s, z6.s[0]\n"
-                        "fmla z30.s, z10.s, z7.s[0]\n"
-                        "fmla z19.s, z11.s, z4.s[0]\n"
-                        "fmla z23.s, z11.s, z5.s[0]\n"
-                        "fmla z27.s, z11.s, z6.s[0]\n"
-                        "fmla z31.s, z11.s, z7.s[0]\n"
-                        "b.eq 5f\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[1]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "fmla z20.s, z12.s, z5.s[1]\n"
-                        "fmla z24.s, z12.s, z6.s[1]\n"
-                        "fmla z28.s, z12.s, z7.s[1]\n"
-                        "fmla z17.s, z13.s, z4.s[1]\n"
-                        "fmla z21.s, z13.s, z5.s[1]\n"
-                        "fmla z25.s, z13.s, z6.s[1]\n"
-                        "fmla z29.s, z13.s, z7.s[1]\n"
-                        "fmla z18.s, z14.s, z4.s[1]\n"
-                        "fmla z22.s, z14.s, z5.s[1]\n"
-                        "fmla z26.s, z14.s, z6.s[1]\n"
-                        "fmla z30.s, z14.s, z7.s[1]\n"
-                        "fmla z19.s, z15.s, z4.s[1]\n"
-                        "fmla z23.s, z15.s, z5.s[1]\n"
-                        "fmla z27.s, z15.s, z6.s[1]\n"
-                        "fmla z31.s, z15.s, z7.s[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[2]\n"
-                        "fmla z20.s, z8.s, z5.s[2]\n"
-                        "fmla z24.s, z8.s, z6.s[2]\n"
-                        "fmla z28.s, z8.s, z7.s[2]\n"
-                        "fmla z17.s, z9.s, z4.s[2]\n"
-                        "fmla z21.s, z9.s, z5.s[2]\n"
-                        "fmla z25.s, z9.s, z6.s[2]\n"
-                        "fmla z29.s, z9.s, z7.s[2]\n"
-                        "fmla z18.s, z10.s, z4.s[2]\n"
-                        "fmla z22.s, z10.s, z5.s[2]\n"
-                        "fmla z26.s, z10.s, z6.s[2]\n"
-                        "fmla z30.s, z10.s, z7.s[2]\n"
-                        "fmla z19.s, z11.s, z4.s[2]\n"
-                        "fmla z23.s, z11.s, z5.s[2]\n"
-                        "fmla z27.s, z11.s, z6.s[2]\n"
-                        "fmla z31.s, z11.s, z7.s[2]\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "fmax z24.s, p7/m, z24.s, z14.s\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "fmax z25.s, p7/m, z25.s, z14.s\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "fmax z26.s, p7/m, z26.s, z14.s\n"
-                        "st1w z20.s, p0, [c_ptr1]\n"
-                        "fmin z24.s, p7/m, z24.s, z15.s\n"
-                        "fmin z25.s, p7/m, z25.s, z15.s\n"
-                        "fmax z27.s, p7/m, z27.s, z14.s\n"
-                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "fmin z26.s, p7/m, z26.s, z15.s\n"
-                        "fmax z28.s, p7/m, z28.s, z14.s\n"
-                        "fmax z29.s, p7/m, z29.s, z14.s\n"
-                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "fmin z27.s, p7/m, z27.s, z15.s\n"
-                        "fmax z30.s, p7/m, z30.s, z14.s\n"
-                        "fmin z28.s, p7/m, z28.s, z15.s\n"
-                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
-                        "fmin z29.s, p7/m, z29.s, z15.s\n"
-                        "fmax z31.s, p7/m, z31.s, z14.s\n"
-                        "fmin z30.s, p7/m, z30.s, z15.s\n"
-                        "st1w z24.s, p0, [c_ptr2]\n"
-                        "fmin z31.s, p7/m, z31.s, z15.s\n"
-                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
-                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
-                        "st1w z28.s, p0, [c_ptr3]\n"
-                        "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
-                        "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
-                        "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
-                    );
-                    break;
-            }
-
-        }
-    }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp
new file mode 100644
index 0000000000..f0cc70b76e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __ARM_FEATURE_SVE
+
+#include "../std_transforms_sve.hpp"
+
+#define ARGLIST  \
+   unsigned int, const unsigned int *, \
+   IndirectInputArg<float>, \
+   size_t, size_t, \
+   const float *, \
+   IndirectOutputArg<float>, \
+   const float *, Activation, bool
+
+namespace arm_gemm
+{
+
+// Kernel entry point; declared here, defined in sve_hybrid_fp32_mla_6x4VL/generic.cpp. Parameters are the ARGLIST macro.
+void sve_hybrid_fp32_mla_6x4VL( ARGLIST );
+// Descriptor for the kernel declared above: exposes its element types, blocking parameters, transforms and entry point.
+class cls_sve_hybrid_fp32_mla_6x4VL
+{
+public:
+    typedef float operand_type; // forwarded to StdTransformsSVE below as the operand element type
+    typedef float result_type; // forwarded to StdTransformsSVE below as the result element type
+
+    typedef void (*kern_type)( ARGLIST ); // pointer type with the same parameter list as the kernel declaration
+
+    /* Kernel blocking parameters: out_height() x out_width() block size and the K unroll factor. */
+    static constexpr unsigned int out_height()
+    {
+        return 6; // same value as the third StdTransformsSVE template argument below
+    }
+
+    static unsigned int out_width() // the only blocking accessor that is not constexpr
+    {
+        return get_vector_length<float>() * 4; // four float vectors wide; presumably a runtime SVE length query - confirm in utils.hpp
+    }
+
+    static constexpr unsigned int k_unroll()
+    {
+        return 1; // same value as the fifth StdTransformsSVE template argument below
+    }
+
+    static constexpr bool supports_accumulate()
+    {
+        return true; // NOTE(review): presumably tells the caller it may pass accumulate=true - confirm against the GEMM driver
+    }
+
+    StdTransformsSVE<operand_type, result_type, 6, 4, 1> transforms = {}; // 6, 4, 1 match out_height(), the out_width() multiplier and k_unroll()
+
+    // Kernel entry point; always the generic implementation, since the constructor below never reassigns it.
+    kern_type kernel=sve_hybrid_fp32_mla_6x4VL;
+
+    cls_sve_hybrid_fp32_mla_6x4VL(const CPUInfo *) // CPUInfo pointer is unnamed and unused
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp
new file mode 100644
index 0000000000..3a6422abd1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp
@@ -0,0 +1,2236 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_hybrid_fp32_mla_6x4VL (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+    size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg,
+    const float *bias, Activation act, bool accumulate
+)
+{
+    struct KernelArgs {
+        float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+        float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+        unsigned int num_strings = {};
+        const unsigned int *string_lengths = {};
+        size_t N = {};
+        const float *B_ptr = {};
+        size_t output_offset = {};
+        size_t input_initial_col = {};
+        size_t input_offset = {};
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    if (accumulate) {
+        flags |= 0x1;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    switch(act.type) {
+        default:
+        case Activation::Type::None:
+            break;
+        case Activation::Type::BoundedReLU:
+            ka.maxval = static_cast<float>(act.param1);
+            /* fall through */
+        case Activation::Type::ReLU:
+            ka.minval = 0;
+            flags |= 0x2;
+            break;
+    }
+    __asm__ __volatile__(
+      "ptrue p5.b\n"
+      "1:"  // Row loop
+      "cmp %x[M], #0x6\n"
+      "bge 71f\n"
+      "cmp %x[M], #0x4\n"
+      "bgt 57f\n"
+      "beq 43f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 29f\n"
+      "beq 15f\n"
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 2f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "b 3f\n"
+      "2:"  // Height 1: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "3:"  // Height 1: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p3.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p2.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p1.s, x19, x16\n"
+      "cbz x14, 4f\n"
+      "ld1w { z8.s }, p5/Z, [x14]\n"
+      "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+      "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+      "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "b 6f\n"
+      "4:"  // Height 1: no bias
+      "tbz %x[flags], #0, 5f\n"
+      "ld1w { z8.s }, p4/Z, [x13]\n"
+      "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+      "b 6f\n"
+      "5:"  // Height 1: no accumulate
+      "mov z8.b, #0x0\n"
+      "mov z9.b, #0x0\n"
+      "mov z10.b, #0x0\n"
+      "mov z11.b, #0x0\n"
+      "6:"  // Height 1: setup done
+      "mov x12, #0x0\n"
+      "7:"  // Height 1: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 8f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "cbnz x12, 9f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #2\n"
+      "b 9f\n"
+      "8:"  // Height 1: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "9:"  // Height 1: input setup done
+      "cmp x11, #0x4\n"
+      "ble 11f\n"
+      "10:"  // Height 1: Multiply loop: Main loop head
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "whilelt p0.s, XZR, x11\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "sub x11, x11, #0x4\n"
+      "ld1rqw { z0.s }, p0/Z, [x10]\n"
+      "fmla z8.s, z6.s, z0.s[0]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z9.s, z7.s, z0.s[0]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "cmp x11, #0x4\n"
+      "fmla z10.s, z6.s, z0.s[0]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla z11.s, z7.s, z0.s[0]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n"
+      "fmla z8.s, z6.s, z0.s[1]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[1]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n"
+      "addvl x15, x15, #16\n"
+      "fmla z10.s, z6.s, z0.s[1]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n"
+      "fmla z11.s, z7.s, z0.s[1]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n"
+      "fmla z8.s, z6.s, z0.s[2]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[2]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n"
+      "fmla z10.s, z6.s, z0.s[2]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n"
+      "fmla z11.s, z7.s, z0.s[2]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n"
+      "fmla z8.s, z6.s, z0.s[3]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[3]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n"
+      "fmla z10.s, z6.s, z0.s[3]\n"
+      "fmla z11.s, z7.s, z0.s[3]\n"
+      "bgt 10b\n"
+      "11:"  // Height 1: Multiply loop: Single iteration only
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "whilelt p0.s, XZR, x11\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "ld1rqw { z0.s }, p0/Z, [x10]\n"
+      "fmla z8.s, z6.s, z0.s[0]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z9.s, z7.s, z0.s[0]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.s, z6.s, z0.s[0]\n"
+      "fmla z11.s, z7.s, z0.s[0]\n"
+      "ble 12f\n"
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "fmla z8.s, z6.s, z0.s[1]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z9.s, z7.s, z0.s[1]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "fmla z10.s, z6.s, z0.s[1]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z11.s, z7.s, z0.s[1]\n"
+      "ble 12f\n"
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "fmla z8.s, z6.s, z0.s[2]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z9.s, z7.s, z0.s[2]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "fmla z10.s, z6.s, z0.s[2]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z11.s, z7.s, z0.s[2]\n"
+      "ble 12f\n"
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "fmla z8.s, z6.s, z0.s[3]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[3]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.s, z6.s, z0.s[3]\n"
+      "fmla z11.s, z7.s, z0.s[3]\n"
+      "12:"  // Height 1: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 7b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "tbz %x[flags], #1, 13f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z1.s }, p5/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rw { z0.s }, p5/Z, [x19]\n"
+      "fmin z8.s, p5/M, z8.s, z0.s\n"
+      "fmin z9.s, p5/M, z9.s, z0.s\n"
+      "fmin z10.s, p5/M, z10.s, z0.s\n"
+      "fmin z11.s, p5/M, z11.s, z0.s\n"
+      "fmax z8.s, p5/M, z8.s, z1.s\n"
+      "fmax z9.s, p5/M, z9.s, z1.s\n"
+      "fmax z10.s, p5/M, z10.s, z1.s\n"
+      "fmax z11.s, p5/M, z11.s, z1.s\n"
+      "13:"  // Height 1: No activation
+      "st1w { z8.s }, p4, [x13]\n"
+      "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+      "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+      "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "14:"  // Height 1: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x16, x16, x19\n"
+      "bgt 3b\n"
+      "b 86f\n"
+      "15:"  // Height 2
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 16f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "b 17f\n"
+      "16:"  // Height 2: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "17:"  // Height 2: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p3.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p2.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p1.s, x19, x16\n"
+      "cbz x14, 18f\n"
+      "ld1w { z8.s }, p5/Z, [x14]\n"
+      "mov z12.d, z8.d\n"
+      "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+      "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+      "mov z13.d, z9.d\n"
+      "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "mov z14.d, z10.d\n"
+      "mov z15.d, z11.d\n"
+      "b 20f\n"
+      "18:"  // Height 2: no bias
+      "tbz %x[flags], #0, 19f\n"
+      "ld1w { z8.s }, p4/Z, [x13]\n"
+      "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x9]\n"
+      "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+      "b 20f\n"
+      "19:"  // Height 2: no accumulate
+      "mov z8.b, #0x0\n"
+      "mov z9.b, #0x0\n"
+      "mov z10.b, #0x0\n"
+      "mov z11.b, #0x0\n"
+      "mov z12.b, #0x0\n"
+      "mov z13.b, #0x0\n"
+      "mov z14.b, #0x0\n"
+      "mov z15.b, #0x0\n"
+      "20:"  // Height 2: setup done
+      "mov x12, #0x0\n"
+      "21:"  // Height 2: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 22f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "cbnz x12, 23f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #2\n"
+      "add x28, x28, x19, LSL #2\n"
+      "b 23f\n"
+      "22:"  // Height 2: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #2\n"
+      "23:"  // Height 2: input setup done
+      "cmp x11, #0x4\n"
+      "ble 25f\n"
+      "24:"  // Height 2: Multiply loop: Main loop head
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "whilelt p0.s, XZR, x11\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "sub x11, x11, #0x4\n"
+      "ld1rqw { z0.s }, p0/Z, [x10]\n"
+      "fmla z8.s, z6.s, z0.s[0]\n"
+      "ld1rqw { z1.s }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z9.s, z7.s, z0.s[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z12.s, z6.s, z1.s[0]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "cmp x11, #0x4\n"
+      "fmla z13.s, z7.s, z1.s[0]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "fmla z10.s, z6.s, z0.s[0]\n"
+      "fmla z14.s, z6.s, z1.s[0]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n"
+      "fmla z11.s, z7.s, z0.s[0]\n"
+      "fmla z15.s, z7.s, z1.s[0]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n"
+      "fmla z8.s, z6.s, z0.s[1]\n"
+      "fmla z12.s, z6.s, z1.s[1]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[1]\n"
+      "fmla z13.s, z7.s, z1.s[1]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n"
+      "addvl x15, x15, #16\n"
+      "fmla z10.s, z6.s, z0.s[1]\n"
+      "fmla z14.s, z6.s, z1.s[1]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n"
+      "fmla z11.s, z7.s, z0.s[1]\n"
+      "fmla z15.s, z7.s, z1.s[1]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n"
+      "fmla z8.s, z6.s, z0.s[2]\n"
+      "fmla z12.s, z6.s, z1.s[2]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[2]\n"
+      "fmla z13.s, z7.s, z1.s[2]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n"
+      "fmla z10.s, z6.s, z0.s[2]\n"
+      "fmla z14.s, z6.s, z1.s[2]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n"
+      "fmla z11.s, z7.s, z0.s[2]\n"
+      "fmla z15.s, z7.s, z1.s[2]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n"
+      "fmla z8.s, z6.s, z0.s[3]\n"
+      "fmla z12.s, z6.s, z1.s[3]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[3]\n"
+      "fmla z13.s, z7.s, z1.s[3]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n"
+      "fmla z10.s, z6.s, z0.s[3]\n"
+      "fmla z14.s, z6.s, z1.s[3]\n"
+      "fmla z11.s, z7.s, z0.s[3]\n"
+      "fmla z15.s, z7.s, z1.s[3]\n"
+      "bgt 24b\n"
+      "25:"  // Height 2: Multiply loop: Single iteration only
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "whilelt p0.s, XZR, x11\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "ld1rqw { z0.s }, p0/Z, [x10]\n"
+      "fmla z8.s, z6.s, z0.s[0]\n"
+      "ld1rqw { z1.s }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z9.s, z7.s, z0.s[0]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z12.s, z6.s, z1.s[0]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z13.s, z7.s, z1.s[0]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.s, z6.s, z0.s[0]\n"
+      "fmla z14.s, z6.s, z1.s[0]\n"
+      "fmla z11.s, z7.s, z0.s[0]\n"
+      "fmla z15.s, z7.s, z1.s[0]\n"
+      "ble 26f\n"
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "fmla z8.s, z6.s, z0.s[1]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.s, z6.s, z1.s[1]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[1]\n"
+      "fmla z13.s, z7.s, z1.s[1]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.s, z6.s, z0.s[1]\n"
+      "fmla z14.s, z6.s, z1.s[1]\n"
+      "fmla z11.s, z7.s, z0.s[1]\n"
+      "fmla z15.s, z7.s, z1.s[1]\n"
+      "ble 26f\n"
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "fmla z8.s, z6.s, z0.s[2]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.s, z6.s, z1.s[2]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[2]\n"
+      "fmla z13.s, z7.s, z1.s[2]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.s, z6.s, z0.s[2]\n"
+      "fmla z14.s, z6.s, z1.s[2]\n"
+      "fmla z11.s, z7.s, z0.s[2]\n"
+      "fmla z15.s, z7.s, z1.s[2]\n"
+      "ble 26f\n"
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "fmla z8.s, z6.s, z0.s[3]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "fmla z12.s, z6.s, z1.s[3]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[3]\n"
+      "fmla z13.s, z7.s, z1.s[3]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.s, z6.s, z0.s[3]\n"
+      "fmla z14.s, z6.s, z1.s[3]\n"
+      "fmla z11.s, z7.s, z0.s[3]\n"
+      "fmla z15.s, z7.s, z1.s[3]\n"
+      "26:"  // Height 2: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 21b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "tbz %x[flags], #1, 27f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z1.s }, p5/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rw { z0.s }, p5/Z, [x19]\n"
+      "fmin z8.s, p5/M, z8.s, z0.s\n"
+      "fmin z9.s, p5/M, z9.s, z0.s\n"
+      "fmin z10.s, p5/M, z10.s, z0.s\n"
+      "fmin z11.s, p5/M, z11.s, z0.s\n"
+      "fmin z12.s, p5/M, z12.s, z0.s\n"
+      "fmax z8.s, p5/M, z8.s, z1.s\n"
+      "fmax z9.s, p5/M, z9.s, z1.s\n"
+      "fmax z10.s, p5/M, z10.s, z1.s\n"
+      "fmax z11.s, p5/M, z11.s, z1.s\n"
+      "fmax z12.s, p5/M, z12.s, z1.s\n"
+      "fmin z13.s, p5/M, z13.s, z0.s\n"
+      "fmin z14.s, p5/M, z14.s, z0.s\n"
+      "fmin z15.s, p5/M, z15.s, z0.s\n"
+      "fmax z13.s, p5/M, z13.s, z1.s\n"
+      "fmax z14.s, p5/M, z14.s, z1.s\n"
+      "fmax z15.s, p5/M, z15.s, z1.s\n"
+      "27:"  // Height 2: No activation
+      "st1w { z8.s }, p4, [x13]\n"
+      "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+      "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+      "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "st1w { z12.s }, p4, [x9]\n"
+      "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+      "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+      "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+      "addvl x9, x9, #4\n"
+      "28:"  // Height 2: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x16, x16, x19\n"
+      "bgt 17b\n"
+      "b 86f\n"
+      "29:"  // Height 3
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 30f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "add x27, x27, x19, LSL #2\n"
+      "b 31f\n"
+      "30:"  // Height 3: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "31:"  // Height 3: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p3.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p2.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p1.s, x19, x16\n"
+      "cbz x14, 32f\n"
+      "ld1w { z8.s }, p5/Z, [x14]\n"
+      "mov z12.d, z8.d\n"
+      "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+      "mov z16.d, z8.d\n"
+      "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+      "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+      "mov z13.d, z9.d\n"
+      "addvl x14, x14, #4\n"
+      "mov z17.d, z9.d\n"
+      "mov z14.d, z10.d\n"
+      "mov z15.d, z11.d\n"
+      "mov z18.d, z10.d\n"
+      "mov z19.d, z11.d\n"
+      "b 34f\n"
+      "32:"  // Height 3: no bias
+      "tbz %x[flags], #0, 33f\n"
+      "ld1w { z8.s }, p4/Z, [x13]\n"
+      "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x9]\n"
+      "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x27]\n"
+      "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+      "b 34f\n"
+      "33:"  // Height 3: no accumulate
+      "mov z8.b, #0x0\n"
+      "mov z9.b, #0x0\n"
+      "mov z10.b, #0x0\n"
+      "mov z11.b, #0x0\n"
+      "mov z12.b, #0x0\n"
+      "mov z13.b, #0x0\n"
+      "mov z14.b, #0x0\n"
+      "mov z15.b, #0x0\n"
+      "mov z16.b, #0x0\n"
+      "mov z17.b, #0x0\n"
+      "mov z18.b, #0x0\n"
+      "mov z19.b, #0x0\n"
+      "34:"  // Height 3: setup done
+      "mov x12, #0x0\n"
+      "35:"  // Height 3: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 36f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "cbnz x12, 37f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #2\n"
+      "add x28, x28, x19, LSL #2\n"
+      "add x26, x26, x19, LSL #2\n"
+      "b 37f\n"
+      "36:"  // Height 3: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #2\n"
+      "add x26, x28, x19, LSL #2\n"
+      "37:"  // Height 3: input setup done
+      "cmp x11, #0x4\n"
+      "ble 39f\n"
+      "38:"  // Height 3: Multiply loop: Main loop head
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "whilelt p0.s, XZR, x11\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "sub x11, x11, #0x4\n"
+      "ld1rqw { z0.s }, p0/Z, [x10]\n"
+      "fmla z8.s, z6.s, z0.s[0]\n"
+      "ld1rqw { z1.s }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z9.s, z7.s, z0.s[0]\n"
+      "ld1rqw { z2.s }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z12.s, z6.s, z1.s[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      "fmla z16.s, z6.s, z2.s[0]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "cmp x11, #0x4\n"
+      "fmla z13.s, z7.s, z1.s[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "fmla z17.s, z7.s, z2.s[0]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "fmla z10.s, z6.s, z0.s[0]\n"
+      "fmla z14.s, z6.s, z1.s[0]\n"
+      "fmla z18.s, z6.s, z2.s[0]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n"
+      "fmla z11.s, z7.s, z0.s[0]\n"
+      "fmla z15.s, z7.s, z1.s[0]\n"
+      "fmla z19.s, z7.s, z2.s[0]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n"
+      "fmla z8.s, z6.s, z0.s[1]\n"
+      "fmla z12.s, z6.s, z1.s[1]\n"
+      "fmla z16.s, z6.s, z2.s[1]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[1]\n"
+      "fmla z13.s, z7.s, z1.s[1]\n"
+      "fmla z17.s, z7.s, z2.s[1]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n"
+      "addvl x15, x15, #16\n"
+      "fmla z10.s, z6.s, z0.s[1]\n"
+      "fmla z14.s, z6.s, z1.s[1]\n"
+      "fmla z18.s, z6.s, z2.s[1]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n"
+      "fmla z11.s, z7.s, z0.s[1]\n"
+      "fmla z15.s, z7.s, z1.s[1]\n"
+      "fmla z19.s, z7.s, z2.s[1]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n"
+      "fmla z8.s, z6.s, z0.s[2]\n"
+      "fmla z12.s, z6.s, z1.s[2]\n"
+      "fmla z16.s, z6.s, z2.s[2]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[2]\n"
+      "fmla z13.s, z7.s, z1.s[2]\n"
+      "fmla z17.s, z7.s, z2.s[2]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n"
+      "fmla z10.s, z6.s, z0.s[2]\n"
+      "fmla z14.s, z6.s, z1.s[2]\n"
+      "fmla z18.s, z6.s, z2.s[2]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n"
+      "fmla z11.s, z7.s, z0.s[2]\n"
+      "fmla z15.s, z7.s, z1.s[2]\n"
+      "fmla z19.s, z7.s, z2.s[2]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n"
+      "fmla z8.s, z6.s, z0.s[3]\n"
+      "fmla z12.s, z6.s, z1.s[3]\n"
+      "fmla z16.s, z6.s, z2.s[3]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[3]\n"
+      "fmla z13.s, z7.s, z1.s[3]\n"
+      "fmla z17.s, z7.s, z2.s[3]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n"
+      "fmla z10.s, z6.s, z0.s[3]\n"
+      "fmla z14.s, z6.s, z1.s[3]\n"
+      "fmla z18.s, z6.s, z2.s[3]\n"
+      "fmla z11.s, z7.s, z0.s[3]\n"
+      "fmla z15.s, z7.s, z1.s[3]\n"
+      "fmla z19.s, z7.s, z2.s[3]\n"
+      "bgt 38b\n"
+      "39:"  // Height 3: Multiply loop: Single iteration only
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "whilelt p0.s, XZR, x11\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "ld1rqw { z0.s }, p0/Z, [x10]\n"
+      "fmla z8.s, z6.s, z0.s[0]\n"
+      "ld1rqw { z1.s }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z9.s, z7.s, z0.s[0]\n"
+      "ld1rqw { z2.s }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z12.s, z6.s, z1.s[0]\n"
+      "add x26, x26, #0x10\n"
+      "fmla z13.s, z7.s, z1.s[0]\n"
+      "fmla z16.s, z6.s, z2.s[0]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z17.s, z7.s, z2.s[0]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.s, z6.s, z0.s[0]\n"
+      "fmla z14.s, z6.s, z1.s[0]\n"
+      "fmla z18.s, z6.s, z2.s[0]\n"
+      "fmla z11.s, z7.s, z0.s[0]\n"
+      "fmla z15.s, z7.s, z1.s[0]\n"
+      "fmla z19.s, z7.s, z2.s[0]\n"
+      "ble 40f\n"
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "fmla z8.s, z6.s, z0.s[1]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.s, z6.s, z1.s[1]\n"
+      "fmla z16.s, z6.s, z2.s[1]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[1]\n"
+      "fmla z13.s, z7.s, z1.s[1]\n"
+      "fmla z17.s, z7.s, z2.s[1]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.s, z6.s, z0.s[1]\n"
+      "fmla z14.s, z6.s, z1.s[1]\n"
+      "fmla z18.s, z6.s, z2.s[1]\n"
+      "fmla z11.s, z7.s, z0.s[1]\n"
+      "fmla z15.s, z7.s, z1.s[1]\n"
+      "fmla z19.s, z7.s, z2.s[1]\n"
+      "ble 40f\n"
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "fmla z8.s, z6.s, z0.s[2]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.s, z6.s, z1.s[2]\n"
+      "fmla z16.s, z6.s, z2.s[2]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[2]\n"
+      "fmla z13.s, z7.s, z1.s[2]\n"
+      "fmla z17.s, z7.s, z2.s[2]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.s, z6.s, z0.s[2]\n"
+      "fmla z14.s, z6.s, z1.s[2]\n"
+      "fmla z18.s, z6.s, z2.s[2]\n"
+      "fmla z11.s, z7.s, z0.s[2]\n"
+      "fmla z15.s, z7.s, z1.s[2]\n"
+      "fmla z19.s, z7.s, z2.s[2]\n"
+      "ble 40f\n"
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "fmla z8.s, z6.s, z0.s[3]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "fmla z12.s, z6.s, z1.s[3]\n"
+      "fmla z16.s, z6.s, z2.s[3]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[3]\n"
+      "fmla z13.s, z7.s, z1.s[3]\n"
+      "fmla z17.s, z7.s, z2.s[3]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.s, z6.s, z0.s[3]\n"
+      "fmla z14.s, z6.s, z1.s[3]\n"
+      "fmla z18.s, z6.s, z2.s[3]\n"
+      "fmla z11.s, z7.s, z0.s[3]\n"
+      "fmla z15.s, z7.s, z1.s[3]\n"
+      "fmla z19.s, z7.s, z2.s[3]\n"
+      "40:"  // Height 3: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 35b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "tbz %x[flags], #1, 41f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z1.s }, p5/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rw { z0.s }, p5/Z, [x19]\n"
+      "fmin z8.s, p5/M, z8.s, z0.s\n"
+      "fmin z9.s, p5/M, z9.s, z0.s\n"
+      "fmin z10.s, p5/M, z10.s, z0.s\n"
+      "fmin z11.s, p5/M, z11.s, z0.s\n"
+      "fmin z12.s, p5/M, z12.s, z0.s\n"
+      "fmax z8.s, p5/M, z8.s, z1.s\n"
+      "fmax z9.s, p5/M, z9.s, z1.s\n"
+      "fmax z10.s, p5/M, z10.s, z1.s\n"
+      "fmax z11.s, p5/M, z11.s, z1.s\n"
+      "fmax z12.s, p5/M, z12.s, z1.s\n"
+      "fmin z13.s, p5/M, z13.s, z0.s\n"
+      "fmin z14.s, p5/M, z14.s, z0.s\n"
+      "fmin z15.s, p5/M, z15.s, z0.s\n"
+      "fmin z16.s, p5/M, z16.s, z0.s\n"
+      "fmax z13.s, p5/M, z13.s, z1.s\n"
+      "fmax z14.s, p5/M, z14.s, z1.s\n"
+      "fmax z15.s, p5/M, z15.s, z1.s\n"
+      "fmax z16.s, p5/M, z16.s, z1.s\n"
+      "fmin z17.s, p5/M, z17.s, z0.s\n"
+      "fmin z18.s, p5/M, z18.s, z0.s\n"
+      "fmin z19.s, p5/M, z19.s, z0.s\n"
+      "fmax z17.s, p5/M, z17.s, z1.s\n"
+      "fmax z18.s, p5/M, z18.s, z1.s\n"
+      "fmax z19.s, p5/M, z19.s, z1.s\n"
+      "41:"  // Height 3: No activation
+      "st1w { z8.s }, p4, [x13]\n"
+      "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+      "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+      "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "st1w { z12.s }, p4, [x9]\n"
+      "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+      "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+      "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+      "addvl x9, x9, #4\n"
+      "st1w { z16.s }, p4, [x27]\n"
+      "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+      "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+      "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+      "addvl x27, x27, #4\n"
+      "42:"  // Height 3: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x16, x16, x19\n"
+      "bgt 31b\n"
+      "b 86f\n"
+      "43:"  // Height 4
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 44f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "add x27, x27, x19, LSL #2\n"
+      "add x25, x25, x19, LSL #2\n"
+      "b 45f\n"
+      "44:"  // Height 4: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "45:"  // Height 4: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p3.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p2.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p1.s, x19, x16\n"
+      "cbz x14, 46f\n"
+      "ld1w { z8.s }, p5/Z, [x14]\n"
+      "mov z12.d, z8.d\n"
+      "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+      "mov z16.d, z8.d\n"
+      "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+      "mov z20.d, z8.d\n"
+      "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "mov z13.d, z9.d\n"
+      "mov z17.d, z9.d\n"
+      "mov z14.d, z10.d\n"
+      "mov z15.d, z11.d\n"
+      "mov z18.d, z10.d\n"
+      "mov z19.d, z11.d\n"
+      "mov z21.d, z9.d\n"
+      "mov z22.d, z10.d\n"
+      "mov z23.d, z11.d\n"
+      "b 48f\n"
+      "46:"  // Height 4: no bias
+      "tbz %x[flags], #0, 47f\n"
+      "ld1w { z8.s }, p4/Z, [x13]\n"
+      "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x9]\n"
+      "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x27]\n"
+      "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x25]\n"
+      "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+      "b 48f\n"
+      "47:"  // Height 4: no accumulate
+      "mov z8.b, #0x0\n"
+      "mov z9.b, #0x0\n"
+      "mov z10.b, #0x0\n"
+      "mov z11.b, #0x0\n"
+      "mov z12.b, #0x0\n"
+      "mov z13.b, #0x0\n"
+      "mov z14.b, #0x0\n"
+      "mov z15.b, #0x0\n"
+      "mov z16.b, #0x0\n"
+      "mov z17.b, #0x0\n"
+      "mov z18.b, #0x0\n"
+      "mov z19.b, #0x0\n"
+      "mov z20.b, #0x0\n"
+      "mov z21.b, #0x0\n"
+      "mov z22.b, #0x0\n"
+      "mov z23.b, #0x0\n"
+      "48:"  // Height 4: setup done
+      "mov x12, #0x0\n"
+      "49:"  // Height 4: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 50f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "cbnz x12, 51f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #2\n"
+      "add x28, x28, x19, LSL #2\n"
+      "add x26, x26, x19, LSL #2\n"
+      "add x24, x24, x19, LSL #2\n"
+      "b 51f\n"
+      "50:"  // Height 4: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #2\n"
+      "add x26, x28, x19, LSL #2\n"
+      "add x24, x26, x19, LSL #2\n"
+      "51:"  // Height 4: input setup done
+      "cmp x11, #0x4\n"
+      "ble 53f\n"
+      "52:"  // Height 4: Multiply loop: Main loop head
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "whilelt p0.s, XZR, x11\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "sub x11, x11, #0x4\n"
+      "ld1rqw { z0.s }, p0/Z, [x10]\n"
+      "fmla z8.s, z6.s, z0.s[0]\n"
+      "ld1rqw { z1.s }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z9.s, z7.s, z0.s[0]\n"
+      "ld1rqw { z2.s }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z12.s, z6.s, z1.s[0]\n"
+      "ld1rqw { z3.s }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "fmla z16.s, z6.s, z2.s[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x24, x24, #0x10\n"
+      "fmla z13.s, z7.s, z1.s[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "cmp x11, #0x4\n"
+      "fmla z20.s, z6.s, z3.s[0]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z17.s, z7.s, z2.s[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "fmla z21.s, z7.s, z3.s[0]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "fmla z10.s, z6.s, z0.s[0]\n"
+      "fmla z14.s, z6.s, z1.s[0]\n"
+      "fmla z18.s, z6.s, z2.s[0]\n"
+      "fmla z22.s, z6.s, z3.s[0]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n"
+      "fmla z11.s, z7.s, z0.s[0]\n"
+      "fmla z15.s, z7.s, z1.s[0]\n"
+      "fmla z19.s, z7.s, z2.s[0]\n"
+      "fmla z23.s, z7.s, z3.s[0]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n"
+      "fmla z8.s, z6.s, z0.s[1]\n"
+      "fmla z12.s, z6.s, z1.s[1]\n"
+      "fmla z16.s, z6.s, z2.s[1]\n"
+      "fmla z20.s, z6.s, z3.s[1]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[1]\n"
+      "fmla z13.s, z7.s, z1.s[1]\n"
+      "fmla z17.s, z7.s, z2.s[1]\n"
+      "fmla z21.s, z7.s, z3.s[1]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n"
+      "addvl x15, x15, #16\n"
+      "fmla z10.s, z6.s, z0.s[1]\n"
+      "fmla z14.s, z6.s, z1.s[1]\n"
+      "fmla z18.s, z6.s, z2.s[1]\n"
+      "fmla z22.s, z6.s, z3.s[1]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n"
+      "fmla z11.s, z7.s, z0.s[1]\n"
+      "fmla z15.s, z7.s, z1.s[1]\n"
+      "fmla z19.s, z7.s, z2.s[1]\n"
+      "fmla z23.s, z7.s, z3.s[1]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n"
+      "fmla z8.s, z6.s, z0.s[2]\n"
+      "fmla z12.s, z6.s, z1.s[2]\n"
+      "fmla z16.s, z6.s, z2.s[2]\n"
+      "fmla z20.s, z6.s, z3.s[2]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[2]\n"
+      "fmla z13.s, z7.s, z1.s[2]\n"
+      "fmla z17.s, z7.s, z2.s[2]\n"
+      "fmla z21.s, z7.s, z3.s[2]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n"
+      "fmla z10.s, z6.s, z0.s[2]\n"
+      "fmla z14.s, z6.s, z1.s[2]\n"
+      "fmla z18.s, z6.s, z2.s[2]\n"
+      "fmla z22.s, z6.s, z3.s[2]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n"
+      "fmla z11.s, z7.s, z0.s[2]\n"
+      "fmla z15.s, z7.s, z1.s[2]\n"
+      "fmla z19.s, z7.s, z2.s[2]\n"
+      "fmla z23.s, z7.s, z3.s[2]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n"
+      "fmla z8.s, z6.s, z0.s[3]\n"
+      "fmla z12.s, z6.s, z1.s[3]\n"
+      "fmla z16.s, z6.s, z2.s[3]\n"
+      "fmla z20.s, z6.s, z3.s[3]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[3]\n"
+      "fmla z13.s, z7.s, z1.s[3]\n"
+      "fmla z17.s, z7.s, z2.s[3]\n"
+      "fmla z21.s, z7.s, z3.s[3]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n"
+      "fmla z10.s, z6.s, z0.s[3]\n"
+      "fmla z14.s, z6.s, z1.s[3]\n"
+      "fmla z18.s, z6.s, z2.s[3]\n"
+      "fmla z22.s, z6.s, z3.s[3]\n"
+      "fmla z11.s, z7.s, z0.s[3]\n"
+      "fmla z15.s, z7.s, z1.s[3]\n"
+      "fmla z19.s, z7.s, z2.s[3]\n"
+      "fmla z23.s, z7.s, z3.s[3]\n"
+      "bgt 52b\n"
+      "53:"  // Height 4: Multiply loop: Single iteration only
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "whilelt p0.s, XZR, x11\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "ld1rqw { z0.s }, p0/Z, [x10]\n"
+      "fmla z8.s, z6.s, z0.s[0]\n"
+      "ld1rqw { z1.s }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z9.s, z7.s, z0.s[0]\n"
+      "ld1rqw { z2.s }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z12.s, z6.s, z1.s[0]\n"
+      "ld1rqw { z3.s }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "fmla z16.s, z6.s, z2.s[0]\n"
+      "add x24, x24, #0x10\n"
+      "fmla z13.s, z7.s, z1.s[0]\n"
+      "fmla z17.s, z7.s, z2.s[0]\n"
+      "fmla z20.s, z6.s, z3.s[0]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z21.s, z7.s, z3.s[0]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.s, z6.s, z0.s[0]\n"
+      "fmla z14.s, z6.s, z1.s[0]\n"
+      "fmla z18.s, z6.s, z2.s[0]\n"
+      "fmla z22.s, z6.s, z3.s[0]\n"
+      "fmla z11.s, z7.s, z0.s[0]\n"
+      "fmla z15.s, z7.s, z1.s[0]\n"
+      "fmla z19.s, z7.s, z2.s[0]\n"
+      "fmla z23.s, z7.s, z3.s[0]\n"
+      "ble 54f\n"
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "fmla z8.s, z6.s, z0.s[1]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.s, z6.s, z1.s[1]\n"
+      "fmla z16.s, z6.s, z2.s[1]\n"
+      "fmla z20.s, z6.s, z3.s[1]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[1]\n"
+      "fmla z13.s, z7.s, z1.s[1]\n"
+      "fmla z17.s, z7.s, z2.s[1]\n"
+      "fmla z21.s, z7.s, z3.s[1]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.s, z6.s, z0.s[1]\n"
+      "fmla z14.s, z6.s, z1.s[1]\n"
+      "fmla z18.s, z6.s, z2.s[1]\n"
+      "fmla z22.s, z6.s, z3.s[1]\n"
+      "fmla z11.s, z7.s, z0.s[1]\n"
+      "fmla z15.s, z7.s, z1.s[1]\n"
+      "fmla z19.s, z7.s, z2.s[1]\n"
+      "fmla z23.s, z7.s, z3.s[1]\n"
+      "ble 54f\n"
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "fmla z8.s, z6.s, z0.s[2]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.s, z6.s, z1.s[2]\n"
+      "fmla z16.s, z6.s, z2.s[2]\n"
+      "fmla z20.s, z6.s, z3.s[2]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[2]\n"
+      "fmla z13.s, z7.s, z1.s[2]\n"
+      "fmla z17.s, z7.s, z2.s[2]\n"
+      "fmla z21.s, z7.s, z3.s[2]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.s, z6.s, z0.s[2]\n"
+      "fmla z14.s, z6.s, z1.s[2]\n"
+      "fmla z18.s, z6.s, z2.s[2]\n"
+      "fmla z22.s, z6.s, z3.s[2]\n"
+      "fmla z11.s, z7.s, z0.s[2]\n"
+      "fmla z15.s, z7.s, z1.s[2]\n"
+      "fmla z19.s, z7.s, z2.s[2]\n"
+      "fmla z23.s, z7.s, z3.s[2]\n"
+      "ble 54f\n"
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "fmla z8.s, z6.s, z0.s[3]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "fmla z12.s, z6.s, z1.s[3]\n"
+      "fmla z16.s, z6.s, z2.s[3]\n"
+      "fmla z20.s, z6.s, z3.s[3]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[3]\n"
+      "fmla z13.s, z7.s, z1.s[3]\n"
+      "fmla z17.s, z7.s, z2.s[3]\n"
+      "fmla z21.s, z7.s, z3.s[3]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.s, z6.s, z0.s[3]\n"
+      "fmla z14.s, z6.s, z1.s[3]\n"
+      "fmla z18.s, z6.s, z2.s[3]\n"
+      "fmla z22.s, z6.s, z3.s[3]\n"
+      "fmla z11.s, z7.s, z0.s[3]\n"
+      "fmla z15.s, z7.s, z1.s[3]\n"
+      "fmla z19.s, z7.s, z2.s[3]\n"
+      "fmla z23.s, z7.s, z3.s[3]\n"
+      "54:"  // Height 4: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 49b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "tbz %x[flags], #1, 55f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z1.s }, p5/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rw { z0.s }, p5/Z, [x19]\n"
+      "fmin z8.s, p5/M, z8.s, z0.s\n"
+      "fmin z9.s, p5/M, z9.s, z0.s\n"
+      "fmin z10.s, p5/M, z10.s, z0.s\n"
+      "fmin z11.s, p5/M, z11.s, z0.s\n"
+      "fmin z12.s, p5/M, z12.s, z0.s\n"
+      "fmax z8.s, p5/M, z8.s, z1.s\n"
+      "fmax z9.s, p5/M, z9.s, z1.s\n"
+      "fmax z10.s, p5/M, z10.s, z1.s\n"
+      "fmax z11.s, p5/M, z11.s, z1.s\n"
+      "fmax z12.s, p5/M, z12.s, z1.s\n"
+      "fmin z13.s, p5/M, z13.s, z0.s\n"
+      "fmin z14.s, p5/M, z14.s, z0.s\n"
+      "fmin z15.s, p5/M, z15.s, z0.s\n"
+      "fmin z16.s, p5/M, z16.s, z0.s\n"
+      "fmax z13.s, p5/M, z13.s, z1.s\n"
+      "fmax z14.s, p5/M, z14.s, z1.s\n"
+      "fmax z15.s, p5/M, z15.s, z1.s\n"
+      "fmax z16.s, p5/M, z16.s, z1.s\n"
+      "fmin z17.s, p5/M, z17.s, z0.s\n"
+      "fmin z18.s, p5/M, z18.s, z0.s\n"
+      "fmin z19.s, p5/M, z19.s, z0.s\n"
+      "fmin z20.s, p5/M, z20.s, z0.s\n"
+      "fmax z17.s, p5/M, z17.s, z1.s\n"
+      "fmax z18.s, p5/M, z18.s, z1.s\n"
+      "fmax z19.s, p5/M, z19.s, z1.s\n"
+      "fmax z20.s, p5/M, z20.s, z1.s\n"
+      "fmin z21.s, p5/M, z21.s, z0.s\n"
+      "fmin z22.s, p5/M, z22.s, z0.s\n"
+      "fmin z23.s, p5/M, z23.s, z0.s\n"
+      "fmax z21.s, p5/M, z21.s, z1.s\n"
+      "fmax z22.s, p5/M, z22.s, z1.s\n"
+      "fmax z23.s, p5/M, z23.s, z1.s\n"
+      "55:"  // Height 4: No activation
+      "st1w { z8.s }, p4, [x13]\n"
+      "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+      "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+      "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "st1w { z12.s }, p4, [x9]\n"
+      "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+      "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+      "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+      "addvl x9, x9, #4\n"
+      "st1w { z16.s }, p4, [x27]\n"
+      "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+      "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+      "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+      "addvl x27, x27, #4\n"
+      "st1w { z20.s }, p4, [x25]\n"
+      "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+      "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+      "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+      "addvl x25, x25, #4\n"
+      "56:"  // Height 4: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x16, x16, x19\n"
+      "bgt 45b\n"
+      "b 86f\n"
+      "57:"  // Height 5
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 58f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "ldr x23, [%x[output_ptr], #0x20]\n"
+      "add x27, x27, x19, LSL #2\n"
+      "add x25, x25, x19, LSL #2\n"
+      "add x23, x23, x19, LSL #2\n"
+      "b 59f\n"
+      "58:"  // Height 5: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "add x23, x25, x19, LSL #2\n"
+      "59:"  // Height 5: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p3.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p2.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p1.s, x19, x16\n"
+      "cbz x14, 60f\n"
+      "ld1w { z8.s }, p5/Z, [x14]\n"
+      "mov z12.d, z8.d\n"
+      "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+      "mov z16.d, z8.d\n"
+      "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+      "mov z20.d, z8.d\n"
+      "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "mov z13.d, z9.d\n"
+      "mov z17.d, z9.d\n"
+      "mov z14.d, z10.d\n"
+      "mov z15.d, z11.d\n"
+      "mov z18.d, z10.d\n"
+      "mov z19.d, z11.d\n"
+      "mov z21.d, z9.d\n"
+      "mov z22.d, z10.d\n"
+      "mov z23.d, z11.d\n"
+      "mov z24.d, z8.d\n"
+      "mov z25.d, z9.d\n"
+      "mov z26.d, z10.d\n"
+      "mov z27.d, z11.d\n"
+      "b 62f\n"
+      "60:"  // Height 5: no bias
+      "tbz %x[flags], #0, 61f\n"
+      "ld1w { z8.s }, p4/Z, [x13]\n"
+      "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x9]\n"
+      "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x27]\n"
+      "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x25]\n"
+      "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+      "ld1w { z24.s }, p4/Z, [x23]\n"
+      "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "b 62f\n"
+      "61:"  // Height 5: no accumulate
+      "mov z8.b, #0x0\n"
+      "mov z9.b, #0x0\n"
+      "mov z10.b, #0x0\n"
+      "mov z11.b, #0x0\n"
+      "mov z12.b, #0x0\n"
+      "mov z13.b, #0x0\n"
+      "mov z14.b, #0x0\n"
+      "mov z15.b, #0x0\n"
+      "mov z16.b, #0x0\n"
+      "mov z17.b, #0x0\n"
+      "mov z18.b, #0x0\n"
+      "mov z19.b, #0x0\n"
+      "mov z20.b, #0x0\n"
+      "mov z21.b, #0x0\n"
+      "mov z22.b, #0x0\n"
+      "mov z23.b, #0x0\n"
+      "mov z24.b, #0x0\n"
+      "mov z25.b, #0x0\n"
+      "mov z26.b, #0x0\n"
+      "mov z27.b, #0x0\n"
+      "62:"  // Height 5: setup done
+      "mov x12, #0x0\n"
+      "63:"  // Height 5: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 64f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "cbnz x12, 65f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #2\n"
+      "add x28, x28, x19, LSL #2\n"
+      "add x26, x26, x19, LSL #2\n"
+      "add x24, x24, x19, LSL #2\n"
+      "add x22, x22, x19, LSL #2\n"
+      "b 65f\n"
+      "64:"  // Height 5: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #2\n"
+      "add x26, x28, x19, LSL #2\n"
+      "add x24, x26, x19, LSL #2\n"
+      "add x22, x24, x19, LSL #2\n"
+      "65:"  // Height 5: input setup done
+      "cmp x11, #0x4\n"
+      "ble 67f\n"
+      "66:"  // Height 5: Multiply loop: Main loop head
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "whilelt p0.s, XZR, x11\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "sub x11, x11, #0x4\n"
+      "ld1rqw { z0.s }, p0/Z, [x10]\n"
+      "fmla z8.s, z6.s, z0.s[0]\n"
+      "ld1rqw { z1.s }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z9.s, z7.s, z0.s[0]\n"
+      "ld1rqw { z2.s }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z12.s, z6.s, z1.s[0]\n"
+      "ld1rqw { z3.s }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "fmla z16.s, z6.s, z2.s[0]\n"
+      "ld1rqw { z4.s }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "fmla z13.s, z7.s, z1.s[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x22, x22, #0x10\n"
+      "fmla z20.s, z6.s, z3.s[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "cmp x11, #0x4\n"
+      "fmla z24.s, z6.s, z4.s[0]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z17.s, z7.s, z2.s[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "fmla z21.s, z7.s, z3.s[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "fmla z25.s, z7.s, z4.s[0]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "fmla z10.s, z6.s, z0.s[0]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "fmla z14.s, z6.s, z1.s[0]\n"
+      "fmla z18.s, z6.s, z2.s[0]\n"
+      "fmla z22.s, z6.s, z3.s[0]\n"
+      "fmla z26.s, z6.s, z4.s[0]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n"
+      "fmla z11.s, z7.s, z0.s[0]\n"
+      "fmla z15.s, z7.s, z1.s[0]\n"
+      "fmla z19.s, z7.s, z2.s[0]\n"
+      "fmla z23.s, z7.s, z3.s[0]\n"
+      "fmla z27.s, z7.s, z4.s[0]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n"
+      "fmla z8.s, z6.s, z0.s[1]\n"
+      "fmla z12.s, z6.s, z1.s[1]\n"
+      "fmla z16.s, z6.s, z2.s[1]\n"
+      "fmla z20.s, z6.s, z3.s[1]\n"
+      "fmla z24.s, z6.s, z4.s[1]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[1]\n"
+      "fmla z13.s, z7.s, z1.s[1]\n"
+      "fmla z17.s, z7.s, z2.s[1]\n"
+      "fmla z21.s, z7.s, z3.s[1]\n"
+      "fmla z25.s, z7.s, z4.s[1]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n"
+      "addvl x15, x15, #16\n"
+      "fmla z10.s, z6.s, z0.s[1]\n"
+      "fmla z14.s, z6.s, z1.s[1]\n"
+      "fmla z18.s, z6.s, z2.s[1]\n"
+      "fmla z22.s, z6.s, z3.s[1]\n"
+      "fmla z26.s, z6.s, z4.s[1]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n"
+      "fmla z11.s, z7.s, z0.s[1]\n"
+      "fmla z15.s, z7.s, z1.s[1]\n"
+      "fmla z19.s, z7.s, z2.s[1]\n"
+      "fmla z23.s, z7.s, z3.s[1]\n"
+      "fmla z27.s, z7.s, z4.s[1]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n"
+      "fmla z8.s, z6.s, z0.s[2]\n"
+      "fmla z12.s, z6.s, z1.s[2]\n"
+      "fmla z16.s, z6.s, z2.s[2]\n"
+      "fmla z20.s, z6.s, z3.s[2]\n"
+      "fmla z24.s, z6.s, z4.s[2]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[2]\n"
+      "fmla z13.s, z7.s, z1.s[2]\n"
+      "fmla z17.s, z7.s, z2.s[2]\n"
+      "fmla z21.s, z7.s, z3.s[2]\n"
+      "fmla z25.s, z7.s, z4.s[2]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n"
+      "fmla z10.s, z6.s, z0.s[2]\n"
+      "fmla z14.s, z6.s, z1.s[2]\n"
+      "fmla z18.s, z6.s, z2.s[2]\n"
+      "fmla z22.s, z6.s, z3.s[2]\n"
+      "fmla z26.s, z6.s, z4.s[2]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n"
+      "fmla z11.s, z7.s, z0.s[2]\n"
+      "fmla z15.s, z7.s, z1.s[2]\n"
+      "fmla z19.s, z7.s, z2.s[2]\n"
+      "fmla z23.s, z7.s, z3.s[2]\n"
+      "fmla z27.s, z7.s, z4.s[2]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n"
+      "fmla z8.s, z6.s, z0.s[3]\n"
+      "fmla z12.s, z6.s, z1.s[3]\n"
+      "fmla z16.s, z6.s, z2.s[3]\n"
+      "fmla z20.s, z6.s, z3.s[3]\n"
+      "fmla z24.s, z6.s, z4.s[3]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[3]\n"
+      "fmla z13.s, z7.s, z1.s[3]\n"
+      "fmla z17.s, z7.s, z2.s[3]\n"
+      "fmla z21.s, z7.s, z3.s[3]\n"
+      "fmla z25.s, z7.s, z4.s[3]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n"
+      "fmla z10.s, z6.s, z0.s[3]\n"
+      "fmla z14.s, z6.s, z1.s[3]\n"
+      "fmla z18.s, z6.s, z2.s[3]\n"
+      "fmla z22.s, z6.s, z3.s[3]\n"
+      "fmla z26.s, z6.s, z4.s[3]\n"
+      "fmla z11.s, z7.s, z0.s[3]\n"
+      "fmla z15.s, z7.s, z1.s[3]\n"
+      "fmla z19.s, z7.s, z2.s[3]\n"
+      "fmla z23.s, z7.s, z3.s[3]\n"
+      "fmla z27.s, z7.s, z4.s[3]\n"
+      "bgt 66b\n"
+      "67:"  // Height 5: Multiply loop: Single iteration only
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "whilelt p0.s, XZR, x11\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "ld1rqw { z0.s }, p0/Z, [x10]\n"
+      "fmla z8.s, z6.s, z0.s[0]\n"
+      "ld1rqw { z1.s }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z9.s, z7.s, z0.s[0]\n"
+      "ld1rqw { z2.s }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z12.s, z6.s, z1.s[0]\n"
+      "ld1rqw { z3.s }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "fmla z16.s, z6.s, z2.s[0]\n"
+      "ld1rqw { z4.s }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "fmla z13.s, z7.s, z1.s[0]\n"
+      "add x22, x22, #0x10\n"
+      "fmla z17.s, z7.s, z2.s[0]\n"
+      "fmla z20.s, z6.s, z3.s[0]\n"
+      "fmla z24.s, z6.s, z4.s[0]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z21.s, z7.s, z3.s[0]\n"
+      "fmla z25.s, z7.s, z4.s[0]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.s, z6.s, z0.s[0]\n"
+      "fmla z14.s, z6.s, z1.s[0]\n"
+      "fmla z18.s, z6.s, z2.s[0]\n"
+      "fmla z22.s, z6.s, z3.s[0]\n"
+      "fmla z26.s, z6.s, z4.s[0]\n"
+      "fmla z11.s, z7.s, z0.s[0]\n"
+      "fmla z15.s, z7.s, z1.s[0]\n"
+      "fmla z19.s, z7.s, z2.s[0]\n"
+      "fmla z23.s, z7.s, z3.s[0]\n"
+      "fmla z27.s, z7.s, z4.s[0]\n"
+      "ble 68f\n"
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "fmla z8.s, z6.s, z0.s[1]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.s, z6.s, z1.s[1]\n"
+      "fmla z16.s, z6.s, z2.s[1]\n"
+      "fmla z20.s, z6.s, z3.s[1]\n"
+      "fmla z24.s, z6.s, z4.s[1]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[1]\n"
+      "fmla z13.s, z7.s, z1.s[1]\n"
+      "fmla z17.s, z7.s, z2.s[1]\n"
+      "fmla z21.s, z7.s, z3.s[1]\n"
+      "fmla z25.s, z7.s, z4.s[1]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.s, z6.s, z0.s[1]\n"
+      "fmla z14.s, z6.s, z1.s[1]\n"
+      "fmla z18.s, z6.s, z2.s[1]\n"
+      "fmla z22.s, z6.s, z3.s[1]\n"
+      "fmla z26.s, z6.s, z4.s[1]\n"
+      "fmla z11.s, z7.s, z0.s[1]\n"
+      "fmla z15.s, z7.s, z1.s[1]\n"
+      "fmla z19.s, z7.s, z2.s[1]\n"
+      "fmla z23.s, z7.s, z3.s[1]\n"
+      "fmla z27.s, z7.s, z4.s[1]\n"
+      "ble 68f\n"
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "fmla z8.s, z6.s, z0.s[2]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.s, z6.s, z1.s[2]\n"
+      "fmla z16.s, z6.s, z2.s[2]\n"
+      "fmla z20.s, z6.s, z3.s[2]\n"
+      "fmla z24.s, z6.s, z4.s[2]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[2]\n"
+      "fmla z13.s, z7.s, z1.s[2]\n"
+      "fmla z17.s, z7.s, z2.s[2]\n"
+      "fmla z21.s, z7.s, z3.s[2]\n"
+      "fmla z25.s, z7.s, z4.s[2]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.s, z6.s, z0.s[2]\n"
+      "fmla z14.s, z6.s, z1.s[2]\n"
+      "fmla z18.s, z6.s, z2.s[2]\n"
+      "fmla z22.s, z6.s, z3.s[2]\n"
+      "fmla z26.s, z6.s, z4.s[2]\n"
+      "fmla z11.s, z7.s, z0.s[2]\n"
+      "fmla z15.s, z7.s, z1.s[2]\n"
+      "fmla z19.s, z7.s, z2.s[2]\n"
+      "fmla z23.s, z7.s, z3.s[2]\n"
+      "fmla z27.s, z7.s, z4.s[2]\n"
+      "ble 68f\n"
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "fmla z8.s, z6.s, z0.s[3]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "fmla z12.s, z6.s, z1.s[3]\n"
+      "fmla z16.s, z6.s, z2.s[3]\n"
+      "fmla z20.s, z6.s, z3.s[3]\n"
+      "fmla z24.s, z6.s, z4.s[3]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[3]\n"
+      "fmla z13.s, z7.s, z1.s[3]\n"
+      "fmla z17.s, z7.s, z2.s[3]\n"
+      "fmla z21.s, z7.s, z3.s[3]\n"
+      "fmla z25.s, z7.s, z4.s[3]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.s, z6.s, z0.s[3]\n"
+      "fmla z14.s, z6.s, z1.s[3]\n"
+      "fmla z18.s, z6.s, z2.s[3]\n"
+      "fmla z22.s, z6.s, z3.s[3]\n"
+      "fmla z26.s, z6.s, z4.s[3]\n"
+      "fmla z11.s, z7.s, z0.s[3]\n"
+      "fmla z15.s, z7.s, z1.s[3]\n"
+      "fmla z19.s, z7.s, z2.s[3]\n"
+      "fmla z23.s, z7.s, z3.s[3]\n"
+      "fmla z27.s, z7.s, z4.s[3]\n"
+      "68:"  // Height 5: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 63b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "tbz %x[flags], #1, 69f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z1.s }, p5/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rw { z0.s }, p5/Z, [x19]\n"
+      "fmin z8.s, p5/M, z8.s, z0.s\n"
+      "fmin z9.s, p5/M, z9.s, z0.s\n"
+      "fmin z10.s, p5/M, z10.s, z0.s\n"
+      "fmin z11.s, p5/M, z11.s, z0.s\n"
+      "fmin z12.s, p5/M, z12.s, z0.s\n"
+      "fmax z8.s, p5/M, z8.s, z1.s\n"
+      "fmax z9.s, p5/M, z9.s, z1.s\n"
+      "fmax z10.s, p5/M, z10.s, z1.s\n"
+      "fmax z11.s, p5/M, z11.s, z1.s\n"
+      "fmax z12.s, p5/M, z12.s, z1.s\n"
+      "fmin z13.s, p5/M, z13.s, z0.s\n"
+      "fmin z14.s, p5/M, z14.s, z0.s\n"
+      "fmin z15.s, p5/M, z15.s, z0.s\n"
+      "fmin z16.s, p5/M, z16.s, z0.s\n"
+      "fmax z13.s, p5/M, z13.s, z1.s\n"
+      "fmax z14.s, p5/M, z14.s, z1.s\n"
+      "fmax z15.s, p5/M, z15.s, z1.s\n"
+      "fmax z16.s, p5/M, z16.s, z1.s\n"
+      "fmin z17.s, p5/M, z17.s, z0.s\n"
+      "fmin z18.s, p5/M, z18.s, z0.s\n"
+      "fmin z19.s, p5/M, z19.s, z0.s\n"
+      "fmin z20.s, p5/M, z20.s, z0.s\n"
+      "fmax z17.s, p5/M, z17.s, z1.s\n"
+      "fmax z18.s, p5/M, z18.s, z1.s\n"
+      "fmax z19.s, p5/M, z19.s, z1.s\n"
+      "fmax z20.s, p5/M, z20.s, z1.s\n"
+      "fmin z21.s, p5/M, z21.s, z0.s\n"
+      "fmin z22.s, p5/M, z22.s, z0.s\n"
+      "fmin z23.s, p5/M, z23.s, z0.s\n"
+      "fmin z24.s, p5/M, z24.s, z0.s\n"
+      "fmax z21.s, p5/M, z21.s, z1.s\n"
+      "fmax z22.s, p5/M, z22.s, z1.s\n"
+      "fmax z23.s, p5/M, z23.s, z1.s\n"
+      "fmax z24.s, p5/M, z24.s, z1.s\n"
+      "fmin z25.s, p5/M, z25.s, z0.s\n"
+      "fmin z26.s, p5/M, z26.s, z0.s\n"
+      "fmin z27.s, p5/M, z27.s, z0.s\n"
+      "fmax z25.s, p5/M, z25.s, z1.s\n"
+      "fmax z26.s, p5/M, z26.s, z1.s\n"
+      "fmax z27.s, p5/M, z27.s, z1.s\n"
+      "69:"  // Height 5: No activation
+      "st1w { z8.s }, p4, [x13]\n"
+      "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+      "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+      "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "st1w { z12.s }, p4, [x9]\n"
+      "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+      "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+      "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+      "addvl x9, x9, #4\n"
+      "st1w { z16.s }, p4, [x27]\n"
+      "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+      "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+      "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+      "addvl x27, x27, #4\n"
+      "st1w { z20.s }, p4, [x25]\n"
+      "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+      "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+      "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+      "addvl x25, x25, #4\n"
+      "st1w { z24.s }, p4, [x23]\n"
+      "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
+      "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
+      "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+      "addvl x23, x23, #4\n"
+      "70:"  // Height 5: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x16, x16, x19\n"
+      "bgt 59b\n"
+      "b 86f\n"
+      "71:"  // Height 6
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 72f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "ldr x23, [%x[output_ptr], #0x20]\n"
+      "add x27, x27, x19, LSL #2\n"
+      "ldr x21, [%x[output_ptr], #0x28]\n"
+      "add %x[output_ptr], %x[output_ptr], #0x30\n"
+      "add x25, x25, x19, LSL #2\n"
+      "add x23, x23, x19, LSL #2\n"
+      "add x21, x21, x19, LSL #2\n"
+      "b 73f\n"
+      "72:"  // Height 6: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "add x23, x25, x19, LSL #2\n"
+      "add x21, x23, x19, LSL #2\n"
+      "add %x[output_ptr], x21, x19, LSL #2\n"
+      "73:"  // Height 6: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p3.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p2.s, x19, x16\n"
+      "incw x19\n"
+      "whilelt p1.s, x19, x16\n"
+      "cbz x14, 74f\n"
+      "ld1w { z8.s }, p5/Z, [x14]\n"
+      "mov z12.d, z8.d\n"
+      "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+      "mov z16.d, z8.d\n"
+      "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+      "mov z20.d, z8.d\n"
+      "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "mov z13.d, z9.d\n"
+      "mov z17.d, z9.d\n"
+      "mov z14.d, z10.d\n"
+      "mov z15.d, z11.d\n"
+      "mov z18.d, z10.d\n"
+      "mov z19.d, z11.d\n"
+      "mov z21.d, z9.d\n"
+      "mov z22.d, z10.d\n"
+      "mov z23.d, z11.d\n"
+      "mov z24.d, z8.d\n"
+      "mov z25.d, z9.d\n"
+      "mov z26.d, z10.d\n"
+      "mov z27.d, z11.d\n"
+      "mov z28.d, z8.d\n"
+      "mov z29.d, z9.d\n"
+      "mov z30.d, z10.d\n"
+      "mov z31.d, z11.d\n"
+      "b 76f\n"
+      "74:"  // Height 6: no bias
+      "tbz %x[flags], #0, 75f\n"
+      "ld1w { z8.s }, p4/Z, [x13]\n"
+      "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x9]\n"
+      "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x27]\n"
+      "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x25]\n"
+      "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+      "ld1w { z24.s }, p4/Z, [x23]\n"
+      "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z28.s }, p4/Z, [x21]\n"
+      "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "b 76f\n"
+      "75:"  // Height 6: no accumulate
+      "mov z8.b, #0x0\n"
+      "mov z9.b, #0x0\n"
+      "mov z10.b, #0x0\n"
+      "mov z11.b, #0x0\n"
+      "mov z12.b, #0x0\n"
+      "mov z13.b, #0x0\n"
+      "mov z14.b, #0x0\n"
+      "mov z15.b, #0x0\n"
+      "mov z16.b, #0x0\n"
+      "mov z17.b, #0x0\n"
+      "mov z18.b, #0x0\n"
+      "mov z19.b, #0x0\n"
+      "mov z20.b, #0x0\n"
+      "mov z21.b, #0x0\n"
+      "mov z22.b, #0x0\n"
+      "mov z23.b, #0x0\n"
+      "mov z24.b, #0x0\n"
+      "mov z25.b, #0x0\n"
+      "mov z26.b, #0x0\n"
+      "mov z27.b, #0x0\n"
+      "mov z28.b, #0x0\n"
+      "mov z29.b, #0x0\n"
+      "mov z30.b, #0x0\n"
+      "mov z31.b, #0x0\n"
+      "76:"  // Height 6: setup done
+      "mov x12, #0x0\n"
+      "77:"  // Height 6: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 78f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x20, [x20, #0x28]\n"
+      "cbnz x12, 79f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19, LSL #2\n"
+      "add x28, x28, x19, LSL #2\n"
+      "add x26, x26, x19, LSL #2\n"
+      "add x24, x24, x19, LSL #2\n"
+      "add x22, x22, x19, LSL #2\n"
+      "add x20, x20, x19, LSL #2\n"
+      "b 79f\n"
+      "78:"  // Height 6: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19, LSL #2\n"
+      "add x26, x28, x19, LSL #2\n"
+      "add x24, x26, x19, LSL #2\n"
+      "add x22, x24, x19, LSL #2\n"
+      "add x20, x22, x19, LSL #2\n"
+      "79:"  // Height 6: input setup done
+      "cmp x11, #0x4\n"
+      "ble 81f\n"
+      "80:"  // Height 6: Multiply loop: Main loop head
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "whilelt p0.s, XZR, x11\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "sub x11, x11, #0x4\n"
+      "ld1rqw { z0.s }, p0/Z, [x10]\n"
+      "fmla z8.s, z6.s, z0.s[0]\n"
+      "ld1rqw { z1.s }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z9.s, z7.s, z0.s[0]\n"
+      "ld1rqw { z2.s }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z12.s, z6.s, z1.s[0]\n"
+      "ld1rqw { z3.s }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "fmla z16.s, z6.s, z2.s[0]\n"
+      "ld1rqw { z4.s }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "fmla z13.s, z7.s, z1.s[0]\n"
+      "ld1rqw { z5.s }, p0/Z, [x20]\n"
+      "add x22, x22, #0x10\n"
+      "fmla z20.s, z6.s, z3.s[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x20, x20, #0x10\n"
+      "fmla z24.s, z6.s, z4.s[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "cmp x11, #0x4\n"
+      "fmla z28.s, z6.s, z5.s[0]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z17.s, z7.s, z2.s[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "fmla z21.s, z7.s, z3.s[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "fmla z25.s, z7.s, z4.s[0]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "fmla z29.s, z7.s, z5.s[0]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "fmla z10.s, z6.s, z0.s[0]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "fmla z14.s, z6.s, z1.s[0]\n"
+      "fmla z18.s, z6.s, z2.s[0]\n"
+      "fmla z22.s, z6.s, z3.s[0]\n"
+      "fmla z26.s, z6.s, z4.s[0]\n"
+      "fmla z30.s, z6.s, z5.s[0]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n"
+      "fmla z11.s, z7.s, z0.s[0]\n"
+      "fmla z15.s, z7.s, z1.s[0]\n"
+      "fmla z19.s, z7.s, z2.s[0]\n"
+      "fmla z23.s, z7.s, z3.s[0]\n"
+      "fmla z27.s, z7.s, z4.s[0]\n"
+      "fmla z31.s, z7.s, z5.s[0]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n"
+      "fmla z8.s, z6.s, z0.s[1]\n"
+      "fmla z12.s, z6.s, z1.s[1]\n"
+      "fmla z16.s, z6.s, z2.s[1]\n"
+      "fmla z20.s, z6.s, z3.s[1]\n"
+      "fmla z24.s, z6.s, z4.s[1]\n"
+      "fmla z28.s, z6.s, z5.s[1]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[1]\n"
+      "fmla z13.s, z7.s, z1.s[1]\n"
+      "fmla z17.s, z7.s, z2.s[1]\n"
+      "fmla z21.s, z7.s, z3.s[1]\n"
+      "fmla z25.s, z7.s, z4.s[1]\n"
+      "fmla z29.s, z7.s, z5.s[1]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n"
+      "addvl x15, x15, #16\n"
+      "fmla z10.s, z6.s, z0.s[1]\n"
+      "fmla z14.s, z6.s, z1.s[1]\n"
+      "fmla z18.s, z6.s, z2.s[1]\n"
+      "fmla z22.s, z6.s, z3.s[1]\n"
+      "fmla z26.s, z6.s, z4.s[1]\n"
+      "fmla z30.s, z6.s, z5.s[1]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n"
+      "fmla z11.s, z7.s, z0.s[1]\n"
+      "fmla z15.s, z7.s, z1.s[1]\n"
+      "fmla z19.s, z7.s, z2.s[1]\n"
+      "fmla z23.s, z7.s, z3.s[1]\n"
+      "fmla z27.s, z7.s, z4.s[1]\n"
+      "fmla z31.s, z7.s, z5.s[1]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n"
+      "fmla z8.s, z6.s, z0.s[2]\n"
+      "fmla z12.s, z6.s, z1.s[2]\n"
+      "fmla z16.s, z6.s, z2.s[2]\n"
+      "fmla z20.s, z6.s, z3.s[2]\n"
+      "fmla z24.s, z6.s, z4.s[2]\n"
+      "fmla z28.s, z6.s, z5.s[2]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[2]\n"
+      "fmla z13.s, z7.s, z1.s[2]\n"
+      "fmla z17.s, z7.s, z2.s[2]\n"
+      "fmla z21.s, z7.s, z3.s[2]\n"
+      "fmla z25.s, z7.s, z4.s[2]\n"
+      "fmla z29.s, z7.s, z5.s[2]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n"
+      "fmla z10.s, z6.s, z0.s[2]\n"
+      "fmla z14.s, z6.s, z1.s[2]\n"
+      "fmla z18.s, z6.s, z2.s[2]\n"
+      "fmla z22.s, z6.s, z3.s[2]\n"
+      "fmla z26.s, z6.s, z4.s[2]\n"
+      "fmla z30.s, z6.s, z5.s[2]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n"
+      "fmla z11.s, z7.s, z0.s[2]\n"
+      "fmla z15.s, z7.s, z1.s[2]\n"
+      "fmla z19.s, z7.s, z2.s[2]\n"
+      "fmla z23.s, z7.s, z3.s[2]\n"
+      "fmla z27.s, z7.s, z4.s[2]\n"
+      "fmla z31.s, z7.s, z5.s[2]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n"
+      "fmla z8.s, z6.s, z0.s[3]\n"
+      "fmla z12.s, z6.s, z1.s[3]\n"
+      "fmla z16.s, z6.s, z2.s[3]\n"
+      "fmla z20.s, z6.s, z3.s[3]\n"
+      "fmla z24.s, z6.s, z4.s[3]\n"
+      "fmla z28.s, z6.s, z5.s[3]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[3]\n"
+      "fmla z13.s, z7.s, z1.s[3]\n"
+      "fmla z17.s, z7.s, z2.s[3]\n"
+      "fmla z21.s, z7.s, z3.s[3]\n"
+      "fmla z25.s, z7.s, z4.s[3]\n"
+      "fmla z29.s, z7.s, z5.s[3]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n"
+      "fmla z10.s, z6.s, z0.s[3]\n"
+      "fmla z14.s, z6.s, z1.s[3]\n"
+      "fmla z18.s, z6.s, z2.s[3]\n"
+      "fmla z22.s, z6.s, z3.s[3]\n"
+      "fmla z26.s, z6.s, z4.s[3]\n"
+      "fmla z30.s, z6.s, z5.s[3]\n"
+      "fmla z11.s, z7.s, z0.s[3]\n"
+      "fmla z15.s, z7.s, z1.s[3]\n"
+      "fmla z19.s, z7.s, z2.s[3]\n"
+      "fmla z23.s, z7.s, z3.s[3]\n"
+      "fmla z27.s, z7.s, z4.s[3]\n"
+      "fmla z31.s, z7.s, z5.s[3]\n"
+      "bgt 80b\n"
+      "81:"  // Height 6: Multiply loop: Single iteration only
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "whilelt p0.s, XZR, x11\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "ld1rqw { z0.s }, p0/Z, [x10]\n"
+      "fmla z8.s, z6.s, z0.s[0]\n"
+      "ld1rqw { z1.s }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z9.s, z7.s, z0.s[0]\n"
+      "ld1rqw { z2.s }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z12.s, z6.s, z1.s[0]\n"
+      "ld1rqw { z3.s }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "fmla z16.s, z6.s, z2.s[0]\n"
+      "ld1rqw { z4.s }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "fmla z13.s, z7.s, z1.s[0]\n"
+      "ld1rqw { z5.s }, p0/Z, [x20]\n"
+      "add x22, x22, #0x10\n"
+      "fmla z20.s, z6.s, z3.s[0]\n"
+      "add x20, x20, #0x10\n"
+      "fmla z17.s, z7.s, z2.s[0]\n"
+      "fmla z24.s, z6.s, z4.s[0]\n"
+      "fmla z28.s, z6.s, z5.s[0]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z21.s, z7.s, z3.s[0]\n"
+      "fmla z25.s, z7.s, z4.s[0]\n"
+      "fmla z29.s, z7.s, z5.s[0]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.s, z6.s, z0.s[0]\n"
+      "fmla z14.s, z6.s, z1.s[0]\n"
+      "fmla z18.s, z6.s, z2.s[0]\n"
+      "fmla z22.s, z6.s, z3.s[0]\n"
+      "fmla z26.s, z6.s, z4.s[0]\n"
+      "fmla z30.s, z6.s, z5.s[0]\n"
+      "fmla z11.s, z7.s, z0.s[0]\n"
+      "fmla z15.s, z7.s, z1.s[0]\n"
+      "fmla z19.s, z7.s, z2.s[0]\n"
+      "fmla z23.s, z7.s, z3.s[0]\n"
+      "fmla z27.s, z7.s, z4.s[0]\n"
+      "fmla z31.s, z7.s, z5.s[0]\n"
+      "ble 82f\n"
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "fmla z8.s, z6.s, z0.s[1]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.s, z6.s, z1.s[1]\n"
+      "fmla z16.s, z6.s, z2.s[1]\n"
+      "fmla z20.s, z6.s, z3.s[1]\n"
+      "fmla z24.s, z6.s, z4.s[1]\n"
+      "fmla z28.s, z6.s, z5.s[1]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[1]\n"
+      "fmla z13.s, z7.s, z1.s[1]\n"
+      "fmla z17.s, z7.s, z2.s[1]\n"
+      "fmla z21.s, z7.s, z3.s[1]\n"
+      "fmla z25.s, z7.s, z4.s[1]\n"
+      "fmla z29.s, z7.s, z5.s[1]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.s, z6.s, z0.s[1]\n"
+      "fmla z14.s, z6.s, z1.s[1]\n"
+      "fmla z18.s, z6.s, z2.s[1]\n"
+      "fmla z22.s, z6.s, z3.s[1]\n"
+      "fmla z26.s, z6.s, z4.s[1]\n"
+      "fmla z30.s, z6.s, z5.s[1]\n"
+      "fmla z11.s, z7.s, z0.s[1]\n"
+      "fmla z15.s, z7.s, z1.s[1]\n"
+      "fmla z19.s, z7.s, z2.s[1]\n"
+      "fmla z23.s, z7.s, z3.s[1]\n"
+      "fmla z27.s, z7.s, z4.s[1]\n"
+      "fmla z31.s, z7.s, z5.s[1]\n"
+      "ble 82f\n"
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "fmla z8.s, z6.s, z0.s[2]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "subs x11, x11, #0x1\n"
+      "fmla z12.s, z6.s, z1.s[2]\n"
+      "fmla z16.s, z6.s, z2.s[2]\n"
+      "fmla z20.s, z6.s, z3.s[2]\n"
+      "fmla z24.s, z6.s, z4.s[2]\n"
+      "fmla z28.s, z6.s, z5.s[2]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[2]\n"
+      "fmla z13.s, z7.s, z1.s[2]\n"
+      "fmla z17.s, z7.s, z2.s[2]\n"
+      "fmla z21.s, z7.s, z3.s[2]\n"
+      "fmla z25.s, z7.s, z4.s[2]\n"
+      "fmla z29.s, z7.s, z5.s[2]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.s, z6.s, z0.s[2]\n"
+      "fmla z14.s, z6.s, z1.s[2]\n"
+      "fmla z18.s, z6.s, z2.s[2]\n"
+      "fmla z22.s, z6.s, z3.s[2]\n"
+      "fmla z26.s, z6.s, z4.s[2]\n"
+      "fmla z30.s, z6.s, z5.s[2]\n"
+      "fmla z11.s, z7.s, z0.s[2]\n"
+      "fmla z15.s, z7.s, z1.s[2]\n"
+      "fmla z19.s, z7.s, z2.s[2]\n"
+      "fmla z23.s, z7.s, z3.s[2]\n"
+      "fmla z27.s, z7.s, z4.s[2]\n"
+      "fmla z31.s, z7.s, z5.s[2]\n"
+      "ble 82f\n"
+      "ld1w { z6.s }, p5/Z, [x15]\n"
+      "fmla z8.s, z6.s, z0.s[3]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+      "fmla z12.s, z6.s, z1.s[3]\n"
+      "fmla z16.s, z6.s, z2.s[3]\n"
+      "fmla z20.s, z6.s, z3.s[3]\n"
+      "fmla z24.s, z6.s, z4.s[3]\n"
+      "fmla z28.s, z6.s, z5.s[3]\n"
+      "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+      "fmla z9.s, z7.s, z0.s[3]\n"
+      "fmla z13.s, z7.s, z1.s[3]\n"
+      "fmla z17.s, z7.s, z2.s[3]\n"
+      "fmla z21.s, z7.s, z3.s[3]\n"
+      "fmla z25.s, z7.s, z4.s[3]\n"
+      "fmla z29.s, z7.s, z5.s[3]\n"
+      "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+      "addvl x15, x15, #4\n"
+      "fmla z10.s, z6.s, z0.s[3]\n"
+      "fmla z14.s, z6.s, z1.s[3]\n"
+      "fmla z18.s, z6.s, z2.s[3]\n"
+      "fmla z22.s, z6.s, z3.s[3]\n"
+      "fmla z26.s, z6.s, z4.s[3]\n"
+      "fmla z30.s, z6.s, z5.s[3]\n"
+      "fmla z11.s, z7.s, z0.s[3]\n"
+      "fmla z15.s, z7.s, z1.s[3]\n"
+      "fmla z19.s, z7.s, z2.s[3]\n"
+      "fmla z23.s, z7.s, z3.s[3]\n"
+      "fmla z27.s, z7.s, z4.s[3]\n"
+      "fmla z31.s, z7.s, z5.s[3]\n"
+      "82:"  // Height 6: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 77b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "prfm pstl1keep, [x21, #0x0]\n"
+      "tbz %x[flags], #1, 83f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z1.s }, p5/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rw { z0.s }, p5/Z, [x19]\n"
+      "fmin z8.s, p5/M, z8.s, z0.s\n"
+      "fmin z9.s, p5/M, z9.s, z0.s\n"
+      "fmin z10.s, p5/M, z10.s, z0.s\n"
+      "fmin z11.s, p5/M, z11.s, z0.s\n"
+      "fmin z12.s, p5/M, z12.s, z0.s\n"
+      "fmax z8.s, p5/M, z8.s, z1.s\n"
+      "fmax z9.s, p5/M, z9.s, z1.s\n"
+      "fmax z10.s, p5/M, z10.s, z1.s\n"
+      "fmax z11.s, p5/M, z11.s, z1.s\n"
+      "fmax z12.s, p5/M, z12.s, z1.s\n"
+      "fmin z13.s, p5/M, z13.s, z0.s\n"
+      "fmin z14.s, p5/M, z14.s, z0.s\n"
+      "fmin z15.s, p5/M, z15.s, z0.s\n"
+      "fmin z16.s, p5/M, z16.s, z0.s\n"
+      "fmax z13.s, p5/M, z13.s, z1.s\n"
+      "fmax z14.s, p5/M, z14.s, z1.s\n"
+      "fmax z15.s, p5/M, z15.s, z1.s\n"
+      "fmax z16.s, p5/M, z16.s, z1.s\n"
+      "fmin z17.s, p5/M, z17.s, z0.s\n"
+      "fmin z18.s, p5/M, z18.s, z0.s\n"
+      "fmin z19.s, p5/M, z19.s, z0.s\n"
+      "fmin z20.s, p5/M, z20.s, z0.s\n"
+      "fmax z17.s, p5/M, z17.s, z1.s\n"
+      "fmax z18.s, p5/M, z18.s, z1.s\n"
+      "fmax z19.s, p5/M, z19.s, z1.s\n"
+      "fmax z20.s, p5/M, z20.s, z1.s\n"
+      "fmin z21.s, p5/M, z21.s, z0.s\n"
+      "fmin z22.s, p5/M, z22.s, z0.s\n"
+      "fmin z23.s, p5/M, z23.s, z0.s\n"
+      "fmin z24.s, p5/M, z24.s, z0.s\n"
+      "fmax z21.s, p5/M, z21.s, z1.s\n"
+      "fmax z22.s, p5/M, z22.s, z1.s\n"
+      "fmax z23.s, p5/M, z23.s, z1.s\n"
+      "fmax z24.s, p5/M, z24.s, z1.s\n"
+      "fmin z25.s, p5/M, z25.s, z0.s\n"
+      "fmin z26.s, p5/M, z26.s, z0.s\n"
+      "fmin z27.s, p5/M, z27.s, z0.s\n"
+      "fmin z28.s, p5/M, z28.s, z0.s\n"
+      "fmax z25.s, p5/M, z25.s, z1.s\n"
+      "fmax z26.s, p5/M, z26.s, z1.s\n"
+      "fmax z27.s, p5/M, z27.s, z1.s\n"
+      "fmax z28.s, p5/M, z28.s, z1.s\n"
+      "fmin z29.s, p5/M, z29.s, z0.s\n"
+      "fmin z30.s, p5/M, z30.s, z0.s\n"
+      "fmin z31.s, p5/M, z31.s, z0.s\n"
+      "fmax z29.s, p5/M, z29.s, z1.s\n"
+      "fmax z30.s, p5/M, z30.s, z1.s\n"
+      "fmax z31.s, p5/M, z31.s, z1.s\n"
+      "83:"  // Height 6: No activation
+      "st1w { z8.s }, p4, [x13]\n"
+      "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+      "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+      "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "st1w { z12.s }, p4, [x9]\n"
+      "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+      "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+      "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+      "addvl x9, x9, #4\n"
+      "st1w { z16.s }, p4, [x27]\n"
+      "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+      "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+      "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+      "addvl x27, x27, #4\n"
+      "st1w { z20.s }, p4, [x25]\n"
+      "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+      "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+      "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+      "addvl x25, x25, #4\n"
+      "st1w { z24.s }, p4, [x23]\n"
+      "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
+      "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
+      "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+      "addvl x23, x23, #4\n"
+      "st1w { z28.s }, p4, [x21]\n"
+      "st1w { z29.s }, p3, [x21, #1, MUL VL]\n"
+      "st1w { z30.s }, p2, [x21, #2, MUL VL]\n"
+      "st1w { z31.s }, p1, [x21, #3, MUL VL]\n"
+      "addvl x21, x21, #4\n"
+      "84:"  // Height 6: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x16, x16, x19\n"
+      "bgt 73b\n"
+      "subs %x[M], %x[M], #0x6\n"
+      "beq 86f\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "tbz %x[flags], #3, 85f\n"
+      "add x20, x20, #0x6\n"
+      "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "b 1b\n"
+      "85:"  // Update direct input
+      "mov x19, #0x18\n"
+      "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+      "b 1b\n"
+      "86:"  // Exit
+
+      : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+      : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+      : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+    );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp
index fd416ed2f4..20d9922e93 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,37 +10,43 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
  */
 #pragma once
-
 #ifdef __ARM_FEATURE_SVE
 
-
 #include "../std_transforms_sve.hpp"
 
+#define ARGLIST  \
+   unsigned int, const unsigned int *, \
+   IndirectInputArg<float>, \
+   size_t, size_t, \
+   const float *, \
+   IndirectOutputArg<float>, \
+   const float *, Activation, bool
+
 namespace arm_gemm
 {
 
 // Actual kernel implementations
-void sve_hybrid_fp32_mmla_4VLx4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+void sve_hybrid_fp32_mla_8x1VL( ARGLIST );
 
-class hybrid_fp32_mmla_4VLx4
+class cls_sve_hybrid_fp32_mla_8x1VL
 {
 public:
     typedef float operand_type;
     typedef float result_type;
 
-    typedef void (*kern_type)(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+    typedef void (*kern_type)( ARGLIST );
 
     /* Kernel blocking parameters */
     static constexpr unsigned int out_height()
@@ -50,12 +56,12 @@ public:
 
     static unsigned int out_width()
     {
-        return get_vector_length<float>() * 2;
+        return get_vector_length<float>() * 1;
     }
 
     static constexpr unsigned int k_unroll()
     {
-        return 2;
+        return 1;
     }
 
     static constexpr bool supports_accumulate()
@@ -63,27 +69,17 @@ public:
         return true;
     }
 
-    static constexpr bool supports_bias()
-    {
-        return true;
-    }
-
-    static constexpr bool supports_activation()
-    {
-        return true;
-    }
-
-    StdTransformsSVE<operand_type, result_type, 4, 4, 2> transforms = {};
+    StdTransformsSVE<operand_type, result_type, 8, 1, 1> transforms = {};
 
     // Default to the generic kernel
-    kern_type kernel=sve_hybrid_fp32_mmla_4VLx4;
+    kern_type kernel=sve_hybrid_fp32_mla_8x1VL;
 
-    hybrid_fp32_mmla_4VLx4(const CPUInfo *)
+    cls_sve_hybrid_fp32_mla_8x1VL(const CPUInfo *)
     {
-
     }
 };
 
 } // namespace arm_gemm
 
+#undef ARGLIST
 #endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
new file mode 100644
index 0000000000..361e303c7a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
@@ -0,0 +1,1751 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_hybrid_fp32_mla_8x1VL (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+    size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg,
+    const float *bias, Activation act, bool accumulate
+)
+{
+    struct KernelArgs {
+        float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+        float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+        unsigned int num_strings = {};
+        const unsigned int *string_lengths = {};
+        size_t N = {};
+        const float *B_ptr = {};
+        size_t output_offset = {};
+        size_t input_initial_col = {};
+        size_t input_offset = {};
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    if (accumulate) {
+        flags |= 0x1;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    switch(act.type) {
+        default:
+        case Activation::Type::None:
+            break;
+        case Activation::Type::BoundedReLU:
+            ka.maxval = static_cast<float>(act.param1);
+            /* fall through */
+        case Activation::Type::ReLU:
+            ka.minval = 0;
+            flags |= 0x2;
+            break;
+    }
+    __asm__ __volatile__(
+      "ptrue p2.b\n"
+      "1:"  // Row loop
+      "cmp %x[M], #0x8\n"
+      "bge 99f\n"
+      "cmp %x[M], #0x6\n"
+      "bgt 85f\n"
+      "beq 71f\n"
+      "cmp %x[M], #0x4\n"
+      "bgt 57f\n"
+      "beq 43f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 29f\n"
+      "beq 15f\n"
+      "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x8, %x[bias]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 2f\n"
+      "ldr x17, [%x[output_ptr], #0x0]\n"
+      "add x17, x17, x19, LSL #2\n"
+      "b 3f\n"
+      "2:"  // Height 1: setup direct output
+      "mov x17, %x[output_ptr]\n"
+      "3:"  // Height 1: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p1.s, x19, x6\n"
+      "cbz x8, 4f\n"
+      "ld1w { z24.s }, p2/Z, [x8]\n"
+      "addvl x8, x8, #1\n"
+      "b 6f\n"
+      "4:"  // Height 1: no bias
+      "tbz %x[flags], #0, 5f\n"
+      "ld1w { z24.s }, p1/Z, [x17]\n"
+      "b 6f\n"
+      "5:"  // Height 1: no accumulate
+      "mov z24.b, #0x0\n"
+      "6:"  // Height 1: setup done
+      "mov x16, #0x0\n"
+      "7:"  // Height 1: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w15, [x20, x16, LSL #0x2]\n"
+      "tbz %x[flags], #3, 8f\n"
+      "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x14, [x20, #0x0]\n"
+      "cbnz x16, 9f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x14, x14, x19, LSL #2\n"
+      "b 9f\n"
+      "8:"  // Height 1: setup direct input
+      "mov x14, %x[input_ptr]\n"
+      "9:"  // Height 1: input setup done
+      "cmp x15, #0x4\n"
+      "ble 11f\n"
+      "10:"  // Height 1: Multiply loop: Main loop head
+      "ld1w { z8.s }, p2/Z, [x7]\n"
+      "whilelt p0.s, XZR, x15\n"
+      "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
+      "sub x15, x15, #0x4\n"
+      "ld1rqw { z0.s }, p0/Z, [x14]\n"
+      "fmla z24.s, z8.s, z0.s[0]\n"
+      "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
+      "add x14, x14, #0x10\n"
+      "fmla z24.s, z9.s, z0.s[1]\n"
+      "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
+      "cmp x15, #0x4\n"
+      "fmla z24.s, z10.s, z0.s[2]\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "addvl x7, x7, #4\n"
+      "fmla z24.s, z11.s, z0.s[3]\n"
+      "bgt 10b\n"
+      "11:"  // Height 1: Multiply loop: Single iteration only
+      "ld1w { z12.s }, p2/Z, [x7]\n"
+      "whilelt p0.s, XZR, x15\n"
+      "subs x15, x15, #0x1\n"
+      "ld1rqw { z0.s }, p0/Z, [x14]\n"
+      "fmla z24.s, z12.s, z0.s[0]\n"
+      "add x14, x14, #0x10\n"
+      "addvl x7, x7, #1\n"
+      "ble 12f\n"
+      "ld1w { z13.s }, p2/Z, [x7]\n"
+      "fmla z24.s, z13.s, z0.s[1]\n"
+      "subs x15, x15, #0x1\n"
+      "addvl x7, x7, #1\n"
+      "ble 12f\n"
+      "ld1w { z14.s }, p2/Z, [x7]\n"
+      "fmla z24.s, z14.s, z0.s[2]\n"
+      "subs x15, x15, #0x1\n"
+      "addvl x7, x7, #1\n"
+      "ble 12f\n"
+      "ld1w { z15.s }, p2/Z, [x7]\n"
+      "fmla z24.s, z15.s, z0.s[3]\n"
+      "addvl x7, x7, #1\n"
+      "12:"  // Height 1: Multiply loop: multiply skip
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "add x16, x16, #0x1\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x16, x19\n"
+      "bne 7b\n"
+      "prfm pstl1keep, [x17, #0x0]\n"
+      "tbz %x[flags], #1, 13f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z17.s }, p2/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rw { z16.s }, p2/Z, [x19]\n"
+      "fmin z24.s, p2/M, z24.s, z16.s\n"
+      "fmax z24.s, p2/M, z24.s, z17.s\n"
+      "13:"  // Height 1: No activation
+      "st1w { z24.s }, p1, [x17]\n"
+      "addvl x17, x17, #1\n"
+      "14:"  // Height 1: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19\n"
+      "subs x6, x6, x19\n"
+      "bgt 3b\n"
+      "b 114f\n"
+      "15:"  // Height 2
+      "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x8, %x[bias]\n"
+      "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 16f\n"
+      "ldr x17, [%x[output_ptr], #0x0]\n"
+      "add x17, x17, x19, LSL #2\n"
+      "ldr x13, [%x[output_ptr], #0x8]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "b 17f\n"
+      "16:"  // Height 2: setup direct output
+      "mov x17, %x[output_ptr]\n"
+      "add x13, x17, x19, LSL #2\n"
+      "17:"  // Height 2: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p1.s, x19, x6\n"
+      "cbz x8, 18f\n"
+      "ld1w { z24.s }, p2/Z, [x8]\n"
+      "mov z25.d, z24.d\n"
+      "addvl x8, x8, #1\n"
+      "b 20f\n"
+      "18:"  // Height 2: no bias
+      "tbz %x[flags], #0, 19f\n"
+      "ld1w { z24.s }, p1/Z, [x17]\n"
+      "ld1w { z25.s }, p1/Z, [x13]\n"
+      "b 20f\n"
+      "19:"  // Height 2: no accumulate
+      "mov z24.b, #0x0\n"
+      "mov z25.b, #0x0\n"
+      "20:"  // Height 2: setup done
+      "mov x16, #0x0\n"
+      "21:"  // Height 2: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w15, [x20, x16, LSL #0x2]\n"
+      "tbz %x[flags], #3, 22f\n"
+      "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x14, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "cbnz x16, 23f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x14, x14, x19, LSL #2\n"
+      "add x12, x12, x19, LSL #2\n"
+      "b 23f\n"
+      "22:"  // Height 2: setup direct input
+      "mov x14, %x[input_ptr]\n"
+      "add x12, x14, x19, LSL #2\n"
+      "23:"  // Height 2: input setup done
+      "cmp x15, #0x4\n"
+      "ble 25f\n"
+      "24:"  // Height 2: Multiply loop: Main loop head
+      "ld1w { z8.s }, p2/Z, [x7]\n"
+      "whilelt p0.s, XZR, x15\n"
+      "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
+      "sub x15, x15, #0x4\n"
+      "ld1rqw { z0.s }, p0/Z, [x14]\n"
+      "fmla z24.s, z8.s, z0.s[0]\n"
+      "ld1rqw { z1.s }, p0/Z, [x12]\n"
+      "add x14, x14, #0x10\n"
+      "fmla z25.s, z8.s, z1.s[0]\n"
+      "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
+      "add x12, x12, #0x10\n"
+      "fmla z24.s, z9.s, z0.s[1]\n"
+      "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
+      "cmp x15, #0x4\n"
+      "fmla z25.s, z9.s, z1.s[1]\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "addvl x7, x7, #4\n"
+      "fmla z24.s, z10.s, z0.s[2]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "fmla z25.s, z10.s, z1.s[2]\n"
+      "fmla z24.s, z11.s, z0.s[3]\n"
+      "fmla z25.s, z11.s, z1.s[3]\n"
+      "bgt 24b\n"
+      "25:"  // Height 2: Multiply loop: Single iteration only
+      "ld1w { z12.s }, p2/Z, [x7]\n"
+      "whilelt p0.s, XZR, x15\n"
+      "subs x15, x15, #0x1\n"
+      "ld1rqw { z0.s }, p0/Z, [x14]\n"
+      "fmla z24.s, z12.s, z0.s[0]\n"
+      "ld1rqw { z1.s }, p0/Z, [x12]\n"
+      "add x14, x14, #0x10\n"
+      "fmla z25.s, z12.s, z1.s[0]\n"
+      "add x12, x12, #0x10\n"
+      "addvl x7, x7, #1\n"
+      "ble 26f\n"
+      "ld1w { z13.s }, p2/Z, [x7]\n"
+      "fmla z24.s, z13.s, z0.s[1]\n"
+      "subs x15, x15, #0x1\n"
+      "fmla z25.s, z13.s, z1.s[1]\n"
+      "addvl x7, x7, #1\n"
+      "ble 26f\n"
+      "ld1w { z14.s }, p2/Z, [x7]\n"
+      "fmla z24.s, z14.s, z0.s[2]\n"
+      "subs x15, x15, #0x1\n"
+      "fmla z25.s, z14.s, z1.s[2]\n"
+      "addvl x7, x7, #1\n"
+      "ble 26f\n"
+      "ld1w { z15.s }, p2/Z, [x7]\n"
+      "fmla z24.s, z15.s, z0.s[3]\n"
+      "addvl x7, x7, #1\n"
+      "fmla z25.s, z15.s, z1.s[3]\n"
+      "26:"  // Height 2: Multiply loop: multiply skip
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "add x16, x16, #0x1\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x16, x19\n"
+      "bne 21b\n"
+      "prfm pstl1keep, [x17, #0x0]\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "tbz %x[flags], #1, 27f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z17.s }, p2/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rw { z16.s }, p2/Z, [x19]\n"
+      "fmin z24.s, p2/M, z24.s, z16.s\n"
+      "fmin z25.s, p2/M, z25.s, z16.s\n"
+      "fmax z24.s, p2/M, z24.s, z17.s\n"
+      "fmax z25.s, p2/M, z25.s, z17.s\n"
+      "27:"  // Height 2: No activation
+      "st1w { z24.s }, p1, [x17]\n"
+      "addvl x17, x17, #1\n"
+      "st1w { z25.s }, p1, [x13]\n"
+      "addvl x13, x13, #1\n"
+      "28:"  // Height 2: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19\n"
+      "subs x6, x6, x19\n"
+      "bgt 17b\n"
+      "b 114f\n"
+      "29:"  // Height 3
+      "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x8, %x[bias]\n"
+      "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 30f\n"
+      "ldr x17, [%x[output_ptr], #0x0]\n"
+      "add x17, x17, x19, LSL #2\n"
+      "ldr x13, [%x[output_ptr], #0x8]\n"
+      "ldr x11, [%x[output_ptr], #0x10]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "add x11, x11, x19, LSL #2\n"
+      "b 31f\n"
+      "30:"  // Height 3: setup direct output
+      "mov x17, %x[output_ptr]\n"
+      "add x13, x17, x19, LSL #2\n"
+      "add x11, x13, x19, LSL #2\n"
+      "31:"  // Height 3: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p1.s, x19, x6\n"
+      "cbz x8, 32f\n"
+      "ld1w { z24.s }, p2/Z, [x8]\n"
+      "mov z25.d, z24.d\n"
+      "addvl x8, x8, #1\n"
+      "mov z26.d, z24.d\n"
+      "b 34f\n"
+      "32:"  // Height 3: no bias
+      "tbz %x[flags], #0, 33f\n"
+      "ld1w { z24.s }, p1/Z, [x17]\n"
+      "ld1w { z25.s }, p1/Z, [x13]\n"
+      "ld1w { z26.s }, p1/Z, [x11]\n"
+      "b 34f\n"
+      "33:"  // Height 3: no accumulate
+      "mov z24.b, #0x0\n"
+      "mov z25.b, #0x0\n"
+      "mov z26.b, #0x0\n"
+      "34:"  // Height 3: setup done
+      "mov x16, #0x0\n"
+      "35:"  // Height 3: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w15, [x20, x16, LSL #0x2]\n"
+      "tbz %x[flags], #3, 36f\n"
+      "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x14, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "ldr x10, [x20, #0x10]\n"
+      "cbnz x16, 37f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x14, x14, x19, LSL #2\n"
+      "add x12, x12, x19, LSL #2\n"
+      "add x10, x10, x19, LSL #2\n"
+      "b 37f\n"
+      "36:"  // Height 3: setup direct input
+      "mov x14, %x[input_ptr]\n"
+      "add x12, x14, x19, LSL #2\n"
+      "add x10, x12, x19, LSL #2\n"
+      "37:"  // Height 3: input setup done
+      "cmp x15, #0x4\n"
+      "ble 39f\n"
+      "38:"  // Height 3: Multiply loop: Main loop head
+      "ld1w { z8.s }, p2/Z, [x7]\n"
+      "whilelt p0.s, XZR, x15\n"
+      "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
+      "sub x15, x15, #0x4\n"
+      "ld1rqw { z0.s }, p0/Z, [x14]\n"
+      "fmla z24.s, z8.s, z0.s[0]\n"
+      "ld1rqw { z1.s }, p0/Z, [x12]\n"
+      "add x14, x14, #0x10\n"
+      "fmla z25.s, z8.s, z1.s[0]\n"
+      "ld1rqw { z2.s }, p0/Z, [x10]\n"
+      "add x12, x12, #0x10\n"
+      "fmla z24.s, z9.s, z0.s[1]\n"
+      "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z26.s, z8.s, z2.s[0]\n"
+      "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
+      "cmp x15, #0x4\n"
+      "fmla z25.s, z9.s, z1.s[1]\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "addvl x7, x7, #4\n"
+      "fmla z24.s, z10.s, z0.s[2]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla z26.s, z9.s, z2.s[1]\n"
+      "fmla z25.s, z10.s, z1.s[2]\n"
+      "fmla z24.s, z11.s, z0.s[3]\n"
+      "fmla z26.s, z10.s, z2.s[2]\n"
+      "fmla z25.s, z11.s, z1.s[3]\n"
+      "fmla z26.s, z11.s, z2.s[3]\n"
+      "bgt 38b\n"
+      "39:"  // Height 3: Multiply loop: Single iteration only
+      "ld1w { z12.s }, p2/Z, [x7]\n"
+      "whilelt p0.s, XZR, x15\n"
+      "subs x15, x15, #0x1\n"
+      "ld1rqw { z0.s }, p0/Z, [x14]\n"
+      "fmla z24.s, z12.s, z0.s[0]\n"
+      "ld1rqw { z1.s }, p0/Z, [x12]\n"
+      "add x14, x14, #0x10\n"
+      "fmla z25.s, z12.s, z1.s[0]\n"
+      "ld1rqw { z2.s }, p0/Z, [x10]\n"
+      "add x12, x12, #0x10\n"
+      "fmla z26.s, z12.s, z2.s[0]\n"
+      "add x10, x10, #0x10\n"
+      "addvl x7, x7, #1\n"
+      "ble 40f\n"
+      "ld1w { z13.s }, p2/Z, [x7]\n"
+      "fmla z24.s, z13.s, z0.s[1]\n"
+      "subs x15, x15, #0x1\n"
+      "fmla z25.s, z13.s, z1.s[1]\n"
+      "addvl x7, x7, #1\n"
+      "fmla z26.s, z13.s, z2.s[1]\n"
+      "ble 40f\n"
+      "ld1w { z14.s }, p2/Z, [x7]\n"
+      "fmla z24.s, z14.s, z0.s[2]\n"
+      "subs x15, x15, #0x1\n"
+      "fmla z25.s, z14.s, z1.s[2]\n"
+      "addvl x7, x7, #1\n"
+      "fmla z26.s, z14.s, z2.s[2]\n"
+      "ble 40f\n"
+      "ld1w { z15.s }, p2/Z, [x7]\n"
+      "fmla z24.s, z15.s, z0.s[3]\n"
+      "addvl x7, x7, #1\n"
+      "fmla z25.s, z15.s, z1.s[3]\n"
+      "fmla z26.s, z15.s, z2.s[3]\n"
+      "40:"  // Height 3: Multiply loop: multiply skip
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "add x16, x16, #0x1\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x16, x19\n"
+      "bne 35b\n"
+      "prfm pstl1keep, [x17, #0x0]\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x11, #0x0]\n"
+      "tbz %x[flags], #1, 41f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z17.s }, p2/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rw { z16.s }, p2/Z, [x19]\n"
+      "fmin z24.s, p2/M, z24.s, z16.s\n"
+      "fmin z25.s, p2/M, z25.s, z16.s\n"
+      "fmin z26.s, p2/M, z26.s, z16.s\n"
+      "fmax z24.s, p2/M, z24.s, z17.s\n"
+      "fmax z25.s, p2/M, z25.s, z17.s\n"
+      "fmax z26.s, p2/M, z26.s, z17.s\n"
+      "41:"  // Height 3: No activation
+      "st1w { z24.s }, p1, [x17]\n"
+      "addvl x17, x17, #1\n"
+      "st1w { z25.s }, p1, [x13]\n"
+      "addvl x13, x13, #1\n"
+      "st1w { z26.s }, p1, [x11]\n"
+      "addvl x11, x11, #1\n"
+      "42:"  // Height 3: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19\n"
+      "subs x6, x6, x19\n"
+      "bgt 31b\n"
+      "b 114f\n"
+      "43:"  // Height 4
+      "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x8, %x[bias]\n"
+      "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 44f\n"
+      "ldr x17, [%x[output_ptr], #0x0]\n"
+      "add x17, x17, x19, LSL #2\n"
+      "ldr x13, [%x[output_ptr], #0x8]\n"
+      "ldr x11, [%x[output_ptr], #0x10]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x18]\n"
+      "add x11, x11, x19, LSL #2\n"
+      "add x9, x9, x19, LSL #2\n"
+      "b 45f\n"
+      "44:"  // Height 4: setup direct output
+      "mov x17, %x[output_ptr]\n"
+      "add x13, x17, x19, LSL #2\n"
+      "add x11, x13, x19, LSL #2\n"
+      "add x9, x11, x19, LSL #2\n"
+      "45:"  // Height 4: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p1.s, x19, x6\n"
+      "cbz x8, 46f\n"
+      "ld1w { z24.s }, p2/Z, [x8]\n"
+      "mov z25.d, z24.d\n"
+      "addvl x8, x8, #1\n"
+      "mov z26.d, z24.d\n"
+      "mov z27.d, z24.d\n"
+      "b 48f\n"
+      "46:"  // Height 4: no bias
+      "tbz %x[flags], #0, 47f\n"
+      "ld1w { z24.s }, p1/Z, [x17]\n"
+      "ld1w { z25.s }, p1/Z, [x13]\n"
+      "ld1w { z26.s }, p1/Z, [x11]\n"
+      "ld1w { z27.s }, p1/Z, [x9]\n"
+      "b 48f\n"
+      "47:"  // Height 4: no accumulate
+      "mov z24.b, #0x0\n"
+      "mov z25.b, #0x0\n"
+      "mov z26.b, #0x0\n"
+      "mov z27.b, #0x0\n"
+      "48:"  // Height 4: setup done
+      "mov x16, #0x0\n"
+      "49:"  // Height 4: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w15, [x20, x16, LSL #0x2]\n"
+      "tbz %x[flags], #3, 50f\n"
+      "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x14, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "ldr x10, [x20, #0x10]\n"
+      "ldr x28, [x20, #0x18]\n"
+      "cbnz x16, 51f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x14, x14, x19, LSL #2\n"
+      "add x12, x12, x19, LSL #2\n"
+      "add x10, x10, x19, LSL #2\n"
+      "add x28, x28, x19, LSL #2\n"
+      "b 51f\n"
+      "50:"  // Height 4: setup direct input
+      "mov x14, %x[input_ptr]\n"
+      "add x12, x14, x19, LSL #2\n"
+      "add x10, x12, x19, LSL #2\n"
+      "add x28, x10, x19, LSL #2\n"
+      "51:"  // Height 4: input setup done
+      "cmp x15, #0x4\n"
+      "ble 53f\n"
+      "52:"  // Height 4: Multiply loop: Main loop head
+      "ld1w { z8.s }, p2/Z, [x7]\n"
+      "whilelt p0.s, XZR, x15\n"
+      "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
+      "sub x15, x15, #0x4\n"
+      "ld1rqw { z0.s }, p0/Z, [x14]\n"
+      "fmla z24.s, z8.s, z0.s[0]\n"
+      "ld1rqw { z1.s }, p0/Z, [x12]\n"
+      "add x14, x14, #0x10\n"
+      "fmla z25.s, z8.s, z1.s[0]\n"
+      "ld1rqw { z2.s }, p0/Z, [x10]\n"
+      "add x12, x12, #0x10\n"
+      "fmla z24.s, z9.s, z0.s[1]\n"
+      "ld1rqw { z3.s }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z26.s, z8.s, z2.s[0]\n"
+      "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z27.s, z8.s, z3.s[0]\n"
+      "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
+      "cmp x15, #0x4\n"
+      "fmla z25.s, z9.s, z1.s[1]\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "addvl x7, x7, #4\n"
+      "fmla z24.s, z10.s, z0.s[2]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla z26.s, z9.s, z2.s[1]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "fmla z27.s, z9.s, z3.s[1]\n"
+      "fmla z25.s, z10.s, z1.s[2]\n"
+      "fmla z24.s, z11.s, z0.s[3]\n"
+      "fmla z26.s, z10.s, z2.s[2]\n"
+      "fmla z27.s, z10.s, z3.s[2]\n"
+      "fmla z25.s, z11.s, z1.s[3]\n"
+      "fmla z26.s, z11.s, z2.s[3]\n"
+      "fmla z27.s, z11.s, z3.s[3]\n"
+      "bgt 52b\n"
+      "53:"  // Height 4: Multiply loop: Single iteration only
+      "ld1w { z12.s }, p2/Z, [x7]\n"
+      "whilelt p0.s, XZR, x15\n"
+      "subs x15, x15, #0x1\n"
+      "ld1rqw { z0.s }, p0/Z, [x14]\n"
+      "fmla z24.s, z12.s, z0.s[0]\n"
+      "ld1rqw { z1.s }, p0/Z, [x12]\n"
+      "add x14, x14, #0x10\n"
+      "fmla z25.s, z12.s, z1.s[0]\n"
+      "ld1rqw { z2.s }, p0/Z, [x10]\n"
+      "add x12, x12, #0x10\n"
+      "fmla z26.s, z12.s, z2.s[0]\n"
+      "ld1rqw { z3.s }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z27.s, z12.s, z3.s[0]\n"
+      "add x28, x28, #0x10\n"
+      "addvl x7, x7, #1\n"
+      "ble 54f\n"
+      "ld1w { z13.s }, p2/Z, [x7]\n"
+      "fmla z24.s, z13.s, z0.s[1]\n"
+      "subs x15, x15, #0x1\n"
+      "fmla z25.s, z13.s, z1.s[1]\n"
+      "addvl x7, x7, #1\n"
+      "fmla z26.s, z13.s, z2.s[1]\n"
+      "fmla z27.s, z13.s, z3.s[1]\n"
+      "ble 54f\n"
+      "ld1w { z14.s }, p2/Z, [x7]\n"
+      "fmla z24.s, z14.s, z0.s[2]\n"
+      "subs x15, x15, #0x1\n"
+      "fmla z25.s, z14.s, z1.s[2]\n"
+      "addvl x7, x7, #1\n"
+      "fmla z26.s, z14.s, z2.s[2]\n"
+      "fmla z27.s, z14.s, z3.s[2]\n"
+      "ble 54f\n"
+      "ld1w { z15.s }, p2/Z, [x7]\n"
+      "fmla z24.s, z15.s, z0.s[3]\n"
+      "addvl x7, x7, #1\n"
+      "fmla z25.s, z15.s, z1.s[3]\n"
+      "fmla z26.s, z15.s, z2.s[3]\n"
+      "fmla z27.s, z15.s, z3.s[3]\n"
+      "54:"  // Height 4: Multiply loop: multiply skip
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "add x16, x16, #0x1\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x16, x19\n"
+      "bne 49b\n"
+      "prfm pstl1keep, [x17, #0x0]\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x11, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "tbz %x[flags], #1, 55f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z17.s }, p2/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rw { z16.s }, p2/Z, [x19]\n"
+      "fmin z24.s, p2/M, z24.s, z16.s\n"
+      "fmin z25.s, p2/M, z25.s, z16.s\n"
+      "fmin z26.s, p2/M, z26.s, z16.s\n"
+      "fmin z27.s, p2/M, z27.s, z16.s\n"
+      "fmax z24.s, p2/M, z24.s, z17.s\n"
+      "fmax z25.s, p2/M, z25.s, z17.s\n"
+      "fmax z26.s, p2/M, z26.s, z17.s\n"
+      "fmax z27.s, p2/M, z27.s, z17.s\n"
+      "55:"  // Height 4: No activation
+      "st1w { z24.s }, p1, [x17]\n"
+      "addvl x17, x17, #1\n"
+      "st1w { z25.s }, p1, [x13]\n"
+      "addvl x13, x13, #1\n"
+      "st1w { z26.s }, p1, [x11]\n"
+      "addvl x11, x11, #1\n"
+      "st1w { z27.s }, p1, [x9]\n"
+      "addvl x9, x9, #1\n"
+      "56:"  // Height 4: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19\n"
+      "subs x6, x6, x19\n"
+      "bgt 45b\n"
+      "b 114f\n"
+      "57:"  // Height 5
+      "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x8, %x[bias]\n"
+      "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 58f\n"
+      "ldr x17, [%x[output_ptr], #0x0]\n"
+      "add x17, x17, x19, LSL #2\n"
+      "ldr x13, [%x[output_ptr], #0x8]\n"
+      "ldr x11, [%x[output_ptr], #0x10]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x18]\n"
+      "ldr x27, [%x[output_ptr], #0x20]\n"
+      "add x11, x11, x19, LSL #2\n"
+      "add x9, x9, x19, LSL #2\n"
+      "add x27, x27, x19, LSL #2\n"
+      "b 59f\n"
+      "58:"  // Height 5: setup direct output
+      "mov x17, %x[output_ptr]\n"
+      "add x13, x17, x19, LSL #2\n"
+      "add x11, x13, x19, LSL #2\n"
+      "add x9, x11, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "59:"  // Height 5: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p1.s, x19, x6\n"
+      "cbz x8, 60f\n"
+      "ld1w { z24.s }, p2/Z, [x8]\n"
+      "mov z25.d, z24.d\n"
+      "addvl x8, x8, #1\n"
+      "mov z26.d, z24.d\n"
+      "mov z27.d, z24.d\n"
+      "mov z28.d, z24.d\n"
+      "b 62f\n"
+      "60:"  // Height 5: no bias
+      "tbz %x[flags], #0, 61f\n"
+      "ld1w { z24.s }, p1/Z, [x17]\n"
+      "ld1w { z25.s }, p1/Z, [x13]\n"
+      "ld1w { z26.s }, p1/Z, [x11]\n"
+      "ld1w { z27.s }, p1/Z, [x9]\n"
+      "ld1w { z28.s }, p1/Z, [x27]\n"
+      "b 62f\n"
+      "61:"  // Height 5: no accumulate
+      "mov z24.b, #0x0\n"
+      "mov z25.b, #0x0\n"
+      "mov z26.b, #0x0\n"
+      "mov z27.b, #0x0\n"
+      "mov z28.b, #0x0\n"
+      "62:"  // Height 5: setup done
+      "mov x16, #0x0\n"
+      "63:"  // Height 5: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w15, [x20, x16, LSL #0x2]\n"
+      "tbz %x[flags], #3, 64f\n"
+      "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x14, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "ldr x10, [x20, #0x10]\n"
+      "ldr x28, [x20, #0x18]\n"
+      "ldr x26, [x20, #0x20]\n"
+      "cbnz x16, 65f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x14, x14, x19, LSL #2\n"
+      "add x12, x12, x19, LSL #2\n"
+      "add x10, x10, x19, LSL #2\n"
+      "add x28, x28, x19, LSL #2\n"
+      "add x26, x26, x19, LSL #2\n"
+      "b 65f\n"
+      "64:"  // Height 5: setup direct input
+      "mov x14, %x[input_ptr]\n"
+      "add x12, x14, x19, LSL #2\n"
+      "add x10, x12, x19, LSL #2\n"
+      "add x28, x10, x19, LSL #2\n"
+      "add x26, x28, x19, LSL #2\n"
+      "65:"  // Height 5: input setup done
+      "cmp x15, #0x4\n"
+      "ble 67f\n"
+      "66:"  // Height 5: Multiply loop: Main loop head
+      "ld1w { z8.s }, p2/Z, [x7]\n"
+      "whilelt p0.s, XZR, x15\n"
+      "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
+      "sub x15, x15, #0x4\n"
+      "ld1rqw { z0.s }, p0/Z, [x14]\n"
+      "fmla z24.s, z8.s, z0.s[0]\n"
+      "ld1rqw { z1.s }, p0/Z, [x12]\n"
+      "add x14, x14, #0x10\n"
+      "fmla z25.s, z8.s, z1.s[0]\n"
+      "ld1rqw { z2.s }, p0/Z, [x10]\n"
+      "add x12, x12, #0x10\n"
+      "fmla z24.s, z9.s, z0.s[1]\n"
+      "ld1rqw { z3.s }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z26.s, z8.s, z2.s[0]\n"
+      "ld1rqw { z4.s }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z27.s, z8.s, z3.s[0]\n"
+      "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
+      "add x26, x26, #0x10\n"
+      "fmla z25.s, z9.s, z1.s[1]\n"
+      "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
+      "cmp x15, #0x4\n"
+      "fmla z28.s, z8.s, z4.s[0]\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "addvl x7, x7, #4\n"
+      "fmla z26.s, z9.s, z2.s[1]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "fmla z24.s, z10.s, z0.s[2]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla z27.s, z9.s, z3.s[1]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "fmla z25.s, z10.s, z1.s[2]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "fmla z28.s, z9.s, z4.s[1]\n"
+      "fmla z26.s, z10.s, z2.s[2]\n"
+      "fmla z27.s, z10.s, z3.s[2]\n"
+      "fmla z24.s, z11.s, z0.s[3]\n"
+      "fmla z28.s, z10.s, z4.s[2]\n"
+      "fmla z25.s, z11.s, z1.s[3]\n"
+      "fmla z26.s, z11.s, z2.s[3]\n"
+      "fmla z27.s, z11.s, z3.s[3]\n"
+      "fmla z28.s, z11.s, z4.s[3]\n"
+      "bgt 66b\n"
+      "67:"  // Height 5: Multiply loop: Single iteration only
+      "ld1w { z12.s }, p2/Z, [x7]\n"
+      "whilelt p0.s, XZR, x15\n"
+      "subs x15, x15, #0x1\n"
+      "ld1rqw { z0.s }, p0/Z, [x14]\n"
+      "fmla z24.s, z12.s, z0.s[0]\n"
+      "ld1rqw { z1.s }, p0/Z, [x12]\n"
+      "add x14, x14, #0x10\n"
+      "fmla z25.s, z12.s, z1.s[0]\n"
+      "ld1rqw { z2.s }, p0/Z, [x10]\n"
+      "add x12, x12, #0x10\n"
+      "fmla z26.s, z12.s, z2.s[0]\n"
+      "ld1rqw { z3.s }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z27.s, z12.s, z3.s[0]\n"
+      "ld1rqw { z4.s }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z28.s, z12.s, z4.s[0]\n"
+      "add x26, x26, #0x10\n"
+      "addvl x7, x7, #1\n"
+      "ble 68f\n"
+      "ld1w { z13.s }, p2/Z, [x7]\n"
+      "fmla z24.s, z13.s, z0.s[1]\n"
+      "subs x15, x15, #0x1\n"
+      "fmla z25.s, z13.s, z1.s[1]\n"
+      "addvl x7, x7, #1\n"
+      "fmla z26.s, z13.s, z2.s[1]\n"
+      "fmla z27.s, z13.s, z3.s[1]\n"
+      "fmla z28.s, z13.s, z4.s[1]\n"
+      "ble 68f\n"
+      "ld1w { z14.s }, p2/Z, [x7]\n"
+      "fmla z24.s, z14.s, z0.s[2]\n"
+      "subs x15, x15, #0x1\n"
+      "fmla z25.s, z14.s, z1.s[2]\n"
+      "addvl x7, x7, #1\n"
+      "fmla z26.s, z14.s, z2.s[2]\n"
+      "fmla z27.s, z14.s, z3.s[2]\n"
+      "fmla z28.s, z14.s, z4.s[2]\n"
+      "ble 68f\n"
+      "ld1w { z15.s }, p2/Z, [x7]\n"
+      "fmla z24.s, z15.s, z0.s[3]\n"
+      "addvl x7, x7, #1\n"
+      "fmla z25.s, z15.s, z1.s[3]\n"
+      "fmla z26.s, z15.s, z2.s[3]\n"
+      "fmla z27.s, z15.s, z3.s[3]\n"
+      "fmla z28.s, z15.s, z4.s[3]\n"
+      "68:"  // Height 5: Multiply loop: multiply skip
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "add x16, x16, #0x1\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x16, x19\n"
+      "bne 63b\n"
+      "prfm pstl1keep, [x17, #0x0]\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x11, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "tbz %x[flags], #1, 69f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z17.s }, p2/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rw { z16.s }, p2/Z, [x19]\n"
+      "fmin z24.s, p2/M, z24.s, z16.s\n"
+      "fmin z25.s, p2/M, z25.s, z16.s\n"
+      "fmin z26.s, p2/M, z26.s, z16.s\n"
+      "fmin z27.s, p2/M, z27.s, z16.s\n"
+      "fmin z28.s, p2/M, z28.s, z16.s\n"
+      "fmax z24.s, p2/M, z24.s, z17.s\n"
+      "fmax z25.s, p2/M, z25.s, z17.s\n"
+      "fmax z26.s, p2/M, z26.s, z17.s\n"
+      "fmax z27.s, p2/M, z27.s, z17.s\n"
+      "fmax z28.s, p2/M, z28.s, z17.s\n"
+      "69:"  // Height 5: No activation
+      "st1w { z24.s }, p1, [x17]\n"
+      "addvl x17, x17, #1\n"
+      "st1w { z25.s }, p1, [x13]\n"
+      "addvl x13, x13, #1\n"
+      "st1w { z26.s }, p1, [x11]\n"
+      "addvl x11, x11, #1\n"
+      "st1w { z27.s }, p1, [x9]\n"
+      "addvl x9, x9, #1\n"
+      "st1w { z28.s }, p1, [x27]\n"
+      "addvl x27, x27, #1\n"
+      "70:"  // Height 5: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19\n"
+      "subs x6, x6, x19\n"
+      "bgt 59b\n"
+      "b 114f\n"
+      "71:"  // Height 6
+      "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x8, %x[bias]\n"
+      "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 72f\n"
+      "ldr x17, [%x[output_ptr], #0x0]\n"
+      "add x17, x17, x19, LSL #2\n"
+      "ldr x13, [%x[output_ptr], #0x8]\n"
+      "ldr x11, [%x[output_ptr], #0x10]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x18]\n"
+      "ldr x27, [%x[output_ptr], #0x20]\n"
+      "add x11, x11, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x28]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "add x27, x27, x19, LSL #2\n"
+      "add x25, x25, x19, LSL #2\n"
+      "b 73f\n"
+      "72:"  // Height 6: setup direct output
+      "mov x17, %x[output_ptr]\n"
+      "add x13, x17, x19, LSL #2\n"
+      "add x11, x13, x19, LSL #2\n"
+      "add x9, x11, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "73:"  // Height 6: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p1.s, x19, x6\n"
+      "cbz x8, 74f\n"
+      "ld1w { z24.s }, p2/Z, [x8]\n"
+      "mov z25.d, z24.d\n"
+      "addvl x8, x8, #1\n"
+      "mov z26.d, z24.d\n"
+      "mov z27.d, z24.d\n"
+      "mov z28.d, z24.d\n"
+      "mov z29.d, z24.d\n"
+      "b 76f\n"
+      "74:"  // Height 6: no bias
+      "tbz %x[flags], #0, 75f\n"
+      "ld1w { z24.s }, p1/Z, [x17]\n"
+      "ld1w { z25.s }, p1/Z, [x13]\n"
+      "ld1w { z26.s }, p1/Z, [x11]\n"
+      "ld1w { z27.s }, p1/Z, [x9]\n"
+      "ld1w { z28.s }, p1/Z, [x27]\n"
+      "ld1w { z29.s }, p1/Z, [x25]\n"
+      "b 76f\n"
+      "75:"  // Height 6: no accumulate
+      "mov z24.b, #0x0\n"
+      "mov z25.b, #0x0\n"
+      "mov z26.b, #0x0\n"
+      "mov z27.b, #0x0\n"
+      "mov z28.b, #0x0\n"
+      "mov z29.b, #0x0\n"
+      "76:"  // Height 6: setup done
+      "mov x16, #0x0\n"
+      "77:"  // Height 6: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w15, [x20, x16, LSL #0x2]\n"
+      "tbz %x[flags], #3, 78f\n"
+      "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x14, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "ldr x10, [x20, #0x10]\n"
+      "ldr x28, [x20, #0x18]\n"
+      "ldr x26, [x20, #0x20]\n"
+      "ldr x24, [x20, #0x28]\n"
+      "cbnz x16, 79f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x14, x14, x19, LSL #2\n"
+      "add x12, x12, x19, LSL #2\n"
+      "add x10, x10, x19, LSL #2\n"
+      "add x28, x28, x19, LSL #2\n"
+      "add x26, x26, x19, LSL #2\n"
+      "add x24, x24, x19, LSL #2\n"
+      "b 79f\n"
+      "78:"  // Height 6: setup direct input
+      "mov x14, %x[input_ptr]\n"
+      "add x12, x14, x19, LSL #2\n"
+      "add x10, x12, x19, LSL #2\n"
+      "add x28, x10, x19, LSL #2\n"
+      "add x26, x28, x19, LSL #2\n"
+      "add x24, x26, x19, LSL #2\n"
+      "79:"  // Height 6: input setup done
+      "cmp x15, #0x4\n"
+      "ble 81f\n"
+      "80:"  // Height 6: Multiply loop: Main loop head
+      "ld1w { z8.s }, p2/Z, [x7]\n"
+      "whilelt p0.s, XZR, x15\n"
+      "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
+      "sub x15, x15, #0x4\n"
+      "ld1rqw { z0.s }, p0/Z, [x14]\n"
+      "fmla z24.s, z8.s, z0.s[0]\n"
+      "ld1rqw { z1.s }, p0/Z, [x12]\n"
+      "add x14, x14, #0x10\n"
+      "fmla z25.s, z8.s, z1.s[0]\n"
+      "ld1rqw { z2.s }, p0/Z, [x10]\n"
+      "add x12, x12, #0x10\n"
+      "fmla z24.s, z9.s, z0.s[1]\n"
+      "ld1rqw { z3.s }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z26.s, z8.s, z2.s[0]\n"
+      "ld1rqw { z4.s }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z27.s, z8.s, z3.s[0]\n"
+      "ld1rqw { z5.s }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "fmla z25.s, z9.s, z1.s[1]\n"
+      "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
+      "add x24, x24, #0x10\n"
+      "fmla z28.s, z8.s, z4.s[0]\n"
+      "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
+      "cmp x15, #0x4\n"
+      "fmla z29.s, z8.s, z5.s[0]\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "addvl x7, x7, #4\n"
+      "fmla z26.s, z9.s, z2.s[1]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "fmla z27.s, z9.s, z3.s[1]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla z24.s, z10.s, z0.s[2]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "fmla z28.s, z9.s, z4.s[1]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "fmla z29.s, z9.s, z5.s[1]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "fmla z25.s, z10.s, z1.s[2]\n"
+      "fmla z26.s, z10.s, z2.s[2]\n"
+      "fmla z27.s, z10.s, z3.s[2]\n"
+      "fmla z28.s, z10.s, z4.s[2]\n"
+      "fmla z29.s, z10.s, z5.s[2]\n"
+      "fmla z24.s, z11.s, z0.s[3]\n"
+      "fmla z25.s, z11.s, z1.s[3]\n"
+      "fmla z26.s, z11.s, z2.s[3]\n"
+      "fmla z27.s, z11.s, z3.s[3]\n"
+      "fmla z28.s, z11.s, z4.s[3]\n"
+      "fmla z29.s, z11.s, z5.s[3]\n"
+      "bgt 80b\n"
+      "81:"  // Height 6: Multiply loop: Single iteration only
+      "ld1w { z12.s }, p2/Z, [x7]\n"
+      "whilelt p0.s, XZR, x15\n"
+      "subs x15, x15, #0x1\n"
+      "ld1rqw { z0.s }, p0/Z, [x14]\n"
+      "fmla z24.s, z12.s, z0.s[0]\n"
+      "ld1rqw { z1.s }, p0/Z, [x12]\n"
+      "add x14, x14, #0x10\n"
+      "fmla z25.s, z12.s, z1.s[0]\n"
+      "ld1rqw { z2.s }, p0/Z, [x10]\n"
+      "add x12, x12, #0x10\n"
+      "fmla z26.s, z12.s, z2.s[0]\n"
+      "ld1rqw { z3.s }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z27.s, z12.s, z3.s[0]\n"
+      "ld1rqw { z4.s }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z28.s, z12.s, z4.s[0]\n"
+      "ld1rqw { z5.s }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "fmla z29.s, z12.s, z5.s[0]\n"
+      "add x24, x24, #0x10\n"
+      "addvl x7, x7, #1\n"
+      "ble 82f\n"
+      "ld1w { z13.s }, p2/Z, [x7]\n"
+      "fmla z24.s, z13.s, z0.s[1]\n"
+      "subs x15, x15, #0x1\n"
+      "fmla z25.s, z13.s, z1.s[1]\n"
+      "addvl x7, x7, #1\n"
+      "fmla z26.s, z13.s, z2.s[1]\n"
+      "fmla z27.s, z13.s, z3.s[1]\n"
+      "fmla z28.s, z13.s, z4.s[1]\n"
+      "fmla z29.s, z13.s, z5.s[1]\n"
+      "ble 82f\n"
+      "ld1w { z14.s }, p2/Z, [x7]\n"
+      "fmla z24.s, z14.s, z0.s[2]\n"
+      "subs x15, x15, #0x1\n"
+      "fmla z25.s, z14.s, z1.s[2]\n"
+      "addvl x7, x7, #1\n"
+      "fmla z26.s, z14.s, z2.s[2]\n"
+      "fmla z27.s, z14.s, z3.s[2]\n"
+      "fmla z28.s, z14.s, z4.s[2]\n"
+      "fmla z29.s, z14.s, z5.s[2]\n"
+      "ble 82f\n"
+      "ld1w { z15.s }, p2/Z, [x7]\n"
+      "fmla z24.s, z15.s, z0.s[3]\n"
+      "addvl x7, x7, #1\n"
+      "fmla z25.s, z15.s, z1.s[3]\n"
+      "fmla z26.s, z15.s, z2.s[3]\n"
+      "fmla z27.s, z15.s, z3.s[3]\n"
+      "fmla z28.s, z15.s, z4.s[3]\n"
+      "fmla z29.s, z15.s, z5.s[3]\n"
+      "82:"  // Height 6: Multiply loop: multiply skip
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "add x16, x16, #0x1\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x16, x19\n"
+      "bne 77b\n"
+      "prfm pstl1keep, [x17, #0x0]\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x11, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "tbz %x[flags], #1, 83f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z17.s }, p2/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rw { z16.s }, p2/Z, [x19]\n"
+      "fmin z24.s, p2/M, z24.s, z16.s\n"
+      "fmin z25.s, p2/M, z25.s, z16.s\n"
+      "fmin z26.s, p2/M, z26.s, z16.s\n"
+      "fmin z27.s, p2/M, z27.s, z16.s\n"
+      "fmin z28.s, p2/M, z28.s, z16.s\n"
+      "fmax z24.s, p2/M, z24.s, z17.s\n"
+      "fmax z25.s, p2/M, z25.s, z17.s\n"
+      "fmax z26.s, p2/M, z26.s, z17.s\n"
+      "fmax z27.s, p2/M, z27.s, z17.s\n"
+      "fmax z28.s, p2/M, z28.s, z17.s\n"
+      "fmin z29.s, p2/M, z29.s, z16.s\n"
+      "fmax z29.s, p2/M, z29.s, z17.s\n"
+      "83:"  // Height 6: No activation
+      "st1w { z24.s }, p1, [x17]\n"
+      "addvl x17, x17, #1\n"
+      "st1w { z25.s }, p1, [x13]\n"
+      "addvl x13, x13, #1\n"
+      "st1w { z26.s }, p1, [x11]\n"
+      "addvl x11, x11, #1\n"
+      "st1w { z27.s }, p1, [x9]\n"
+      "addvl x9, x9, #1\n"
+      "st1w { z28.s }, p1, [x27]\n"
+      "addvl x27, x27, #1\n"
+      "st1w { z29.s }, p1, [x25]\n"
+      "addvl x25, x25, #1\n"
+      "84:"  // Height 6: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19\n"
+      "subs x6, x6, x19\n"
+      "bgt 73b\n"
+      "b 114f\n"
+      "85:"  // Height 7
+      "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x8, %x[bias]\n"
+      "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 86f\n"
+      "ldr x17, [%x[output_ptr], #0x0]\n"
+      "add x17, x17, x19, LSL #2\n"
+      "ldr x13, [%x[output_ptr], #0x8]\n"
+      "ldr x11, [%x[output_ptr], #0x10]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x18]\n"
+      "ldr x27, [%x[output_ptr], #0x20]\n"
+      "add x11, x11, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x28]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "ldr x23, [%x[output_ptr], #0x30]\n"
+      "add x27, x27, x19, LSL #2\n"
+      "add x25, x25, x19, LSL #2\n"
+      "add x23, x23, x19, LSL #2\n"
+      "b 87f\n"
+      "86:"  // Height 7: setup direct output
+      "mov x17, %x[output_ptr]\n"
+      "add x13, x17, x19, LSL #2\n"
+      "add x11, x13, x19, LSL #2\n"
+      "add x9, x11, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "add x23, x25, x19, LSL #2\n"
+      "87:"  // Height 7: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p1.s, x19, x6\n"
+      "cbz x8, 88f\n"
+      "ld1w { z24.s }, p2/Z, [x8]\n"
+      "mov z25.d, z24.d\n"
+      "addvl x8, x8, #1\n"
+      "mov z26.d, z24.d\n"
+      "mov z27.d, z24.d\n"
+      "mov z28.d, z24.d\n"
+      "mov z29.d, z24.d\n"
+      "mov z30.d, z24.d\n"
+      "b 90f\n"
+      "88:"  // Height 7: no bias
+      "tbz %x[flags], #0, 89f\n"
+      "ld1w { z24.s }, p1/Z, [x17]\n"
+      "ld1w { z25.s }, p1/Z, [x13]\n"
+      "ld1w { z26.s }, p1/Z, [x11]\n"
+      "ld1w { z27.s }, p1/Z, [x9]\n"
+      "ld1w { z28.s }, p1/Z, [x27]\n"
+      "ld1w { z29.s }, p1/Z, [x25]\n"
+      "ld1w { z30.s }, p1/Z, [x23]\n"
+      "b 90f\n"
+      "89:"  // Height 7: no accumulate
+      "mov z24.b, #0x0\n"
+      "mov z25.b, #0x0\n"
+      "mov z26.b, #0x0\n"
+      "mov z27.b, #0x0\n"
+      "mov z28.b, #0x0\n"
+      "mov z29.b, #0x0\n"
+      "mov z30.b, #0x0\n"
+      "90:"  // Height 7: setup done
+      "mov x16, #0x0\n"
+      "91:"  // Height 7: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w15, [x20, x16, LSL #0x2]\n"
+      "tbz %x[flags], #3, 92f\n"
+      "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x14, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "ldr x10, [x20, #0x10]\n"
+      "ldr x28, [x20, #0x18]\n"
+      "ldr x26, [x20, #0x20]\n"
+      "ldr x24, [x20, #0x28]\n"
+      "ldr x22, [x20, #0x30]\n"
+      "cbnz x16, 93f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x14, x14, x19, LSL #2\n"
+      "add x12, x12, x19, LSL #2\n"
+      "add x10, x10, x19, LSL #2\n"
+      "add x28, x28, x19, LSL #2\n"
+      "add x26, x26, x19, LSL #2\n"
+      "add x24, x24, x19, LSL #2\n"
+      "add x22, x22, x19, LSL #2\n"
+      "b 93f\n"
+      "92:"  // Height 7: setup direct input
+      "mov x14, %x[input_ptr]\n"
+      "add x12, x14, x19, LSL #2\n"
+      "add x10, x12, x19, LSL #2\n"
+      "add x28, x10, x19, LSL #2\n"
+      "add x26, x28, x19, LSL #2\n"
+      "add x24, x26, x19, LSL #2\n"
+      "add x22, x24, x19, LSL #2\n"
+      "93:"  // Height 7: input setup done
+      "cmp x15, #0x4\n"
+      "ble 95f\n"
+      "94:"  // Height 7: Multiply loop: Main loop head
+      "ld1w { z8.s }, p2/Z, [x7]\n"
+      "whilelt p0.s, XZR, x15\n"
+      "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
+      "sub x15, x15, #0x4\n"
+      "ld1rqw { z0.s }, p0/Z, [x14]\n"
+      "fmla z24.s, z8.s, z0.s[0]\n"
+      "ld1rqw { z1.s }, p0/Z, [x12]\n"
+      "add x14, x14, #0x10\n"
+      "fmla z25.s, z8.s, z1.s[0]\n"
+      "ld1rqw { z2.s }, p0/Z, [x10]\n"
+      "add x12, x12, #0x10\n"
+      "fmla z24.s, z9.s, z0.s[1]\n"
+      "ld1rqw { z3.s }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z26.s, z8.s, z2.s[0]\n"
+      "ld1rqw { z4.s }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z27.s, z8.s, z3.s[0]\n"
+      "ld1rqw { z5.s }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "fmla z25.s, z9.s, z1.s[1]\n"
+      "ld1rqw { z6.s }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "fmla z28.s, z8.s, z4.s[0]\n"
+      "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
+      "add x22, x22, #0x10\n"
+      "fmla z29.s, z8.s, z5.s[0]\n"
+      "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
+      "cmp x15, #0x4\n"
+      "fmla z30.s, z8.s, z6.s[0]\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "addvl x7, x7, #4\n"
+      "fmla z26.s, z9.s, z2.s[1]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "fmla z27.s, z9.s, z3.s[1]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla z28.s, z9.s, z4.s[1]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "fmla z29.s, z9.s, z5.s[1]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "fmla z30.s, z9.s, z6.s[1]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "fmla z24.s, z10.s, z0.s[2]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "fmla z25.s, z10.s, z1.s[2]\n"
+      "fmla z26.s, z10.s, z2.s[2]\n"
+      "fmla z27.s, z10.s, z3.s[2]\n"
+      "fmla z28.s, z10.s, z4.s[2]\n"
+      "fmla z29.s, z10.s, z5.s[2]\n"
+      "fmla z30.s, z10.s, z6.s[2]\n"
+      "fmla z24.s, z11.s, z0.s[3]\n"
+      "fmla z25.s, z11.s, z1.s[3]\n"
+      "fmla z26.s, z11.s, z2.s[3]\n"
+      "fmla z27.s, z11.s, z3.s[3]\n"
+      "fmla z28.s, z11.s, z4.s[3]\n"
+      "fmla z29.s, z11.s, z5.s[3]\n"
+      "fmla z30.s, z11.s, z6.s[3]\n"
+      "bgt 94b\n"
+      "95:"  // Height 7: Multiply loop: Single iteration only
+      "ld1w { z12.s }, p2/Z, [x7]\n"
+      "whilelt p0.s, XZR, x15\n"
+      "subs x15, x15, #0x1\n"
+      "ld1rqw { z0.s }, p0/Z, [x14]\n"
+      "fmla z24.s, z12.s, z0.s[0]\n"
+      "ld1rqw { z1.s }, p0/Z, [x12]\n"
+      "add x14, x14, #0x10\n"
+      "fmla z25.s, z12.s, z1.s[0]\n"
+      "ld1rqw { z2.s }, p0/Z, [x10]\n"
+      "add x12, x12, #0x10\n"
+      "fmla z26.s, z12.s, z2.s[0]\n"
+      "ld1rqw { z3.s }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z27.s, z12.s, z3.s[0]\n"
+      "ld1rqw { z4.s }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z28.s, z12.s, z4.s[0]\n"
+      "ld1rqw { z5.s }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "fmla z29.s, z12.s, z5.s[0]\n"
+      "ld1rqw { z6.s }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "fmla z30.s, z12.s, z6.s[0]\n"
+      "add x22, x22, #0x10\n"
+      "addvl x7, x7, #1\n"
+      "ble 96f\n"
+      "ld1w { z13.s }, p2/Z, [x7]\n"
+      "fmla z24.s, z13.s, z0.s[1]\n"
+      "subs x15, x15, #0x1\n"
+      "fmla z25.s, z13.s, z1.s[1]\n"
+      "addvl x7, x7, #1\n"
+      "fmla z26.s, z13.s, z2.s[1]\n"
+      "fmla z27.s, z13.s, z3.s[1]\n"
+      "fmla z28.s, z13.s, z4.s[1]\n"
+      "fmla z29.s, z13.s, z5.s[1]\n"
+      "fmla z30.s, z13.s, z6.s[1]\n"
+      "ble 96f\n"
+      "ld1w { z14.s }, p2/Z, [x7]\n"
+      "fmla z24.s, z14.s, z0.s[2]\n"
+      "subs x15, x15, #0x1\n"
+      "fmla z25.s, z14.s, z1.s[2]\n"
+      "addvl x7, x7, #1\n"
+      "fmla z26.s, z14.s, z2.s[2]\n"
+      "fmla z27.s, z14.s, z3.s[2]\n"
+      "fmla z28.s, z14.s, z4.s[2]\n"
+      "fmla z29.s, z14.s, z5.s[2]\n"
+      "fmla z30.s, z14.s, z6.s[2]\n"
+      "ble 96f\n"
+      "ld1w { z15.s }, p2/Z, [x7]\n"
+      "fmla z24.s, z15.s, z0.s[3]\n"
+      "addvl x7, x7, #1\n"
+      "fmla z25.s, z15.s, z1.s[3]\n"
+      "fmla z26.s, z15.s, z2.s[3]\n"
+      "fmla z27.s, z15.s, z3.s[3]\n"
+      "fmla z28.s, z15.s, z4.s[3]\n"
+      "fmla z29.s, z15.s, z5.s[3]\n"
+      "fmla z30.s, z15.s, z6.s[3]\n"
+      "96:"  // Height 7: Multiply loop: multiply skip
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "add x16, x16, #0x1\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x16, x19\n"
+      "bne 91b\n"
+      "prfm pstl1keep, [x17, #0x0]\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x11, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "tbz %x[flags], #1, 97f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z17.s }, p2/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rw { z16.s }, p2/Z, [x19]\n"
+      "fmin z24.s, p2/M, z24.s, z16.s\n"
+      "fmin z25.s, p2/M, z25.s, z16.s\n"
+      "fmin z26.s, p2/M, z26.s, z16.s\n"
+      "fmin z27.s, p2/M, z27.s, z16.s\n"
+      "fmin z28.s, p2/M, z28.s, z16.s\n"
+      "fmax z24.s, p2/M, z24.s, z17.s\n"
+      "fmax z25.s, p2/M, z25.s, z17.s\n"
+      "fmax z26.s, p2/M, z26.s, z17.s\n"
+      "fmax z27.s, p2/M, z27.s, z17.s\n"
+      "fmax z28.s, p2/M, z28.s, z17.s\n"
+      "fmin z29.s, p2/M, z29.s, z16.s\n"
+      "fmin z30.s, p2/M, z30.s, z16.s\n"
+      "fmax z29.s, p2/M, z29.s, z17.s\n"
+      "fmax z30.s, p2/M, z30.s, z17.s\n"
+      "97:"  // Height 7: No activation
+      "st1w { z24.s }, p1, [x17]\n"
+      "addvl x17, x17, #1\n"
+      "st1w { z25.s }, p1, [x13]\n"
+      "addvl x13, x13, #1\n"
+      "st1w { z26.s }, p1, [x11]\n"
+      "addvl x11, x11, #1\n"
+      "st1w { z27.s }, p1, [x9]\n"
+      "addvl x9, x9, #1\n"
+      "st1w { z28.s }, p1, [x27]\n"
+      "addvl x27, x27, #1\n"
+      "st1w { z29.s }, p1, [x25]\n"
+      "addvl x25, x25, #1\n"
+      "st1w { z30.s }, p1, [x23]\n"
+      "addvl x23, x23, #1\n"
+      "98:"  // Height 7: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19\n"
+      "subs x6, x6, x19\n"
+      "bgt 87b\n"
+      "b 114f\n"
+      "99:"  // Height 8
+      "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x8, %x[bias]\n"
+      "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 100f\n"
+      "ldr x17, [%x[output_ptr], #0x0]\n"
+      "add x17, x17, x19, LSL #2\n"
+      "ldr x13, [%x[output_ptr], #0x8]\n"
+      "ldr x11, [%x[output_ptr], #0x10]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x18]\n"
+      "ldr x27, [%x[output_ptr], #0x20]\n"
+      "add x11, x11, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x28]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "ldr x23, [%x[output_ptr], #0x30]\n"
+      "ldr x21, [%x[output_ptr], #0x38]\n"
+      "add x27, x27, x19, LSL #2\n"
+      "add x25, x25, x19, LSL #2\n"
+      "add %x[output_ptr], %x[output_ptr], #0x40\n"
+      "add x23, x23, x19, LSL #2\n"
+      "add x21, x21, x19, LSL #2\n"
+      "b 101f\n"
+      "100:"  // Height 8: setup direct output
+      "mov x17, %x[output_ptr]\n"
+      "add x13, x17, x19, LSL #2\n"
+      "add x11, x13, x19, LSL #2\n"
+      "add x9, x11, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "add x23, x25, x19, LSL #2\n"
+      "add x21, x23, x19, LSL #2\n"
+      "add %x[output_ptr], x21, x19, LSL #2\n"
+      "101:"  // Height 8: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p1.s, x19, x6\n"
+      "cbz x8, 102f\n"
+      "ld1w { z24.s }, p2/Z, [x8]\n"
+      "mov z25.d, z24.d\n"
+      "addvl x8, x8, #1\n"
+      "mov z26.d, z24.d\n"
+      "mov z27.d, z24.d\n"
+      "mov z28.d, z24.d\n"
+      "mov z29.d, z24.d\n"
+      "mov z30.d, z24.d\n"
+      "mov z31.d, z24.d\n"
+      "b 104f\n"
+      "102:"  // Height 8: no bias
+      "tbz %x[flags], #0, 103f\n"
+      "ld1w { z24.s }, p1/Z, [x17]\n"
+      "ld1w { z25.s }, p1/Z, [x13]\n"
+      "ld1w { z26.s }, p1/Z, [x11]\n"
+      "ld1w { z27.s }, p1/Z, [x9]\n"
+      "ld1w { z28.s }, p1/Z, [x27]\n"
+      "ld1w { z29.s }, p1/Z, [x25]\n"
+      "ld1w { z30.s }, p1/Z, [x23]\n"
+      "ld1w { z31.s }, p1/Z, [x21]\n"
+      "b 104f\n"
+      "103:"  // Height 8: no accumulate
+      "mov z24.b, #0x0\n"
+      "mov z25.b, #0x0\n"
+      "mov z26.b, #0x0\n"
+      "mov z27.b, #0x0\n"
+      "mov z28.b, #0x0\n"
+      "mov z29.b, #0x0\n"
+      "mov z30.b, #0x0\n"
+      "mov z31.b, #0x0\n"
+      "104:"  // Height 8: setup done
+      "mov x16, #0x0\n"
+      "105:"  // Height 8: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w15, [x20, x16, LSL #0x2]\n"
+      "tbz %x[flags], #3, 106f\n"
+      "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x14, [x20, #0x0]\n"
+      "ldr x12, [x20, #0x8]\n"
+      "ldr x10, [x20, #0x10]\n"
+      "ldr x28, [x20, #0x18]\n"
+      "ldr x26, [x20, #0x20]\n"
+      "ldr x24, [x20, #0x28]\n"
+      "ldr x22, [x20, #0x30]\n"
+      "ldr x20, [x20, #0x38]\n"
+      "cbnz x16, 107f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x14, x14, x19, LSL #2\n"
+      "add x12, x12, x19, LSL #2\n"
+      "add x10, x10, x19, LSL #2\n"
+      "add x28, x28, x19, LSL #2\n"
+      "add x26, x26, x19, LSL #2\n"
+      "add x24, x24, x19, LSL #2\n"
+      "add x22, x22, x19, LSL #2\n"
+      "add x20, x20, x19, LSL #2\n"
+      "b 107f\n"
+      "106:"  // Height 8: setup direct input
+      "mov x14, %x[input_ptr]\n"
+      "add x12, x14, x19, LSL #2\n"
+      "add x10, x12, x19, LSL #2\n"
+      "add x28, x10, x19, LSL #2\n"
+      "add x26, x28, x19, LSL #2\n"
+      "add x24, x26, x19, LSL #2\n"
+      "add x22, x24, x19, LSL #2\n"
+      "add x20, x22, x19, LSL #2\n"
+      "107:"  // Height 8: input setup done
+      "cmp x15, #0x4\n"
+      "ble 109f\n"
+      "108:"  // Height 8: Multiply loop: Main loop head
+      "ld1w { z8.s }, p2/Z, [x7]\n"
+      "whilelt p0.s, XZR, x15\n"
+      "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
+      "sub x15, x15, #0x4\n"
+      "ld1rqw { z0.s }, p0/Z, [x14]\n"
+      "fmla z24.s, z8.s, z0.s[0]\n"
+      "ld1rqw { z1.s }, p0/Z, [x12]\n"
+      "add x14, x14, #0x10\n"
+      "fmla z25.s, z8.s, z1.s[0]\n"
+      "ld1rqw { z2.s }, p0/Z, [x10]\n"
+      "add x12, x12, #0x10\n"
+      "fmla z24.s, z9.s, z0.s[1]\n"
+      "ld1rqw { z3.s }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z26.s, z8.s, z2.s[0]\n"
+      "ld1rqw { z4.s }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z27.s, z8.s, z3.s[0]\n"
+      "ld1rqw { z5.s }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "fmla z25.s, z9.s, z1.s[1]\n"
+      "ld1rqw { z6.s }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "fmla z28.s, z8.s, z4.s[0]\n"
+      "ld1rqw { z7.s }, p0/Z, [x20]\n"
+      "add x22, x22, #0x10\n"
+      "fmla z29.s, z8.s, z5.s[0]\n"
+      "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
+      "add x20, x20, #0x10\n"
+      "fmla z30.s, z8.s, z6.s[0]\n"
+      "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
+      "cmp x15, #0x4\n"
+      "fmla z31.s, z8.s, z7.s[0]\n"
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "addvl x7, x7, #4\n"
+      "fmla z26.s, z9.s, z2.s[1]\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "fmla z27.s, z9.s, z3.s[1]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "fmla z28.s, z9.s, z4.s[1]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "fmla z29.s, z9.s, z5.s[1]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "fmla z30.s, z9.s, z6.s[1]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "fmla z31.s, z9.s, z7.s[1]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "fmla z24.s, z10.s, z0.s[2]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "fmla z25.s, z10.s, z1.s[2]\n"
+      "fmla z26.s, z10.s, z2.s[2]\n"
+      "fmla z27.s, z10.s, z3.s[2]\n"
+      "fmla z28.s, z10.s, z4.s[2]\n"
+      "fmla z29.s, z10.s, z5.s[2]\n"
+      "fmla z30.s, z10.s, z6.s[2]\n"
+      "fmla z31.s, z10.s, z7.s[2]\n"
+      "fmla z24.s, z11.s, z0.s[3]\n"
+      "fmla z25.s, z11.s, z1.s[3]\n"
+      "fmla z26.s, z11.s, z2.s[3]\n"
+      "fmla z27.s, z11.s, z3.s[3]\n"
+      "fmla z28.s, z11.s, z4.s[3]\n"
+      "fmla z29.s, z11.s, z5.s[3]\n"
+      "fmla z30.s, z11.s, z6.s[3]\n"
+      "fmla z31.s, z11.s, z7.s[3]\n"
+      "bgt 108b\n"
+      "109:"  // Height 8: Multiply loop: Single iteration only
+      "ld1w { z12.s }, p2/Z, [x7]\n"
+      "whilelt p0.s, XZR, x15\n"
+      "subs x15, x15, #0x1\n"
+      "ld1rqw { z0.s }, p0/Z, [x14]\n"
+      "fmla z24.s, z12.s, z0.s[0]\n"
+      "ld1rqw { z1.s }, p0/Z, [x12]\n"
+      "add x14, x14, #0x10\n"
+      "fmla z25.s, z12.s, z1.s[0]\n"
+      "ld1rqw { z2.s }, p0/Z, [x10]\n"
+      "add x12, x12, #0x10\n"
+      "fmla z26.s, z12.s, z2.s[0]\n"
+      "ld1rqw { z3.s }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "fmla z27.s, z12.s, z3.s[0]\n"
+      "ld1rqw { z4.s }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "fmla z28.s, z12.s, z4.s[0]\n"
+      "ld1rqw { z5.s }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "fmla z29.s, z12.s, z5.s[0]\n"
+      "ld1rqw { z6.s }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "fmla z30.s, z12.s, z6.s[0]\n"
+      "ld1rqw { z7.s }, p0/Z, [x20]\n"
+      "add x22, x22, #0x10\n"
+      "fmla z31.s, z12.s, z7.s[0]\n"
+      "add x20, x20, #0x10\n"
+      "addvl x7, x7, #1\n"
+      "ble 110f\n"
+      "ld1w { z13.s }, p2/Z, [x7]\n"
+      "fmla z24.s, z13.s, z0.s[1]\n"
+      "subs x15, x15, #0x1\n"
+      "fmla z25.s, z13.s, z1.s[1]\n"
+      "addvl x7, x7, #1\n"
+      "fmla z26.s, z13.s, z2.s[1]\n"
+      "fmla z27.s, z13.s, z3.s[1]\n"
+      "fmla z28.s, z13.s, z4.s[1]\n"
+      "fmla z29.s, z13.s, z5.s[1]\n"
+      "fmla z30.s, z13.s, z6.s[1]\n"
+      "fmla z31.s, z13.s, z7.s[1]\n"
+      "ble 110f\n"
+      "ld1w { z14.s }, p2/Z, [x7]\n"
+      "fmla z24.s, z14.s, z0.s[2]\n"
+      "subs x15, x15, #0x1\n"
+      "fmla z25.s, z14.s, z1.s[2]\n"
+      "addvl x7, x7, #1\n"
+      "fmla z26.s, z14.s, z2.s[2]\n"
+      "fmla z27.s, z14.s, z3.s[2]\n"
+      "fmla z28.s, z14.s, z4.s[2]\n"
+      "fmla z29.s, z14.s, z5.s[2]\n"
+      "fmla z30.s, z14.s, z6.s[2]\n"
+      "fmla z31.s, z14.s, z7.s[2]\n"
+      "ble 110f\n"
+      "ld1w { z15.s }, p2/Z, [x7]\n"
+      "fmla z24.s, z15.s, z0.s[3]\n"
+      "addvl x7, x7, #1\n"
+      "fmla z25.s, z15.s, z1.s[3]\n"
+      "fmla z26.s, z15.s, z2.s[3]\n"
+      "fmla z27.s, z15.s, z3.s[3]\n"
+      "fmla z28.s, z15.s, z4.s[3]\n"
+      "fmla z29.s, z15.s, z5.s[3]\n"
+      "fmla z30.s, z15.s, z6.s[3]\n"
+      "fmla z31.s, z15.s, z7.s[3]\n"
+      "110:"  // Height 8: Multiply loop: multiply skip
+      "prfm pldl1keep, [x14, #0x80]\n"
+      "add x16, x16, #0x1\n"
+      "prfm pldl1keep, [x12, #0x80]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x16, x19\n"
+      "bne 105b\n"
+      "prfm pstl1keep, [x17, #0x0]\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x11, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "prfm pstl1keep, [x21, #0x0]\n"
+      "tbz %x[flags], #1, 111f\n"
+      "add x19, %x[args_ptr], %[offset_min]\n"
+      "ld1rw { z17.s }, p2/Z, [x19]\n"
+      "add x19, %x[args_ptr], %[offset_max]\n"
+      "ld1rw { z16.s }, p2/Z, [x19]\n"
+      "fmin z24.s, p2/M, z24.s, z16.s\n"
+      "fmin z25.s, p2/M, z25.s, z16.s\n"
+      "fmin z26.s, p2/M, z26.s, z16.s\n"
+      "fmin z27.s, p2/M, z27.s, z16.s\n"
+      "fmin z28.s, p2/M, z28.s, z16.s\n"
+      "fmax z24.s, p2/M, z24.s, z17.s\n"
+      "fmax z25.s, p2/M, z25.s, z17.s\n"
+      "fmax z26.s, p2/M, z26.s, z17.s\n"
+      "fmax z27.s, p2/M, z27.s, z17.s\n"
+      "fmax z28.s, p2/M, z28.s, z17.s\n"
+      "fmin z29.s, p2/M, z29.s, z16.s\n"
+      "fmin z30.s, p2/M, z30.s, z16.s\n"
+      "fmin z31.s, p2/M, z31.s, z16.s\n"
+      "fmax z29.s, p2/M, z29.s, z17.s\n"
+      "fmax z30.s, p2/M, z30.s, z17.s\n"
+      "fmax z31.s, p2/M, z31.s, z17.s\n"
+      "111:"  // Height 8: No activation
+      "st1w { z24.s }, p1, [x17]\n"
+      "addvl x17, x17, #1\n"
+      "st1w { z25.s }, p1, [x13]\n"
+      "addvl x13, x13, #1\n"
+      "st1w { z26.s }, p1, [x11]\n"
+      "addvl x11, x11, #1\n"
+      "st1w { z27.s }, p1, [x9]\n"
+      "addvl x9, x9, #1\n"
+      "st1w { z28.s }, p1, [x27]\n"
+      "addvl x27, x27, #1\n"
+      "st1w { z29.s }, p1, [x25]\n"
+      "addvl x25, x25, #1\n"
+      "st1w { z30.s }, p1, [x23]\n"
+      "addvl x23, x23, #1\n"
+      "st1w { z31.s }, p1, [x21]\n"
+      "addvl x21, x21, #1\n"
+      "112:"  // Height 8: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19\n"
+      "subs x6, x6, x19\n"
+      "bgt 101b\n"
+      "subs %x[M], %x[M], #0x8\n"
+      "beq 114f\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "tbz %x[flags], #3, 113f\n"
+      "add x20, x20, #0x8\n"
+      "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "b 1b\n"
+      "113:"  // Update direct input
+      "mov x19, #0x20\n"
+      "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+      "b 1b\n"
+      "114:"  // Exit
+
+      : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+      : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+      : "cc", "memory", "p0", "p1", "p2", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+    );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp
deleted file mode 100644
index 1364585604..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp
+++ /dev/null
@@ -1,3459 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_fp32_mmla_4VLx4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
-    const int K_stride = ((K + 1) / 2) * 2;
-    const long loops_count = ((K + 4) / 8) - 1;
-    K -= loops_count * 8;
-    const long regs_count = (K / 4) - 1;
-    K -= (regs_count + 1) * 4;
-    const long leftovers = K;
-    const long blocks_count = (K + 1) / 2;
-    float nullbias[128];
-    if (!accumulate && !bias) {
-        memset(nullbias, 0, (2 * get_vector_length<float>() * sizeof(float)));
-    }
-    float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
-    float maxval =   static_cast<float>(std::numeric_limits<float>::infinity());
-    const float * const minptr = &minval;
-    const float * const maxptr = &maxval;
-
-    switch(act.type)
-    {
-        default:
-        case Activation::Type::None:
-            break;
-        case Activation::Type::BoundedReLU:
-            maxval = static_cast<float>(act.param1);
-            /* fall through */
-        case Activation::Type::ReLU:
-            minval = 0.0f;
-            break;
-    }
-
-    int rows_to_compute;
-
-    for (int y=0; y<M; y+=rows_to_compute) {
-        const float * const a_ptr0_base = A + (y * lda);
-        const unsigned long ldab = lda * sizeof(float);
-
-        float *c_ptr0 = C + (y * ldc);
-
-        rows_to_compute = M-y;
-        if (rows_to_compute > 8) {
-            if (rows_to_compute % 8) {
-                rows_to_compute = 8 - 1;
-            } else {
-                rows_to_compute = 8;
-            }
-        }
-
-        for (int x0=0; x0<N; x0+=(2 * get_vector_length<float>())) {
-            const long width = std::min((unsigned long)N-x0, (2 * get_vector_length<float>()));
-            long loops = loops_count;
-            long regs = regs_count;
-            long temp = 0;
-            long blocks = blocks_count;
-            const float *a_ptr0 = a_ptr0_base;
-            const float *b_ptr0 = B + (K_stride * x0);
-            const unsigned long ldcb = ldc * sizeof(float);
-            const float *biasptr = bias ? bias+x0 : nullbias;
-
-            switch(rows_to_compute) {
-                case 1:
-                    __asm __volatile (
-                        "whilelt p6.s, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.s\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "mov z1.s, #0\n"
-                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "zip1 z16.s, z15.s, z15.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip2 z17.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "zip1 z18.s, z15.s, z15.s\n"
-                        "zip2 z19.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "mov z14.s, #0\n"
-                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
-                        "mov z1.s, #0\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "zip1 z16.s, z13.s, z14.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "zip2 z17.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "mov z14.s, #0\n"
-                        "zip1 z18.s, z13.s, z14.s\n"
-                        "zip2 z19.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z5.s, #0\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "mov z1.s, #0\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z5.s, #0\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "mov z1.s, #0\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #-4\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "b.eq 5f\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z5.s, #0\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "b.eq 5f\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "uzp1 z0.s, z16.s, z17.s\n"
-                        "uzp1 z1.s, z18.s, z19.s\n"
-                        "st1w z0.s, p0, [%[c_ptr0]]\n"
-                        "st1w z1.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #2\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
-                    );
-                    break;
-                case 2:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "whilelt p6.s, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.s\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z15.s, z15.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "zip2 z17.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "zip1 z18.s, z15.s, z15.s\n"
-                        "zip2 z19.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z14.s, p0/z, [c_ptr1]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z13.s, z14.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "zip2 z17.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "zip1 z18.s, z13.s, z14.s\n"
-                        "zip2 z19.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #-4\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "b.eq 5f\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "ld1rqw z5.s, p6/z, [a_ptr1]\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "b.eq 5f\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "uzp1 z0.s, z16.s, z17.s\n"
-                        "uzp2 z1.s, z16.s, z17.s\n"
-                        "uzp1 z2.s, z18.s, z19.s\n"
-                        "uzp2 z3.s, z18.s, z19.s\n"
-                        "st1w z0.s, p0, [%[c_ptr0]]\n"
-                        "st1w z1.s, p0, [c_ptr1]\n"
-                        "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #2\n"
-                        "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
-                    );
-                    break;
-                case 3:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "whilelt p6.s, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.s\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "mov z3.s, #0\n"
-                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z15.s, z15.s\n"
-                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
-                        "zip2 z17.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "zip1 z18.s, z15.s, z15.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "zip2 z19.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "mov z20.d, z16.d\n"
-                        "mov z21.d, z17.d\n"
-                        "mov z22.d, z18.d\n"
-                        "mov z23.d, z19.d\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "mov z3.s, #0\n"
-                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z14.s, p0/z, [c_ptr1]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z13.s, z14.s\n"
-                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
-                        "zip2 z17.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "zip1 z18.s, z13.s, z14.s\n"
-                        "zip2 z19.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p0/z, [c_ptr2]\n"
-                        "mov z14.s, #0\n"
-                        "zip1 z20.s, z13.s, z14.s\n"
-                        "zip2 z21.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
-                        "mov z14.s, #0\n"
-                        "zip1 z22.s, z13.s, z14.s\n"
-                        "zip2 z23.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z7.s, #0\n"
-                        "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "mov z3.s, #0\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z7.s, #0\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "mov z3.s, #0\n"
-                        "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #-4\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "b.eq 5f\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqw z5.s, p6/z, [a_ptr1]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1rqw z6.s, p6/z, [a_ptr2]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z7.s, #0\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "b.eq 5f\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "uzp1 z0.s, z16.s, z17.s\n"
-                        "uzp2 z1.s, z16.s, z17.s\n"
-                        "uzp1 z2.s, z18.s, z19.s\n"
-                        "uzp2 z3.s, z18.s, z19.s\n"
-                        "st1w z0.s, p0, [%[c_ptr0]]\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "st1w z1.s, p0, [c_ptr1]\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #2\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "uzp1 z4.s, z20.s, z21.s\n"
-                        "uzp1 z5.s, z22.s, z23.s\n"
-                        "st1w z4.s, p0, [c_ptr2]\n"
-                        "st1w z5.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
-                    );
-                    break;
-                case 4:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "whilelt p6.s, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.s\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z15.s, z15.s\n"
-                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
-                        "zip2 z17.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1rqw z3.s, p7/z, [a_ptr3]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "zip1 z18.s, z15.s, z15.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip2 z19.s, z15.s, z15.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z20.d, z16.d\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "mov z21.d, z17.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "mov z22.d, z18.d\n"
-                        "mov z23.d, z19.d\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z14.s, p0/z, [c_ptr1]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z13.s, z14.s\n"
-                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
-                        "zip2 z17.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1rqw z3.s, p7/z, [a_ptr3]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "zip1 z18.s, z13.s, z14.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "zip2 z19.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p0/z, [c_ptr2]\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "ld1w z14.s, p0/z, [c_ptr3]\n"
-                        "zip1 z20.s, z13.s, z14.s\n"
-                        "zip2 z21.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
-                        "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
-                        "zip1 z22.s, z13.s, z14.s\n"
-                        "zip2 z23.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "ld1rqw z7.s, p7/z, [a_ptr3]\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "ld1rqw z7.s, p7/z, [a_ptr3]\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "addvl a_ptr3, a_ptr3, #2\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #-4\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "b.eq 5f\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqw z5.s, p6/z, [a_ptr1]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1rqw z6.s, p6/z, [a_ptr2]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "ld1rqw z7.s, p6/z, [a_ptr3]\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        "addvl a_ptr3, a_ptr3, #1\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "b.eq 5f\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "uzp1 z0.s, z16.s, z17.s\n"
-                        "uzp2 z1.s, z16.s, z17.s\n"
-                        "uzp1 z2.s, z18.s, z19.s\n"
-                        "uzp2 z3.s, z18.s, z19.s\n"
-                        "st1w z0.s, p0, [%[c_ptr0]]\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "st1w z1.s, p0, [c_ptr1]\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #2\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "uzp1 z4.s, z20.s, z21.s\n"
-                        "uzp2 z5.s, z20.s, z21.s\n"
-                        "uzp1 z6.s, z22.s, z23.s\n"
-                        "st1w z4.s, p0, [c_ptr2]\n"
-                        "uzp2 z7.s, z22.s, z23.s\n"
-                        "st1w z5.s, p0, [c_ptr3]\n"
-                        "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
-                    );
-                    break;
-                case 5:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "a_ptr4 .req X3\n"
-                        "c_ptr1 .req X4\n"
-                        "c_ptr2 .req X5\n"
-                        "c_ptr3 .req X6\n"
-                        "c_ptr4 .req X7\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "add a_ptr4, a_ptr3, %[lda]\n"
-                        "add c_ptr4, c_ptr3, %[ldc]\n"
-                        "whilelt p6.s, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.s\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "mov z5.s, #0\n"
-                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z15.s, z15.s\n"
-                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
-                        "zip2 z17.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1rqw z3.s, p7/z, [a_ptr3]\n"
-                        "ld1rqw z4.s, p7/z, [a_ptr4]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "zip1 z18.s, z15.s, z15.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "zip2 z19.s, z15.s, z15.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z20.d, z16.d\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "mov z21.d, z17.d\n"
-                        "add a_ptr4, a_ptr4, #0x10\n"
-                        "mov z22.d, z18.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "mov z23.d, z19.d\n"
-                        "mov z24.d, z16.d\n"
-                        "mov z25.d, z17.d\n"
-                        "mov z26.d, z18.d\n"
-                        "mov z27.d, z19.d\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "mov z5.s, #0\n"
-                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z14.s, p0/z, [c_ptr1]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z13.s, z14.s\n"
-                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
-                        "zip2 z17.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1rqw z3.s, p7/z, [a_ptr3]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ld1rqw z4.s, p7/z, [a_ptr4]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "zip1 z18.s, z13.s, z14.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "zip2 z19.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p0/z, [c_ptr2]\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "ld1w z14.s, p0/z, [c_ptr3]\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add a_ptr4, a_ptr4, #0x10\n"
-                        "zip1 z20.s, z13.s, z14.s\n"
-                        "zip2 z21.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
-                        "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
-                        "zip1 z22.s, z13.s, z14.s\n"
-                        "zip2 z23.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p0/z, [c_ptr4]\n"
-                        "mov z14.s, #0\n"
-                        "zip1 z24.s, z13.s, z14.s\n"
-                        "zip2 z25.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
-                        "mov z14.s, #0\n"
-                        "zip1 z26.s, z13.s, z14.s\n"
-                        "zip2 z27.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqw z7.s, p7/z, [a_ptr3]\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "ld1rqw z8.s, p7/z, [a_ptr4]\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
-                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z9.s, #0\n"
-                        "add a_ptr4, a_ptr4, #0x20\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        "trn2 z10.d, z8.d, z9.d\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z2.d, z8.d, z9.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
-                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "mov z5.s, #0\n"
-                        "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqw z7.s, p7/z, [a_ptr3]\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "addvl a_ptr3, a_ptr3, #2\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "ld1rqw z8.s, p7/z, [a_ptr4]\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z9.s, #0\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        "trn2 z10.d, z8.d, z9.d\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z2.d, z8.d, z9.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "addvl a_ptr4, a_ptr4, #2\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
-                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "mov z5.s, #0\n"
-                        "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #-4\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
-                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
-                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
-                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
-                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
-                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
-                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
-                        "b.eq 5f\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
-                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
-                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
-                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqw z6.s, p6/z, [a_ptr2]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqw z7.s, p6/z, [a_ptr3]\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1rqw z5.s, p6/z, [a_ptr1]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "ld1rqw z8.s, p6/z, [a_ptr4]\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        "addvl a_ptr3, a_ptr3, #1\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        "addvl a_ptr4, a_ptr4, #1\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z9.s, #0\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
-                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
-                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
-                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
-                        "trn1 z2.d, z8.d, z9.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z10.d, z8.d, z9.d\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
-                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
-                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
-                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
-                        "b.eq 5f\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
-                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
-                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
-                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "uzp1 z0.s, z16.s, z17.s\n"
-                        "uzp2 z1.s, z16.s, z17.s\n"
-                        "uzp1 z2.s, z18.s, z19.s\n"
-                        "uzp2 z3.s, z18.s, z19.s\n"
-                        "st1w z0.s, p0, [%[c_ptr0]]\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "st1w z1.s, p0, [c_ptr1]\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "fmax z24.s, p7/m, z24.s, z14.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #2\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "uzp1 z4.s, z20.s, z21.s\n"
-                        "uzp2 z5.s, z20.s, z21.s\n"
-                        "fmin z24.s, p7/m, z24.s, z15.s\n"
-                        "uzp1 z6.s, z22.s, z23.s\n"
-                        "st1w z4.s, p0, [c_ptr2]\n"
-                        "uzp2 z7.s, z22.s, z23.s\n"
-                        "fmax z25.s, p7/m, z25.s, z14.s\n"
-                        "fmax z26.s, p7/m, z26.s, z14.s\n"
-                        "st1w z5.s, p0, [c_ptr3]\n"
-                        "fmax z27.s, p7/m, z27.s, z14.s\n"
-                        "fmin z25.s, p7/m, z25.s, z15.s\n"
-                        "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "fmin z26.s, p7/m, z26.s, z15.s\n"
-                        "fmin z27.s, p7/m, z27.s, z15.s\n"
-                        "uzp1 z8.s, z24.s, z25.s\n"
-                        "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
-                        "uzp1 z9.s, z26.s, z27.s\n"
-                        "st1w z8.s, p0, [c_ptr4]\n"
-                        "st1w z9.s, p1, [c_ptr4, #1, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq a_ptr4\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        ".unreq c_ptr4\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
-                    );
-                    break;
-                case 6:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "a_ptr4 .req X3\n"
-                        "a_ptr5 .req X4\n"
-                        "c_ptr1 .req X5\n"
-                        "c_ptr2 .req X6\n"
-                        "c_ptr3 .req X7\n"
-                        "c_ptr4 .req X8\n"
-                        "c_ptr5 .req X9\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "add a_ptr4, a_ptr3, %[lda]\n"
-                        "add c_ptr4, c_ptr3, %[ldc]\n"
-                        "add a_ptr5, a_ptr4, %[lda]\n"
-                        "add c_ptr5, c_ptr4, %[ldc]\n"
-                        "whilelt p6.s, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.s\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z15.s, z15.s\n"
-                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
-                        "zip2 z17.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1rqw z3.s, p7/z, [a_ptr3]\n"
-                        "ld1rqw z4.s, p7/z, [a_ptr4]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "zip1 z18.s, z15.s, z15.s\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr5]\n"
-                        "zip2 z19.s, z15.s, z15.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "mov z20.d, z16.d\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z21.d, z17.d\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "mov z22.d, z18.d\n"
-                        "add a_ptr4, a_ptr4, #0x10\n"
-                        "mov z23.d, z19.d\n"
-                        "add a_ptr5, a_ptr5, #0x10\n"
-                        "mov z24.d, z16.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "mov z25.d, z17.d\n"
-                        "mov z26.d, z18.d\n"
-                        "mov z27.d, z19.d\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z14.s, p0/z, [c_ptr1]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z13.s, z14.s\n"
-                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
-                        "zip2 z17.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1rqw z3.s, p7/z, [a_ptr3]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ld1rqw z4.s, p7/z, [a_ptr4]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "zip1 z18.s, z13.s, z14.s\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr5]\n"
-                        "zip2 z19.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p0/z, [c_ptr2]\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "ld1w z14.s, p0/z, [c_ptr3]\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "add a_ptr4, a_ptr4, #0x10\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "zip1 z20.s, z13.s, z14.s\n"
-                        "add a_ptr5, a_ptr5, #0x10\n"
-                        "zip2 z21.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
-                        "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
-                        "zip1 z22.s, z13.s, z14.s\n"
-                        "zip2 z23.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p0/z, [c_ptr4]\n"
-                        "ld1w z14.s, p0/z, [c_ptr5]\n"
-                        "zip1 z24.s, z13.s, z14.s\n"
-                        "zip2 z25.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
-                        "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n"
-                        "zip1 z26.s, z13.s, z14.s\n"
-                        "zip2 z27.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqw z7.s, p7/z, [a_ptr3]\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "ld1rqw z8.s, p7/z, [a_ptr4]\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "ld1rqw z9.s, p7/z, [a_ptr5]\n"
-                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "trn2 z10.d, z8.d, z9.d\n"
-                        "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "add a_ptr4, a_ptr4, #0x20\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        "add a_ptr5, a_ptr5, #0x20\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z2.d, z8.d, z9.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr5, #-0x10]\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
-                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqw z7.s, p7/z, [a_ptr3]\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "addvl a_ptr3, a_ptr3, #2\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "ld1rqw z8.s, p7/z, [a_ptr4]\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "ld1rqw z9.s, p7/z, [a_ptr5]\n"
-                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "trn2 z10.d, z8.d, z9.d\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z2.d, z8.d, z9.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        "addvl a_ptr4, a_ptr4, #2\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        "addvl a_ptr5, a_ptr5, #2\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
-                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #-4\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
-                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
-                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
-                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
-                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
-                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
-                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
-                        "b.eq 5f\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
-                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
-                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
-                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1rqw z6.s, p6/z, [a_ptr2]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1rqw z7.s, p6/z, [a_ptr3]\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1rqw z5.s, p6/z, [a_ptr1]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "ld1rqw z8.s, p6/z, [a_ptr4]\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        "addvl a_ptr3, a_ptr3, #1\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        "addvl a_ptr4, a_ptr4, #1\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "ld1rqw z9.s, p6/z, [a_ptr5]\n"
-                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        "addvl a_ptr5, a_ptr5, #1\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
-                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
-                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
-                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
-                        "trn1 z2.d, z8.d, z9.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z10.d, z8.d, z9.d\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
-                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
-                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
-                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
-                        "b.eq 5f\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
-                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
-                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
-                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "uzp1 z0.s, z16.s, z17.s\n"
-                        "uzp2 z1.s, z16.s, z17.s\n"
-                        "uzp1 z2.s, z18.s, z19.s\n"
-                        "uzp2 z3.s, z18.s, z19.s\n"
-                        "st1w z0.s, p0, [%[c_ptr0]]\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "st1w z1.s, p0, [c_ptr1]\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "fmax z24.s, p7/m, z24.s, z14.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #2\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "uzp1 z4.s, z20.s, z21.s\n"
-                        "uzp2 z5.s, z20.s, z21.s\n"
-                        "fmin z24.s, p7/m, z24.s, z15.s\n"
-                        "uzp1 z6.s, z22.s, z23.s\n"
-                        "st1w z4.s, p0, [c_ptr2]\n"
-                        "uzp2 z7.s, z22.s, z23.s\n"
-                        "fmax z25.s, p7/m, z25.s, z14.s\n"
-                        "fmax z26.s, p7/m, z26.s, z14.s\n"
-                        "st1w z5.s, p0, [c_ptr3]\n"
-                        "fmax z27.s, p7/m, z27.s, z14.s\n"
-                        "fmin z25.s, p7/m, z25.s, z15.s\n"
-                        "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "fmin z26.s, p7/m, z26.s, z15.s\n"
-                        "fmin z27.s, p7/m, z27.s, z15.s\n"
-                        "uzp1 z8.s, z24.s, z25.s\n"
-                        "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
-                        "uzp2 z9.s, z24.s, z25.s\n"
-                        "uzp1 z10.s, z26.s, z27.s\n"
-                        "uzp2 z11.s, z26.s, z27.s\n"
-                        "st1w z8.s, p0, [c_ptr4]\n"
-                        "st1w z9.s, p0, [c_ptr5]\n"
-                        "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n"
-                        "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq a_ptr4\n"
-                        ".unreq a_ptr5\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        ".unreq c_ptr4\n"
-                        ".unreq c_ptr5\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
-                    );
-                    break;
-                case 7:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "a_ptr4 .req X3\n"
-                        "a_ptr5 .req X4\n"
-                        "a_ptr6 .req X5\n"
-                        "c_ptr1 .req X6\n"
-                        "c_ptr2 .req X7\n"
-                        "c_ptr3 .req X8\n"
-                        "c_ptr4 .req X9\n"
-                        "c_ptr5 .req X10\n"
-                        "c_ptr6 .req X11\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "add a_ptr4, a_ptr3, %[lda]\n"
-                        "add c_ptr4, c_ptr3, %[ldc]\n"
-                        "add a_ptr5, a_ptr4, %[lda]\n"
-                        "add c_ptr5, c_ptr4, %[ldc]\n"
-                        "add a_ptr6, a_ptr5, %[lda]\n"
-                        "add c_ptr6, c_ptr5, %[ldc]\n"
-                        "whilelt p6.s, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.s\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "mov z7.s, #0\n"
-                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z15.s, z15.s\n"
-                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
-                        "zip2 z17.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1rqw z3.s, p7/z, [a_ptr3]\n"
-                        "ld1rqw z4.s, p7/z, [a_ptr4]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "zip1 z18.s, z15.s, z15.s\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr5]\n"
-                        "zip2 z19.s, z15.s, z15.s\n"
-                        "ld1rqw z6.s, p7/z, [a_ptr6]\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "mov z20.d, z16.d\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "trn1 z11.d, z6.d, z7.d\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z21.d, z17.d\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "mov z22.d, z18.d\n"
-                        "add a_ptr4, a_ptr4, #0x10\n"
-                        "mov z23.d, z19.d\n"
-                        "add a_ptr5, a_ptr5, #0x10\n"
-                        "mov z24.d, z16.d\n"
-                        "add a_ptr6, a_ptr6, #0x10\n"
-                        "mov z25.d, z17.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "mov z26.d, z18.d\n"
-                        "mov z27.d, z19.d\n"
-                        "mov z28.d, z16.d\n"
-                        "mov z29.d, z17.d\n"
-                        "mov z30.d, z18.d\n"
-                        "mov z31.d, z19.d\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "mov z7.s, #0\n"
-                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z14.s, p0/z, [c_ptr1]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z13.s, z14.s\n"
-                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
-                        "zip2 z17.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1rqw z3.s, p7/z, [a_ptr3]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ld1rqw z4.s, p7/z, [a_ptr4]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "zip1 z18.s, z13.s, z14.s\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr5]\n"
-                        "zip2 z19.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p0/z, [c_ptr2]\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "ld1w z14.s, p0/z, [c_ptr3]\n"
-                        "ld1rqw z6.s, p7/z, [a_ptr6]\n"
-                        "add a_ptr4, a_ptr4, #0x10\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "zip1 z20.s, z13.s, z14.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "zip2 z21.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
-                        "trn1 z11.d, z6.d, z7.d\n"
-                        "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
-                        "add a_ptr5, a_ptr5, #0x10\n"
-                        "add a_ptr6, a_ptr6, #0x10\n"
-                        "zip1 z22.s, z13.s, z14.s\n"
-                        "zip2 z23.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p0/z, [c_ptr4]\n"
-                        "ld1w z14.s, p0/z, [c_ptr5]\n"
-                        "zip1 z24.s, z13.s, z14.s\n"
-                        "zip2 z25.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
-                        "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n"
-                        "zip1 z26.s, z13.s, z14.s\n"
-                        "zip2 z27.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p0/z, [c_ptr6]\n"
-                        "mov z14.s, #0\n"
-                        "zip1 z28.s, z13.s, z14.s\n"
-                        "zip2 z29.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [c_ptr6, #1, MUL VL]\n"
-                        "mov z14.s, #0\n"
-                        "zip1 z30.s, z13.s, z14.s\n"
-                        "zip2 z31.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        "trn2 z3.d, z6.d, z7.d\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "ld1rqw z7.s, p7/z, [a_ptr3]\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "ld1rqw z8.s, p7/z, [a_ptr4]\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "ld1rqw z9.s, p7/z, [a_ptr5]\n"
-                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
-                        "add a_ptr4, a_ptr4, #0x20\n"
-                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
-                        "add a_ptr5, a_ptr5, #0x20\n"
-                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
-                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
-                        "ld1rqw z10.s, p7/z, [a_ptr6]\n"
-                        ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z11.s, #0\n"
-                        "add a_ptr6, a_ptr6, #0x20\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
-                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
-                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
-                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
-                        "trn1 z2.d, z8.d, z9.d\n"
-                        ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z3.d, z10.d, z11.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "trn2 z11.d, z10.d, z11.d\n"
-                        "trn2 z10.d, z8.d, z9.d\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "ld1rqw z6.s, p7/z, [a_ptr6, #-0x10]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr5, #-0x10]\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
-                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
-                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
-                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
-                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
-                        "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
-                        ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "mov z7.s, #0\n"
-                        "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
-                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
-                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
-                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "trn1 z11.d, z6.d, z7.d\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        "trn2 z3.d, z6.d, z7.d\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "ld1rqw z7.s, p7/z, [a_ptr3]\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "ld1rqw z8.s, p7/z, [a_ptr4]\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "ld1rqw z9.s, p7/z, [a_ptr5]\n"
-                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
-                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
-                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
-                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
-                        "ld1rqw z10.s, p7/z, [a_ptr6]\n"
-                        ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z11.s, #0\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
-                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
-                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
-                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
-                        "trn1 z2.d, z8.d, z9.d\n"
-                        ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z3.d, z10.d, z11.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "trn2 z11.d, z10.d, z11.d\n"
-                        "trn2 z10.d, z8.d, z9.d\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        "addvl a_ptr4, a_ptr4, #2\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        "addvl a_ptr5, a_ptr5, #2\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        "addvl a_ptr6, a_ptr6, #2\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
-                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
-                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
-                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
-                        "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
-                        ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "mov z7.s, #0\n"
-                        "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #-4\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "addvl a_ptr3, a_ptr3, #2\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
-                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
-                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
-                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
-                        ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
-                        ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
-                        ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
-                        "trn1 z11.d, z6.d, z7.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "trn2 z3.d, z6.d, z7.d\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
-                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
-                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
-                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
-                        ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
-                        ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
-                        ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
-                        ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
-                        "b.eq 5f\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
-                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
-                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
-                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
-                        ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
-                        ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
-                        ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
-                        ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
-                        "trn2 z3.d, z6.d, z7.d\n"
-                        "ld1rqw z5.s, p6/z, [a_ptr1]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1rqw z6.s, p6/z, [a_ptr2]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "ld1rqw z7.s, p6/z, [a_ptr3]\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "ld1rqw z8.s, p6/z, [a_ptr4]\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        "addvl a_ptr3, a_ptr3, #1\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "ld1rqw z9.s, p6/z, [a_ptr5]\n"
-                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
-                        "addvl a_ptr4, a_ptr4, #1\n"
-                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
-                        "addvl a_ptr5, a_ptr5, #1\n"
-                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
-                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
-                        "ld1rqw z10.s, p6/z, [a_ptr6]\n"
-                        ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z11.s, #0\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "addvl a_ptr6, a_ptr6, #1\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
-                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
-                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
-                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
-                        "trn1 z2.d, z8.d, z9.d\n"
-                        ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
-                        ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
-                        ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
-                        ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
-                        "trn1 z3.d, z10.d, z11.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z11.d, z10.d, z11.d\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z10.d, z8.d, z9.d\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
-                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
-                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
-                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
-                        ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
-                        ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
-                        ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
-                        ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
-                        "b.eq 5f\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
-                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
-                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
-                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
-                        ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
-                        ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
-                        ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
-                        ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "uzp1 z0.s, z16.s, z17.s\n"
-                        "uzp2 z1.s, z16.s, z17.s\n"
-                        "uzp1 z2.s, z18.s, z19.s\n"
-                        "uzp2 z3.s, z18.s, z19.s\n"
-                        "st1w z0.s, p0, [%[c_ptr0]]\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "st1w z1.s, p0, [c_ptr1]\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "fmax z24.s, p7/m, z24.s, z14.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #2\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "uzp1 z4.s, z20.s, z21.s\n"
-                        "uzp2 z5.s, z20.s, z21.s\n"
-                        "fmin z24.s, p7/m, z24.s, z15.s\n"
-                        "uzp1 z6.s, z22.s, z23.s\n"
-                        "st1w z4.s, p0, [c_ptr2]\n"
-                        "uzp2 z7.s, z22.s, z23.s\n"
-                        "fmax z25.s, p7/m, z25.s, z14.s\n"
-                        "fmax z26.s, p7/m, z26.s, z14.s\n"
-                        "st1w z5.s, p0, [c_ptr3]\n"
-                        "fmax z27.s, p7/m, z27.s, z14.s\n"
-                        "fmax z28.s, p7/m, z28.s, z14.s\n"
-                        "fmin z25.s, p7/m, z25.s, z15.s\n"
-                        "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "fmin z26.s, p7/m, z26.s, z15.s\n"
-                        "fmin z27.s, p7/m, z27.s, z15.s\n"
-                        "fmin z28.s, p7/m, z28.s, z15.s\n"
-                        "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
-                        "uzp1 z8.s, z24.s, z25.s\n"
-                        "uzp2 z9.s, z24.s, z25.s\n"
-                        "uzp1 z10.s, z26.s, z27.s\n"
-                        "uzp2 z11.s, z26.s, z27.s\n"
-                        "st1w z8.s, p0, [c_ptr4]\n"
-                        "fmax z29.s, p7/m, z29.s, z14.s\n"
-                        "fmax z30.s, p7/m, z30.s, z14.s\n"
-                        "fmax z31.s, p7/m, z31.s, z14.s\n"
-                        "st1w z9.s, p0, [c_ptr5]\n"
-                        "fmin z29.s, p7/m, z29.s, z15.s\n"
-                        "fmin z30.s, p7/m, z30.s, z15.s\n"
-                        "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n"
-                        "fmin z31.s, p7/m, z31.s, z15.s\n"
-                        "uzp1 z12.s, z28.s, z29.s\n"
-                        "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n"
-                        "uzp1 z13.s, z30.s, z31.s\n"
-                        "st1w z12.s, p0, [c_ptr6]\n"
-                        "st1w z13.s, p1, [c_ptr6, #1, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq a_ptr4\n"
-                        ".unreq a_ptr5\n"
-                        ".unreq a_ptr6\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        ".unreq c_ptr4\n"
-                        ".unreq c_ptr5\n"
-                        ".unreq c_ptr6\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "cc", "memory"
-                    );
-                    break;
-                default:
-                case 8:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "a_ptr4 .req X3\n"
-                        "a_ptr5 .req X4\n"
-                        "a_ptr6 .req X5\n"
-                        "a_ptr7 .req X6\n"
-                        "c_ptr1 .req X7\n"
-                        "c_ptr2 .req X8\n"
-                        "c_ptr3 .req X9\n"
-                        "c_ptr4 .req X10\n"
-                        "c_ptr5 .req X11\n"
-                        "c_ptr6 .req X12\n"
-                        "c_ptr7 .req X13\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "add a_ptr4, a_ptr3, %[lda]\n"
-                        "add c_ptr4, c_ptr3, %[ldc]\n"
-                        "add a_ptr5, a_ptr4, %[lda]\n"
-                        "add c_ptr5, c_ptr4, %[ldc]\n"
-                        "add a_ptr6, a_ptr5, %[lda]\n"
-                        "add c_ptr6, c_ptr5, %[ldc]\n"
-                        "add a_ptr7, a_ptr6, %[lda]\n"
-                        "add c_ptr7, c_ptr6, %[ldc]\n"
-                        "whilelt p6.s, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.s\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z15.s, z15.s\n"
-                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
-                        "zip2 z17.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1rqw z3.s, p7/z, [a_ptr3]\n"
-                        "ld1rqw z4.s, p7/z, [a_ptr4]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "zip1 z18.s, z15.s, z15.s\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr5]\n"
-                        "zip2 z19.s, z15.s, z15.s\n"
-                        "ld1rqw z6.s, p7/z, [a_ptr6]\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "ld1rqw z7.s, p7/z, [a_ptr7]\n"
-                        "mov z20.d, z16.d\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "mov z21.d, z17.d\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "trn1 z11.d, z6.d, z7.d\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z22.d, z18.d\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "mov z23.d, z19.d\n"
-                        "add a_ptr4, a_ptr4, #0x10\n"
-                        "mov z24.d, z16.d\n"
-                        "add a_ptr5, a_ptr5, #0x10\n"
-                        "mov z25.d, z17.d\n"
-                        "add a_ptr6, a_ptr6, #0x10\n"
-                        "mov z26.d, z18.d\n"
-                        "add a_ptr7, a_ptr7, #0x10\n"
-                        "mov z27.d, z19.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "mov z28.d, z16.d\n"
-                        "mov z29.d, z17.d\n"
-                        "mov z30.d, z18.d\n"
-                        "mov z31.d, z19.d\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z14.s, p0/z, [c_ptr1]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z16.s, z13.s, z14.s\n"
-                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
-                        "zip2 z17.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1rqw z3.s, p7/z, [a_ptr3]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ld1rqw z4.s, p7/z, [a_ptr4]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "zip1 z18.s, z13.s, z14.s\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr5]\n"
-                        "zip2 z19.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p0/z, [c_ptr2]\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        "ld1w z14.s, p0/z, [c_ptr3]\n"
-                        "ld1rqw z6.s, p7/z, [a_ptr6]\n"
-                        "add a_ptr4, a_ptr4, #0x10\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        "ld1rqw z7.s, p7/z, [a_ptr7]\n"
-                        "zip1 z20.s, z13.s, z14.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "zip2 z21.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
-                        "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
-                        "add a_ptr5, a_ptr5, #0x10\n"
-                        "trn1 z11.d, z6.d, z7.d\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add a_ptr6, a_ptr6, #0x10\n"
-                        "zip1 z22.s, z13.s, z14.s\n"
-                        "add a_ptr7, a_ptr7, #0x10\n"
-                        "zip2 z23.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p0/z, [c_ptr4]\n"
-                        "ld1w z14.s, p0/z, [c_ptr5]\n"
-                        "zip1 z24.s, z13.s, z14.s\n"
-                        "zip2 z25.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
-                        "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n"
-                        "zip1 z26.s, z13.s, z14.s\n"
-                        "zip2 z27.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p0/z, [c_ptr6]\n"
-                        "ld1w z14.s, p0/z, [c_ptr7]\n"
-                        "zip1 z28.s, z13.s, z14.s\n"
-                        "zip2 z29.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [c_ptr6, #1, MUL VL]\n"
-                        "ld1w z14.s, p1/z, [c_ptr7, #1, MUL VL]\n"
-                        "zip1 z30.s, z13.s, z14.s\n"
-                        "zip2 z31.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        "trn2 z3.d, z6.d, z7.d\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "ld1rqw z7.s, p7/z, [a_ptr3]\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "ld1rqw z8.s, p7/z, [a_ptr4]\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "ld1rqw z9.s, p7/z, [a_ptr5]\n"
-                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
-                        "add a_ptr4, a_ptr4, #0x20\n"
-                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
-                        "add a_ptr5, a_ptr5, #0x20\n"
-                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
-                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
-                        "ld1rqw z10.s, p7/z, [a_ptr6]\n"
-                        ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "ld1rqw z11.s, p7/z, [a_ptr7]\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        "add a_ptr6, a_ptr6, #0x20\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        "add a_ptr7, a_ptr7, #0x20\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
-                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
-                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
-                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
-                        "trn1 z2.d, z8.d, z9.d\n"
-                        ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z3.d, z10.d, z11.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "trn2 z11.d, z10.d, z11.d\n"
-                        "trn2 z10.d, z8.d, z9.d\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "ld1rqw z6.s, p7/z, [a_ptr6, #-0x10]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr5, #-0x10]\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        "ld1rqw z7.s, p7/z, [a_ptr7, #-0x10]\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
-                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
-                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
-                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
-                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
-                        "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
-                        ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
-                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
-                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
-                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "trn1 z11.d, z6.d, z7.d\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        "trn2 z3.d, z6.d, z7.d\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "ld1rqw z7.s, p7/z, [a_ptr3]\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "ld1rqw z8.s, p7/z, [a_ptr4]\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "ld1rqw z9.s, p7/z, [a_ptr5]\n"
-                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
-                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
-                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
-                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
-                        "ld1rqw z10.s, p7/z, [a_ptr6]\n"
-                        ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "ld1rqw z11.s, p7/z, [a_ptr7]\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
-                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
-                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
-                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
-                        "trn1 z2.d, z8.d, z9.d\n"
-                        ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "trn1 z3.d, z10.d, z11.d\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "trn2 z11.d, z10.d, z11.d\n"
-                        "trn2 z10.d, z8.d, z9.d\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        "addvl a_ptr4, a_ptr4, #2\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        "addvl a_ptr5, a_ptr5, #2\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        "addvl a_ptr6, a_ptr6, #2\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
-                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
-                        "addvl a_ptr7, a_ptr7, #2\n"
-                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
-                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
-                        "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
-                        ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #-4\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "addvl a_ptr3, a_ptr3, #2\n"
-                        "trn1 z8.d, z0.d, z1.d\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "trn1 z9.d, z2.d, z3.d\n"
-                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
-                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
-                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
-                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
-                        "trn1 z10.d, z4.d, z5.d\n"
-                        ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
-                        ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
-                        ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
-                        ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
-                        "trn1 z11.d, z6.d, z7.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "trn2 z3.d, z6.d, z7.d\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
-                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
-                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
-                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
-                        ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
-                        ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
-                        ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
-                        ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
-                        "b.eq 5f\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
-                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
-                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
-                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
-                        ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
-                        ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
-                        ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
-                        ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "trn2 z0.d, z0.d, z1.d\n"
-                        "trn2 z1.d, z2.d, z3.d\n"
-                        "trn2 z2.d, z4.d, z5.d\n"
-                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
-                        "trn2 z3.d, z6.d, z7.d\n"
-                        "ld1rqw z5.s, p6/z, [a_ptr1]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        "ld1rqw z6.s, p6/z, [a_ptr2]\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        "ld1rqw z7.s, p6/z, [a_ptr3]\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        "ld1rqw z8.s, p6/z, [a_ptr4]\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        "addvl a_ptr3, a_ptr3, #1\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        "ld1rqw z9.s, p6/z, [a_ptr5]\n"
-                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
-                        "addvl a_ptr4, a_ptr4, #1\n"
-                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
-                        "addvl a_ptr5, a_ptr5, #1\n"
-                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
-                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
-                        "ld1rqw z10.s, p6/z, [a_ptr6]\n"
-                        ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "ld1rqw z11.s, p6/z, [a_ptr7]\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        "addvl a_ptr6, a_ptr6, #1\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        "addvl a_ptr7, a_ptr7, #1\n"
-                        "trn1 z0.d, z4.d, z5.d\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        "trn1 z1.d, z6.d, z7.d\n"
-                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
-                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
-                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
-                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
-                        "trn1 z2.d, z8.d, z9.d\n"
-                        ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
-                        ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
-                        ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
-                        ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
-                        "trn1 z3.d, z10.d, z11.d\n"
-                        "cbz %[blocks], 5f\n"
-                        "trn2 z11.d, z10.d, z11.d\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
-                        "trn2 z10.d, z8.d, z9.d\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "trn2 z9.d, z6.d, z7.d\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "trn2 z8.d, z4.d, z5.d\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
-                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
-                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
-                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
-                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
-                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
-                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
-                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
-                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
-                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
-                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
-                        ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
-                        ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
-                        ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
-                        ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
-                        "b.eq 5f\n"
-                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
-                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
-                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
-                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
-                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
-                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
-                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
-                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
-                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
-                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
-                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
-                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
-                        ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
-                        ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
-                        ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
-                        ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "uzp1 z0.s, z16.s, z17.s\n"
-                        "uzp2 z1.s, z16.s, z17.s\n"
-                        "uzp1 z2.s, z18.s, z19.s\n"
-                        "uzp2 z3.s, z18.s, z19.s\n"
-                        "st1w z0.s, p0, [%[c_ptr0]]\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "st1w z1.s, p0, [c_ptr1]\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "fmax z24.s, p7/m, z24.s, z14.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #2\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "uzp1 z4.s, z20.s, z21.s\n"
-                        "uzp2 z5.s, z20.s, z21.s\n"
-                        "fmin z24.s, p7/m, z24.s, z15.s\n"
-                        "uzp1 z6.s, z22.s, z23.s\n"
-                        "st1w z4.s, p0, [c_ptr2]\n"
-                        "uzp2 z7.s, z22.s, z23.s\n"
-                        "fmax z25.s, p7/m, z25.s, z14.s\n"
-                        "fmax z26.s, p7/m, z26.s, z14.s\n"
-                        "st1w z5.s, p0, [c_ptr3]\n"
-                        "fmax z27.s, p7/m, z27.s, z14.s\n"
-                        "fmax z28.s, p7/m, z28.s, z14.s\n"
-                        "fmin z25.s, p7/m, z25.s, z15.s\n"
-                        "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "fmin z26.s, p7/m, z26.s, z15.s\n"
-                        "fmin z27.s, p7/m, z27.s, z15.s\n"
-                        "fmin z28.s, p7/m, z28.s, z15.s\n"
-                        "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
-                        "uzp1 z8.s, z24.s, z25.s\n"
-                        "uzp2 z9.s, z24.s, z25.s\n"
-                        "uzp1 z10.s, z26.s, z27.s\n"
-                        "uzp2 z11.s, z26.s, z27.s\n"
-                        "st1w z8.s, p0, [c_ptr4]\n"
-                        "fmax z29.s, p7/m, z29.s, z14.s\n"
-                        "fmax z30.s, p7/m, z30.s, z14.s\n"
-                        "fmax z31.s, p7/m, z31.s, z14.s\n"
-                        "st1w z9.s, p0, [c_ptr5]\n"
-                        "fmin z29.s, p7/m, z29.s, z15.s\n"
-                        "fmin z30.s, p7/m, z30.s, z15.s\n"
-                        "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n"
-                        "fmin z31.s, p7/m, z31.s, z15.s\n"
-                        "uzp1 z12.s, z28.s, z29.s\n"
-                        "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n"
-                        "uzp2 z13.s, z28.s, z29.s\n"
-                        "uzp1 z14.s, z30.s, z31.s\n"
-                        "uzp2 z15.s, z30.s, z31.s\n"
-                        "st1w z12.s, p0, [c_ptr6]\n"
-                        "st1w z13.s, p0, [c_ptr7]\n"
-                        "st1w z14.s, p1, [c_ptr6, #1, MUL VL]\n"
-                        "st1w z15.s, p1, [c_ptr7, #1, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq a_ptr4\n"
-                        ".unreq a_ptr5\n"
-                        ".unreq a_ptr6\n"
-                        ".unreq a_ptr7\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        ".unreq c_ptr4\n"
-                        ".unreq c_ptr5\n"
-                        ".unreq c_ptr6\n"
-                        ".unreq c_ptr7\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc", "memory"
-                    );
-                    break;
-            }
-
-        }
-    }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp
index c500f43fe0..0150ce8fd9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,37 +10,43 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
  */
 #pragma once
-
 #ifdef __ARM_FEATURE_SVE
 
-#include <cstdint>
 #include "../std_transforms_sve.hpp"
 
+#define ARGLIST  \
+   unsigned int, const unsigned int *, \
+   IndirectInputArg<int8_t>, \
+   size_t, size_t, \
+   const int8_t *, \
+   IndirectOutputArg<int8_t>, \
+   const Requantize32 *, const int32_t *, unsigned int
+
 namespace arm_gemm
 {
 
 // Actual kernel implementations
-void sve_hybrid_s8s32_dot_4VLx4(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+void sve_hybrid_s8qa_dot_4x4VL( ARGLIST );
 
-class hybrid_s8s32_dot_4VLx4
+class cls_sve_hybrid_s8qa_dot_4x4VL
 {
 public:
     typedef int8_t operand_type;
-    typedef int32_t result_type;
+    typedef int8_t result_type;
 
-    typedef void (*kern_type)(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+    typedef void (*kern_type)( ARGLIST );
 
     /* Kernel blocking parameters */
     static constexpr unsigned int out_height()
@@ -60,30 +66,20 @@ public:
 
     static constexpr bool supports_accumulate()
     {
-        return true;
-    }
-
-    static constexpr bool supports_bias()
-    {
-        return false;
-    }
-
-    static constexpr bool supports_activation()
-    {
         return false;
     }
 
     StdTransformsSVE<operand_type, result_type, 4, 4, 4> transforms = {};
 
     // Default to the generic kernel
-    kern_type kernel=sve_hybrid_s8s32_dot_4VLx4;
+    kern_type kernel=sve_hybrid_s8qa_dot_4x4VL;
 
-    hybrid_s8s32_dot_4VLx4(const CPUInfo *)
+    cls_sve_hybrid_s8qa_dot_4x4VL(const CPUInfo *)
     {
-
     }
 };
 
 } // namespace arm_gemm
 
+#undef ARGLIST
 #endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp
new file mode 100644
index 0000000000..2b1448bd65
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp
@@ -0,0 +1,1602 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void sve_hybrid_s8qa_dot_4x4VL (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+    size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg,
+    const Requantize32 *qp, const int32_t *col_bias, unsigned int
+)
+{
+    struct KernelArgs { // Scalar arguments the asm below loads via %[offsetof_*] offsets from args_ptr (presumably &ka; operand list not visible here)
+        unsigned int num_strings = {}; // copy of the num_strings parameter; the asm string loop compares its counter against this
+        const unsigned int *string_lengths = {}; // copy of the string_lengths parameter; the asm loads one 32-bit length per string index
+        size_t N = {}; // copy of the N parameter; the asm column loop counts this down
+        const int8_t *B_ptr = {}; // copy of the B_ptr parameter; the asm's ld1b loads read from it and advance it with addvl
+        size_t output_offset = {}; // indirect output: output_arg.indirect.offset; direct output: output_arg.direct.stride
+        size_t input_initial_col = {}; // indirect input only: A_arg.indirect.start_col, added to the row pointers of the first string; stays 0 for direct input
+        size_t input_offset = {}; // indirect input: A_arg.indirect.start_row; direct input: A_arg.direct.stride
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    if (qp->c_offset > qp->minval) {
+        flags |= 0x20;
+    }
+    __asm__ __volatile__(
+      "ptrue p2.b\n"
+      "1:"  // Row loop
+      "cmp %x[M], #0x4\n"
+      "bge 46f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 31f\n"
+      "beq 16f\n"
+      "mov z11.s, #0x0\n"
+      "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov z12.s, #0x0\n"
+      "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x10, %x[col_bias]\n"
+      "mov z13.s, #0x0\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "bic %x[flags], %x[flags], #0x80000000\n"
+      "mov z14.s, #0x0\n"
+      "mov z15.b, #0x1\n"
+      "tbz %x[flags], #2, 2f\n"
+      "ldr x9, [%x[output_ptr], #0x0]\n"
+      "add x9, x9, x19\n"
+      "b 3f\n"
+      "2:"  // Height 1: setup direct output
+      "mov x9, %x[output_ptr]\n"
+      "3:"  // Height 1: Column loop
+      "mov z16.s, #0x0\n"
+      "mov x19, #0x0\n"
+      "mov z17.s, #0x0\n"
+      "whilelt p1.b, x19, x12\n"
+      "mov z18.s, #0x0\n"
+      "mov z19.s, #0x0\n"
+      "4:"  // Height 1: setup done
+      "mov x28, #0x0\n"
+      "5:"  // Height 1: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w27, [x20, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 6f\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "cbnz x28, 7f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "b 7f\n"
+      "6:"  // Height 1: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "7:"  // Height 1: input setup done
+      "cmp x27, #0x10\n"
+      "ble 10f\n"
+      "8:"  // Height 1: Multiply loop: Main loop head
+      "ld1b { z4.b }, p2/Z, [x11]\n"
+      "whilelt p0.b, XZR, x27\n"
+      "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "ld1rqb { z0.b }, p0/Z, [x26]\n"
+      "sdot z16.s, z4.b, z0.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "add x26, x26, #0x10\n"
+      "sdot z17.s, z5.b, z0.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+      "sdot z18.s, z6.b, z0.b[0]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+      "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+      "sdot z19.s, z7.b, z0.b[0]\n"
+      "sdot z16.s, z8.b, z0.b[1]\n"
+      "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
+      "addvl x11, x11, #16\n"
+      "sdot z17.s, z9.b, z0.b[1]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+      "sdot z18.s, z10.b, z0.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+      "sdot z19.s, z4.b, z0.b[1]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+      "sdot z16.s, z5.b, z0.b[2]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+      "sdot z17.s, z6.b, z0.b[2]\n"
+      "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+      "sdot z18.s, z7.b, z0.b[2]\n"
+      "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+      "sdot z19.s, z8.b, z0.b[2]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+      "sdot z16.s, z9.b, z0.b[3]\n"
+      "sdot z17.s, z10.b, z0.b[3]\n"
+      "sdot z18.s, z4.b, z0.b[3]\n"
+      "sdot z19.s, z5.b, z0.b[3]\n"
+      "tbnz %x[flags], #31, 9f\n"
+      "sdot z11.s, z0.b, z15.b\n"
+      "9:"  // Height 1: Multiply loop: unique 1: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "sub x27, x27, #0x10\n"
+      "cmp x27, #0x10\n"
+      "bgt 8b\n"
+      "10:"  // Height 1: Multiply loop: Single iteration only
+      "ld1b { z6.b }, p2/Z, [x11]\n"
+      "whilelt p0.b, XZR, x27\n"
+      "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "subs x27, x27, #0x4\n"
+      "ld1rqb { z0.b }, p0/Z, [x26]\n"
+      "sdot z16.s, z6.b, z0.b[0]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "add x26, x26, #0x10\n"
+      "sdot z17.s, z7.b, z0.b[0]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "addvl x11, x11, #4\n"
+      "sdot z18.s, z8.b, z0.b[0]\n"
+      "sdot z19.s, z9.b, z0.b[0]\n"
+      "ble 11f\n"
+      "ld1b { z10.b }, p2/Z, [x11]\n"
+      "sdot z16.s, z10.b, z0.b[1]\n"
+      "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "subs x27, x27, #0x4\n"
+      "sdot z17.s, z4.b, z0.b[1]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "sdot z18.s, z5.b, z0.b[1]\n"
+      "addvl x11, x11, #4\n"
+      "sdot z19.s, z6.b, z0.b[1]\n"
+      "ble 11f\n"
+      "ld1b { z7.b }, p2/Z, [x11]\n"
+      "sdot z16.s, z7.b, z0.b[2]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "subs x27, x27, #0x4\n"
+      "sdot z17.s, z8.b, z0.b[2]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "sdot z18.s, z9.b, z0.b[2]\n"
+      "addvl x11, x11, #4\n"
+      "sdot z19.s, z10.b, z0.b[2]\n"
+      "ble 11f\n"
+      "ld1b { z4.b }, p2/Z, [x11]\n"
+      "sdot z16.s, z4.b, z0.b[3]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "sdot z17.s, z5.b, z0.b[3]\n"
+      "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "addvl x11, x11, #4\n"
+      "sdot z18.s, z6.b, z0.b[3]\n"
+      "sdot z19.s, z7.b, z0.b[3]\n"
+      "11:"  // Height 1: Multiply loop: multiply skip
+      "tbnz %x[flags], #31, 12f\n"
+      "sdot z11.s, z0.b, z15.b\n"
+      "12:"  // Height 1: Multiply loop: unique 2: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "add x28, x28, #0x1\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x28, x19\n"
+      "bne 5b\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "tbnz %x[flags], #31, 13f\n"
+      "add x19, %x[qp], %[b_offset]\n"
+      "ld1rw { z1.s }, p2/Z, [x19]\n"
+      "neg z1.s, p2/M, z1.s\n"
+      "mov x19, #0x4\n"
+      "whilelt p0.s, XZR, x19\n"
+      "saddv d11, p0, z11.s\n"
+      "mov z11.s, z11.s[0]\n"
+      "mul z11.s, p2/M, z11.s, z1.s\n"
+      "13:"  // Height 1: skip row sum fixup
+      "add z16.s, z16.s, z11.s\n"
+      "ld1w { z0.s }, p2/Z, [x10]\n"
+      "orr %x[flags], %x[flags], #0x80000000\n"
+      "add z17.s, z17.s, z11.s\n"
+      "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "add z18.s, z18.s, z11.s\n"
+      "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+      "add x19, %x[qp], %[per_layer_mul]\n"
+      "add z19.s, z19.s, z11.s\n"
+      "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+      "addvl x10, x10, #4\n"
+      "add z16.s, z16.s, z0.s\n"
+      "ld1rw { z0.s }, p2/Z, [x20]\n"
+      "add z17.s, z17.s, z1.s\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      "add z18.s, z18.s, z2.s\n"
+      "add z19.s, z19.s, z3.s\n"
+      ".inst 0x04a47610  // sqrdmulh z16.s, z16.s, z4.s\n"
+      ".inst 0x04a47631  // sqrdmulh z17.s, z17.s, z4.s\n"
+      ".inst 0x04a47652  // sqrdmulh z18.s, z18.s, z4.s\n"
+      ".inst 0x04a47673  // sqrdmulh z19.s, z19.s, z4.s\n"
+      "tbz %x[flags], #5, 14f\n"
+      "and z4.d, z16.d, z0.d\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "and z5.d, z17.d, z0.d\n"
+      "and z6.d, z18.d, z0.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "and z7.d, z19.d, z0.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z16.s, z16.s, z4.s\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "sqadd z17.s, z17.s, z5.s\n"
+      "sqadd z18.s, z18.s, z6.s\n"
+      "sqadd z19.s, z19.s, z7.s\n"
+      "14:"  // Height 1: no shift correction
+      ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
+      "add x19, %x[qp], %[c_offset]\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      ".inst 0x44828811  // srshl z17.s, p2/M, z17.s, z0.s\n"
+      "add x19, %x[qp], %[minval]\n"
+      ".inst 0x44828812  // srshl z18.s, p2/M, z18.s, z0.s\n"
+      "ld1rw { z5.s }, p2/Z, [x19]\n"
+      "add x19, %x[qp], %[maxval]\n"
+      ".inst 0x44828813  // srshl z19.s, p2/M, z19.s, z0.s\n"
+      "ld1rw { z6.s }, p2/Z, [x19]\n"
+      "add z16.s, z16.s, z4.s\n"
+      "add z17.s, z17.s, z4.s\n"
+      "add z18.s, z18.s, z4.s\n"
+      "add z19.s, z19.s, z4.s\n"
+      "smin z16.s, p2/M, z16.s, z6.s\n"
+      "smin z17.s, p2/M, z17.s, z6.s\n"
+      "smin z18.s, p2/M, z18.s, z6.s\n"
+      "smin z19.s, p2/M, z19.s, z6.s\n"
+      "smax z16.s, p2/M, z16.s, z5.s\n"
+      "smax z17.s, p2/M, z17.s, z5.s\n"
+      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "smax z19.s, p2/M, z19.s, z5.s\n"
+      "uzp1 z16.h, z16.h, z17.h\n"
+      "uzp1 z17.h, z18.h, z19.h\n"
+      "uzp1 z16.b, z16.b, z17.b\n"
+      "st1b { z16.b }, p1, [x9]\n"
+      "addvl x9, x9, #1\n"
+      "15:"  // Height 1: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x12, x12, x19\n"
+      "bgt 3b\n"
+      "b 62f\n"
+      "16:"  // Height 2
+      "mov z11.s, #0x0\n"
+      "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x10, %x[col_bias]\n"
+      "mov z12.s, #0x0\n"
+      "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "bic %x[flags], %x[flags], #0x80000000\n"
+      "mov z13.s, #0x0\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "mov z14.s, #0x0\n"
+      "mov z15.b, #0x1\n"
+      "tbz %x[flags], #2, 17f\n"
+      "ldr x9, [%x[output_ptr], #0x0]\n"
+      "ldr x25, [%x[output_ptr], #0x8]\n"
+      "add x9, x9, x19\n"
+      "add x25, x25, x19\n"
+      "b 18f\n"
+      "17:"  // Height 2: setup direct output
+      "mov x9, %x[output_ptr]\n"
+      "add x25, x9, x19\n"
+      "18:"  // Height 2: Column loop
+      "mov z16.s, #0x0\n"
+      "mov x19, #0x0\n"
+      "mov z17.s, #0x0\n"
+      "whilelt p1.b, x19, x12\n"
+      "mov z18.s, #0x0\n"
+      "mov z19.s, #0x0\n"
+      "mov z20.s, #0x0\n"
+      "mov z21.s, #0x0\n"
+      "mov z22.s, #0x0\n"
+      "mov z23.s, #0x0\n"
+      "19:"  // Height 2: setup done
+      "mov x28, #0x0\n"
+      "20:"  // Height 2: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w27, [x20, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 21f\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x24, [x20, #0x8]\n"
+      "cbnz x28, 22f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "b 22f\n"
+      "21:"  // Height 2: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "add x24, x26, x19\n"
+      "22:"  // Height 2: input setup done
+      "cmp x27, #0x10\n"
+      "ble 25f\n"
+      "23:"  // Height 2: Multiply loop: Main loop head
+      "ld1b { z4.b }, p2/Z, [x11]\n"
+      "whilelt p0.b, XZR, x27\n"
+      "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "ld1rqb { z0.b }, p0/Z, [x26]\n"
+      "sdot z16.s, z4.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "sdot z17.s, z5.b, z0.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "add x24, x24, #0x10\n"
+      "sdot z20.s, z4.b, z1.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "sdot z21.s, z5.b, z1.b[0]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+      "sdot z18.s, z6.b, z0.b[0]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+      "sdot z22.s, z6.b, z1.b[0]\n"
+      "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+      "sdot z19.s, z7.b, z0.b[0]\n"
+      "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
+      "addvl x11, x11, #16\n"
+      "sdot z23.s, z7.b, z1.b[0]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+      "sdot z16.s, z8.b, z0.b[1]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+      "sdot z20.s, z8.b, z1.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+      "sdot z17.s, z9.b, z0.b[1]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+      "sdot z21.s, z9.b, z1.b[1]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+      "sdot z18.s, z10.b, z0.b[1]\n"
+      "sdot z22.s, z10.b, z1.b[1]\n"
+      "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+      "sdot z19.s, z4.b, z0.b[1]\n"
+      "sdot z23.s, z4.b, z1.b[1]\n"
+      "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+      "sdot z16.s, z5.b, z0.b[2]\n"
+      "sdot z20.s, z5.b, z1.b[2]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+      "sdot z17.s, z6.b, z0.b[2]\n"
+      "sdot z21.s, z6.b, z1.b[2]\n"
+      "sdot z18.s, z7.b, z0.b[2]\n"
+      "sdot z22.s, z7.b, z1.b[2]\n"
+      "sdot z19.s, z8.b, z0.b[2]\n"
+      "sdot z23.s, z8.b, z1.b[2]\n"
+      "sdot z16.s, z9.b, z0.b[3]\n"
+      "sdot z20.s, z9.b, z1.b[3]\n"
+      "sdot z17.s, z10.b, z0.b[3]\n"
+      "sdot z21.s, z10.b, z1.b[3]\n"
+      "sdot z18.s, z4.b, z0.b[3]\n"
+      "sdot z22.s, z4.b, z1.b[3]\n"
+      "sdot z19.s, z5.b, z0.b[3]\n"
+      "sdot z23.s, z5.b, z1.b[3]\n"
+      "tbnz %x[flags], #31, 24f\n"
+      "sdot z11.s, z0.b, z15.b\n"
+      "sdot z12.s, z1.b, z15.b\n"
+      "24:"  // Height 2: Multiply loop: unique 3: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "sub x27, x27, #0x10\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "cmp x27, #0x10\n"
+      "bgt 23b\n"
+      "25:"  // Height 2: Multiply loop: Single iteration only
+      "ld1b { z6.b }, p2/Z, [x11]\n"
+      "whilelt p0.b, XZR, x27\n"
+      "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "subs x27, x27, #0x4\n"
+      "ld1rqb { z0.b }, p0/Z, [x26]\n"
+      "sdot z16.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "sdot z17.s, z7.b, z0.b[0]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "add x24, x24, #0x10\n"
+      "sdot z20.s, z6.b, z1.b[0]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "addvl x11, x11, #4\n"
+      "sdot z21.s, z7.b, z1.b[0]\n"
+      "sdot z18.s, z8.b, z0.b[0]\n"
+      "sdot z22.s, z8.b, z1.b[0]\n"
+      "sdot z19.s, z9.b, z0.b[0]\n"
+      "sdot z23.s, z9.b, z1.b[0]\n"
+      "ble 26f\n"
+      "ld1b { z10.b }, p2/Z, [x11]\n"
+      "sdot z16.s, z10.b, z0.b[1]\n"
+      "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "subs x27, x27, #0x4\n"
+      "sdot z20.s, z10.b, z1.b[1]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "sdot z17.s, z4.b, z0.b[1]\n"
+      "addvl x11, x11, #4\n"
+      "sdot z21.s, z4.b, z1.b[1]\n"
+      "sdot z18.s, z5.b, z0.b[1]\n"
+      "sdot z22.s, z5.b, z1.b[1]\n"
+      "sdot z19.s, z6.b, z0.b[1]\n"
+      "sdot z23.s, z6.b, z1.b[1]\n"
+      "ble 26f\n"
+      "ld1b { z7.b }, p2/Z, [x11]\n"
+      "sdot z16.s, z7.b, z0.b[2]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "subs x27, x27, #0x4\n"
+      "sdot z20.s, z7.b, z1.b[2]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "sdot z17.s, z8.b, z0.b[2]\n"
+      "addvl x11, x11, #4\n"
+      "sdot z21.s, z8.b, z1.b[2]\n"
+      "sdot z18.s, z9.b, z0.b[2]\n"
+      "sdot z22.s, z9.b, z1.b[2]\n"
+      "sdot z19.s, z10.b, z0.b[2]\n"
+      "sdot z23.s, z10.b, z1.b[2]\n"
+      "ble 26f\n"
+      "ld1b { z4.b }, p2/Z, [x11]\n"
+      "sdot z16.s, z4.b, z0.b[3]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "sdot z20.s, z4.b, z1.b[3]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "sdot z17.s, z5.b, z0.b[3]\n"
+      "addvl x11, x11, #4\n"
+      "sdot z21.s, z5.b, z1.b[3]\n"
+      "sdot z18.s, z6.b, z0.b[3]\n"
+      "sdot z22.s, z6.b, z1.b[3]\n"
+      "sdot z19.s, z7.b, z0.b[3]\n"
+      "sdot z23.s, z7.b, z1.b[3]\n"
+      "26:"  // Height 2: Multiply loop: multiply skip
+      "tbnz %x[flags], #31, 27f\n"
+      "sdot z11.s, z0.b, z15.b\n"
+      "sdot z12.s, z1.b, z15.b\n"
+      "27:"  // Height 2: Multiply loop: unique 4: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "add x28, x28, #0x1\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x28, x19\n"
+      "bne 20b\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "tbnz %x[flags], #31, 28f\n"
+      "add x19, %x[qp], %[b_offset]\n"
+      "ld1rw { z2.s }, p2/Z, [x19]\n"
+      "neg z2.s, p2/M, z2.s\n"
+      "mov x20, #0x4\n"
+      "mov x19, #0x4\n"
+      "whilelt p0.s, XZR, x20\n"
+      "saddv d11, p0, z11.s\n"
+      "whilelt p0.s, XZR, x19\n"
+      "saddv d12, p0, z12.s\n"
+      "mov z11.s, z11.s[0]\n"
+      "mov z12.s, z12.s[0]\n"
+      "mul z11.s, p2/M, z11.s, z2.s\n"
+      "mul z12.s, p2/M, z12.s, z2.s\n"
+      "28:"  // Height 2: skip row sum fixup
+      "add z16.s, z16.s, z11.s\n"
+      "ld1w { z0.s }, p2/Z, [x10]\n"
+      "orr %x[flags], %x[flags], #0x80000000\n"
+      "add z17.s, z17.s, z11.s\n"
+      "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "add z18.s, z18.s, z11.s\n"
+      "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+      "add x19, %x[qp], %[per_layer_mul]\n"
+      "add z19.s, z19.s, z11.s\n"
+      "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+      "addvl x10, x10, #4\n"
+      "add z20.s, z20.s, z12.s\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      "add z21.s, z21.s, z12.s\n"
+      "add z22.s, z22.s, z12.s\n"
+      "add z23.s, z23.s, z12.s\n"
+      "add z16.s, z16.s, z0.s\n"
+      "add z17.s, z17.s, z1.s\n"
+      "add z18.s, z18.s, z2.s\n"
+      "add z19.s, z19.s, z3.s\n"
+      "add z20.s, z20.s, z0.s\n"
+      "ld1rw { z0.s }, p2/Z, [x20]\n"
+      "add z21.s, z21.s, z1.s\n"
+      "add z22.s, z22.s, z2.s\n"
+      "add z23.s, z23.s, z3.s\n"
+      ".inst 0x04a47610  // sqrdmulh z16.s, z16.s, z4.s\n"
+      ".inst 0x04a47631  // sqrdmulh z17.s, z17.s, z4.s\n"
+      ".inst 0x04a47652  // sqrdmulh z18.s, z18.s, z4.s\n"
+      ".inst 0x04a47673  // sqrdmulh z19.s, z19.s, z4.s\n"
+      ".inst 0x04a47694  // sqrdmulh z20.s, z20.s, z4.s\n"
+      ".inst 0x04a476b5  // sqrdmulh z21.s, z21.s, z4.s\n"
+      ".inst 0x04a476d6  // sqrdmulh z22.s, z22.s, z4.s\n"
+      ".inst 0x04a476f7  // sqrdmulh z23.s, z23.s, z4.s\n"
+      "tbz %x[flags], #5, 29f\n"
+      "and z4.d, z16.d, z0.d\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "and z5.d, z17.d, z0.d\n"
+      "and z6.d, z18.d, z0.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "and z7.d, z19.d, z0.d\n"
+      "and z8.d, z20.d, z0.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "and z9.d, z21.d, z0.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "sqadd z16.s, z16.s, z4.s\n"
+      "and z10.d, z22.d, z0.d\n"
+      "asr z8.s, z8.s, #0x1f\n"
+      "and z4.d, z23.d, z0.d\n"
+      "asr z9.s, z9.s, #0x1f\n"
+      "sqadd z17.s, z17.s, z5.s\n"
+      "asr z10.s, z10.s, #0x1f\n"
+      "sqadd z18.s, z18.s, z6.s\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "sqadd z19.s, z19.s, z7.s\n"
+      "sqadd z20.s, z20.s, z8.s\n"
+      "sqadd z21.s, z21.s, z9.s\n"
+      "sqadd z22.s, z22.s, z10.s\n"
+      "sqadd z23.s, z23.s, z4.s\n"
+      "29:"  // Height 2: no shift correction
+      ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
+      "add x19, %x[qp], %[c_offset]\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      ".inst 0x44828811  // srshl z17.s, p2/M, z17.s, z0.s\n"
+      "add x19, %x[qp], %[minval]\n"
+      ".inst 0x44828812  // srshl z18.s, p2/M, z18.s, z0.s\n"
+      "ld1rw { z5.s }, p2/Z, [x19]\n"
+      "add x19, %x[qp], %[maxval]\n"
+      ".inst 0x44828813  // srshl z19.s, p2/M, z19.s, z0.s\n"
+      "ld1rw { z6.s }, p2/Z, [x19]\n"
+      ".inst 0x44828814  // srshl z20.s, p2/M, z20.s, z0.s\n"
+      "add z16.s, z16.s, z4.s\n"
+      "add z17.s, z17.s, z4.s\n"
+      "add z18.s, z18.s, z4.s\n"
+      "add z19.s, z19.s, z4.s\n"
+      "add z20.s, z20.s, z4.s\n"
+      "smin z16.s, p2/M, z16.s, z6.s\n"
+      "smin z17.s, p2/M, z17.s, z6.s\n"
+      "smin z18.s, p2/M, z18.s, z6.s\n"
+      "smin z19.s, p2/M, z19.s, z6.s\n"
+      "smax z16.s, p2/M, z16.s, z5.s\n"
+      "smax z17.s, p2/M, z17.s, z5.s\n"
+      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "smax z19.s, p2/M, z19.s, z5.s\n"
+      "smin z20.s, p2/M, z20.s, z6.s\n"
+      "uzp1 z16.h, z16.h, z17.h\n"
+      ".inst 0x44828815  // srshl z21.s, p2/M, z21.s, z0.s\n"
+      "uzp1 z17.h, z18.h, z19.h\n"
+      "smax z20.s, p2/M, z20.s, z5.s\n"
+      "uzp1 z16.b, z16.b, z17.b\n"
+      "st1b { z16.b }, p1, [x9]\n"
+      "add z21.s, z21.s, z4.s\n"
+      "addvl x9, x9, #1\n"
+      ".inst 0x44828816  // srshl z22.s, p2/M, z22.s, z0.s\n"
+      ".inst 0x44828817  // srshl z23.s, p2/M, z23.s, z0.s\n"
+      "smin z21.s, p2/M, z21.s, z6.s\n"
+      "add z22.s, z22.s, z4.s\n"
+      "add z23.s, z23.s, z4.s\n"
+      "smax z21.s, p2/M, z21.s, z5.s\n"
+      "smin z22.s, p2/M, z22.s, z6.s\n"
+      "smin z23.s, p2/M, z23.s, z6.s\n"
+      "uzp1 z20.h, z20.h, z21.h\n"
+      "smax z22.s, p2/M, z22.s, z5.s\n"
+      "smax z23.s, p2/M, z23.s, z5.s\n"
+      "uzp1 z21.h, z22.h, z23.h\n"
+      "uzp1 z20.b, z20.b, z21.b\n"
+      "st1b { z20.b }, p1, [x25]\n"
+      "addvl x25, x25, #1\n"
+      "30:"  // Height 2: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x12, x12, x19\n"
+      "bgt 18b\n"
+      "b 62f\n"
+      "31:"  // Height 3
+      "mov z11.s, #0x0\n"
+      "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x10, %x[col_bias]\n"
+      "mov z12.s, #0x0\n"
+      "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "bic %x[flags], %x[flags], #0x80000000\n"
+      "mov z13.s, #0x0\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "mov z14.s, #0x0\n"
+      "mov z15.b, #0x1\n"
+      "tbz %x[flags], #2, 32f\n"
+      "ldr x9, [%x[output_ptr], #0x0]\n"
+      "ldr x25, [%x[output_ptr], #0x8]\n"
+      "add x9, x9, x19\n"
+      "ldr x23, [%x[output_ptr], #0x10]\n"
+      "add x25, x25, x19\n"
+      "add x23, x23, x19\n"
+      "b 33f\n"
+      "32:"  // Height 3: setup direct output
+      "mov x9, %x[output_ptr]\n"
+      "add x25, x9, x19\n"
+      "add x23, x25, x19\n"
+      "33:"  // Height 3: Column loop
+      "mov z16.s, #0x0\n"
+      "mov x19, #0x0\n"
+      "mov z17.s, #0x0\n"
+      "whilelt p1.b, x19, x12\n"
+      "mov z18.s, #0x0\n"
+      "mov z19.s, #0x0\n"
+      "mov z20.s, #0x0\n"
+      "mov z21.s, #0x0\n"
+      "mov z22.s, #0x0\n"
+      "mov z23.s, #0x0\n"
+      "mov z24.s, #0x0\n"
+      "mov z25.s, #0x0\n"
+      "mov z26.s, #0x0\n"
+      "mov z27.s, #0x0\n"
+      "34:"  // Height 3: setup done
+      "mov x28, #0x0\n"
+      "35:"  // Height 3: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w27, [x20, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 36f\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x24, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
+      "cbnz x28, 37f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "add x22, x22, x19\n"
+      "b 37f\n"
+      "36:"  // Height 3: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "add x24, x26, x19\n"
+      "add x22, x24, x19\n"
+      "37:"  // Height 3: input setup done
+      "cmp x27, #0x10\n"
+      "ble 40f\n"
+      "38:"  // Height 3: Multiply loop: Main loop head
+      "ld1b { z4.b }, p2/Z, [x11]\n"
+      "whilelt p0.b, XZR, x27\n"
+      "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "ld1rqb { z0.b }, p0/Z, [x26]\n"
+      "sdot z16.s, z4.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "sdot z17.s, z5.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "sdot z20.s, z4.b, z1.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "add x22, x22, #0x10\n"
+      "sdot z24.s, z4.b, z2.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "sdot z21.s, z5.b, z1.b[0]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+      "sdot z25.s, z5.b, z2.b[0]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+      "sdot z18.s, z6.b, z0.b[0]\n"
+      "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+      "sdot z22.s, z6.b, z1.b[0]\n"
+      "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
+      "addvl x11, x11, #16\n"
+      "sdot z26.s, z6.b, z2.b[0]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+      "sdot z19.s, z7.b, z0.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+      "sdot z23.s, z7.b, z1.b[0]\n"
+      "sdot z27.s, z7.b, z2.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+      "sdot z16.s, z8.b, z0.b[1]\n"
+      "sdot z20.s, z8.b, z1.b[1]\n"
+      "sdot z24.s, z8.b, z2.b[1]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+      "sdot z17.s, z9.b, z0.b[1]\n"
+      "sdot z21.s, z9.b, z1.b[1]\n"
+      "sdot z25.s, z9.b, z2.b[1]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+      "sdot z18.s, z10.b, z0.b[1]\n"
+      "sdot z22.s, z10.b, z1.b[1]\n"
+      "sdot z26.s, z10.b, z2.b[1]\n"
+      "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+      "sdot z19.s, z4.b, z0.b[1]\n"
+      "sdot z23.s, z4.b, z1.b[1]\n"
+      "sdot z27.s, z4.b, z2.b[1]\n"
+      "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+      "sdot z16.s, z5.b, z0.b[2]\n"
+      "sdot z20.s, z5.b, z1.b[2]\n"
+      "sdot z24.s, z5.b, z2.b[2]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+      "sdot z17.s, z6.b, z0.b[2]\n"
+      "sdot z21.s, z6.b, z1.b[2]\n"
+      "sdot z25.s, z6.b, z2.b[2]\n"
+      "sdot z18.s, z7.b, z0.b[2]\n"
+      "sdot z22.s, z7.b, z1.b[2]\n"
+      "sdot z26.s, z7.b, z2.b[2]\n"
+      "sdot z19.s, z8.b, z0.b[2]\n"
+      "sdot z23.s, z8.b, z1.b[2]\n"
+      "sdot z27.s, z8.b, z2.b[2]\n"
+      "sdot z16.s, z9.b, z0.b[3]\n"
+      "sdot z20.s, z9.b, z1.b[3]\n"
+      "sdot z24.s, z9.b, z2.b[3]\n"
+      "sdot z17.s, z10.b, z0.b[3]\n"
+      "sdot z21.s, z10.b, z1.b[3]\n"
+      "sdot z25.s, z10.b, z2.b[3]\n"
+      "sdot z18.s, z4.b, z0.b[3]\n"
+      "sdot z22.s, z4.b, z1.b[3]\n"
+      "sdot z26.s, z4.b, z2.b[3]\n"
+      "sdot z19.s, z5.b, z0.b[3]\n"
+      "sdot z23.s, z5.b, z1.b[3]\n"
+      "sdot z27.s, z5.b, z2.b[3]\n"
+      "tbnz %x[flags], #31, 39f\n"
+      "sdot z11.s, z0.b, z15.b\n"
+      "sdot z12.s, z1.b, z15.b\n"
+      "sdot z13.s, z2.b, z15.b\n"
+      "39:"  // Height 3: Multiply loop: unique 5: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "sub x27, x27, #0x10\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "cmp x27, #0x10\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "bgt 38b\n"
+      "40:"  // Height 3: Multiply loop: Single iteration only
+      "ld1b { z6.b }, p2/Z, [x11]\n"
+      "whilelt p0.b, XZR, x27\n"
+      "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "subs x27, x27, #0x4\n"
+      "ld1rqb { z0.b }, p0/Z, [x26]\n"
+      "sdot z16.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "sdot z17.s, z7.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "sdot z20.s, z6.b, z1.b[0]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "add x22, x22, #0x10\n"
+      "sdot z24.s, z6.b, z2.b[0]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "addvl x11, x11, #4\n"
+      "sdot z21.s, z7.b, z1.b[0]\n"
+      "sdot z25.s, z7.b, z2.b[0]\n"
+      "sdot z18.s, z8.b, z0.b[0]\n"
+      "sdot z22.s, z8.b, z1.b[0]\n"
+      "sdot z26.s, z8.b, z2.b[0]\n"
+      "sdot z19.s, z9.b, z0.b[0]\n"
+      "sdot z23.s, z9.b, z1.b[0]\n"
+      "sdot z27.s, z9.b, z2.b[0]\n"
+      "ble 41f\n"
+      "ld1b { z10.b }, p2/Z, [x11]\n"
+      "sdot z16.s, z10.b, z0.b[1]\n"
+      "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "subs x27, x27, #0x4\n"
+      "sdot z20.s, z10.b, z1.b[1]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "sdot z24.s, z10.b, z2.b[1]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "addvl x11, x11, #4\n"
+      "sdot z17.s, z4.b, z0.b[1]\n"
+      "sdot z21.s, z4.b, z1.b[1]\n"
+      "sdot z25.s, z4.b, z2.b[1]\n"
+      "sdot z18.s, z5.b, z0.b[1]\n"
+      "sdot z22.s, z5.b, z1.b[1]\n"
+      "sdot z26.s, z5.b, z2.b[1]\n"
+      "sdot z19.s, z6.b, z0.b[1]\n"
+      "sdot z23.s, z6.b, z1.b[1]\n"
+      "sdot z27.s, z6.b, z2.b[1]\n"
+      "ble 41f\n"
+      "ld1b { z7.b }, p2/Z, [x11]\n"
+      "sdot z16.s, z7.b, z0.b[2]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "subs x27, x27, #0x4\n"
+      "sdot z20.s, z7.b, z1.b[2]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "sdot z24.s, z7.b, z2.b[2]\n"
+      "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "addvl x11, x11, #4\n"
+      "sdot z17.s, z8.b, z0.b[2]\n"
+      "sdot z21.s, z8.b, z1.b[2]\n"
+      "sdot z25.s, z8.b, z2.b[2]\n"
+      "sdot z18.s, z9.b, z0.b[2]\n"
+      "sdot z22.s, z9.b, z1.b[2]\n"
+      "sdot z26.s, z9.b, z2.b[2]\n"
+      "sdot z19.s, z10.b, z0.b[2]\n"
+      "sdot z23.s, z10.b, z1.b[2]\n"
+      "sdot z27.s, z10.b, z2.b[2]\n"
+      "ble 41f\n"
+      "ld1b { z4.b }, p2/Z, [x11]\n"
+      "sdot z16.s, z4.b, z0.b[3]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "sdot z20.s, z4.b, z1.b[3]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "sdot z24.s, z4.b, z2.b[3]\n"
+      "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "addvl x11, x11, #4\n"
+      "sdot z17.s, z5.b, z0.b[3]\n"
+      "sdot z21.s, z5.b, z1.b[3]\n"
+      "sdot z25.s, z5.b, z2.b[3]\n"
+      "sdot z18.s, z6.b, z0.b[3]\n"
+      "sdot z22.s, z6.b, z1.b[3]\n"
+      "sdot z26.s, z6.b, z2.b[3]\n"
+      "sdot z19.s, z7.b, z0.b[3]\n"
+      "sdot z23.s, z7.b, z1.b[3]\n"
+      "sdot z27.s, z7.b, z2.b[3]\n"
+      "41:"  // Height 3: Multiply loop: multiply skip
+      "tbnz %x[flags], #31, 42f\n"
+      "sdot z11.s, z0.b, z15.b\n"
+      "sdot z12.s, z1.b, z15.b\n"
+      "sdot z13.s, z2.b, z15.b\n"
+      "42:"  // Height 3: Multiply loop: unique 6: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "add x28, x28, #0x1\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x28, x19\n"
+      "bne 35b\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "tbnz %x[flags], #31, 43f\n"
+      "add x19, %x[qp], %[b_offset]\n"
+      "ld1rw { z3.s }, p2/Z, [x19]\n"
+      "neg z3.s, p2/M, z3.s\n"
+      "mov x20, #0x4\n"
+      "mov x19, #0x4\n"
+      "whilelt p0.s, XZR, x20\n"
+      "saddv d11, p0, z11.s\n"
+      "whilelt p0.s, XZR, x19\n"
+      "saddv d12, p0, z12.s\n"
+      "mov x19, #0x4\n"
+      "mov z11.s, z11.s[0]\n"
+      "whilelt p0.s, XZR, x19\n"
+      "mov z12.s, z12.s[0]\n"
+      "saddv d13, p0, z13.s\n"
+      "mul z11.s, p2/M, z11.s, z3.s\n"
+      "mul z12.s, p2/M, z12.s, z3.s\n"
+      "mov z13.s, z13.s[0]\n"
+      "mul z13.s, p2/M, z13.s, z3.s\n"
+      "43:"  // Height 3: skip row sum fixup
+      "add z16.s, z16.s, z11.s\n"
+      "ld1w { z0.s }, p2/Z, [x10]\n"
+      "orr %x[flags], %x[flags], #0x80000000\n"
+      "add z17.s, z17.s, z11.s\n"
+      "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "add z18.s, z18.s, z11.s\n"
+      "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+      "add x19, %x[qp], %[per_layer_mul]\n"
+      "add z19.s, z19.s, z11.s\n"
+      "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+      "addvl x10, x10, #4\n"
+      "add z20.s, z20.s, z12.s\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      "add z21.s, z21.s, z12.s\n"
+      "add z22.s, z22.s, z12.s\n"
+      "add z23.s, z23.s, z12.s\n"
+      "add z24.s, z24.s, z13.s\n"
+      "add z25.s, z25.s, z13.s\n"
+      "add z26.s, z26.s, z13.s\n"
+      "add z27.s, z27.s, z13.s\n"
+      "add z16.s, z16.s, z0.s\n"
+      "add z17.s, z17.s, z1.s\n"
+      "add z18.s, z18.s, z2.s\n"
+      "add z19.s, z19.s, z3.s\n"
+      "add z20.s, z20.s, z0.s\n"
+      "add z21.s, z21.s, z1.s\n"
+      "add z22.s, z22.s, z2.s\n"
+      "add z23.s, z23.s, z3.s\n"
+      "add z24.s, z24.s, z0.s\n"
+      "ld1rw { z0.s }, p2/Z, [x20]\n"
+      "add z25.s, z25.s, z1.s\n"
+      "add z26.s, z26.s, z2.s\n"
+      "add z27.s, z27.s, z3.s\n"
+      ".inst 0x04a47610  // sqrdmulh z16.s, z16.s, z4.s\n"
+      ".inst 0x04a47631  // sqrdmulh z17.s, z17.s, z4.s\n"
+      ".inst 0x04a47652  // sqrdmulh z18.s, z18.s, z4.s\n"
+      ".inst 0x04a47673  // sqrdmulh z19.s, z19.s, z4.s\n"
+      ".inst 0x04a47694  // sqrdmulh z20.s, z20.s, z4.s\n"
+      ".inst 0x04a476b5  // sqrdmulh z21.s, z21.s, z4.s\n"
+      ".inst 0x04a476d6  // sqrdmulh z22.s, z22.s, z4.s\n"
+      ".inst 0x04a476f7  // sqrdmulh z23.s, z23.s, z4.s\n"
+      ".inst 0x04a47718  // sqrdmulh z24.s, z24.s, z4.s\n"
+      ".inst 0x04a47739  // sqrdmulh z25.s, z25.s, z4.s\n"
+      ".inst 0x04a4775a  // sqrdmulh z26.s, z26.s, z4.s\n"
+      ".inst 0x04a4777b  // sqrdmulh z27.s, z27.s, z4.s\n"
+      "tbz %x[flags], #5, 44f\n"
+      "and z4.d, z16.d, z0.d\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "and z5.d, z17.d, z0.d\n"
+      "and z6.d, z18.d, z0.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "and z7.d, z19.d, z0.d\n"
+      "and z8.d, z20.d, z0.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "and z9.d, z21.d, z0.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "sqadd z16.s, z16.s, z4.s\n"
+      "and z10.d, z22.d, z0.d\n"
+      "asr z8.s, z8.s, #0x1f\n"
+      "and z4.d, z23.d, z0.d\n"
+      "asr z9.s, z9.s, #0x1f\n"
+      "sqadd z17.s, z17.s, z5.s\n"
+      "asr z10.s, z10.s, #0x1f\n"
+      "sqadd z18.s, z18.s, z6.s\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "and z5.d, z24.d, z0.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "sqadd z19.s, z19.s, z7.s\n"
+      "sqadd z20.s, z20.s, z8.s\n"
+      "sqadd z21.s, z21.s, z9.s\n"
+      "sqadd z22.s, z22.s, z10.s\n"
+      "sqadd z23.s, z23.s, z4.s\n"
+      "and z6.d, z25.d, z0.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z24.s, z24.s, z5.s\n"
+      "and z7.d, z26.d, z0.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "and z8.d, z27.d, z0.d\n"
+      "sqadd z25.s, z25.s, z6.s\n"
+      "asr z8.s, z8.s, #0x1f\n"
+      "sqadd z26.s, z26.s, z7.s\n"
+      "sqadd z27.s, z27.s, z8.s\n"
+      "44:"  // Height 3: no shift correction
+      ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
+      "add x19, %x[qp], %[c_offset]\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      ".inst 0x44828811  // srshl z17.s, p2/M, z17.s, z0.s\n"
+      "add x19, %x[qp], %[minval]\n"
+      ".inst 0x44828812  // srshl z18.s, p2/M, z18.s, z0.s\n"
+      "ld1rw { z5.s }, p2/Z, [x19]\n"
+      "add x19, %x[qp], %[maxval]\n"
+      ".inst 0x44828813  // srshl z19.s, p2/M, z19.s, z0.s\n"
+      "ld1rw { z6.s }, p2/Z, [x19]\n"
+      ".inst 0x44828814  // srshl z20.s, p2/M, z20.s, z0.s\n"
+      "add z16.s, z16.s, z4.s\n"
+      "add z17.s, z17.s, z4.s\n"
+      "add z18.s, z18.s, z4.s\n"
+      "add z19.s, z19.s, z4.s\n"
+      "add z20.s, z20.s, z4.s\n"
+      "smin z16.s, p2/M, z16.s, z6.s\n"
+      "smin z17.s, p2/M, z17.s, z6.s\n"
+      "smin z18.s, p2/M, z18.s, z6.s\n"
+      "smin z19.s, p2/M, z19.s, z6.s\n"
+      "smax z16.s, p2/M, z16.s, z5.s\n"
+      "smax z17.s, p2/M, z17.s, z5.s\n"
+      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "smax z19.s, p2/M, z19.s, z5.s\n"
+      "smin z20.s, p2/M, z20.s, z6.s\n"
+      "uzp1 z16.h, z16.h, z17.h\n"
+      ".inst 0x44828815  // srshl z21.s, p2/M, z21.s, z0.s\n"
+      "uzp1 z17.h, z18.h, z19.h\n"
+      "smax z20.s, p2/M, z20.s, z5.s\n"
+      "uzp1 z16.b, z16.b, z17.b\n"
+      "st1b { z16.b }, p1, [x9]\n"
+      "add z21.s, z21.s, z4.s\n"
+      "addvl x9, x9, #1\n"
+      ".inst 0x44828816  // srshl z22.s, p2/M, z22.s, z0.s\n"
+      ".inst 0x44828817  // srshl z23.s, p2/M, z23.s, z0.s\n"
+      ".inst 0x44828818  // srshl z24.s, p2/M, z24.s, z0.s\n"
+      "smin z21.s, p2/M, z21.s, z6.s\n"
+      ".inst 0x44828819  // srshl z25.s, p2/M, z25.s, z0.s\n"
+      "add z22.s, z22.s, z4.s\n"
+      "add z23.s, z23.s, z4.s\n"
+      "add z24.s, z24.s, z4.s\n"
+      "add z25.s, z25.s, z4.s\n"
+      "smax z21.s, p2/M, z21.s, z5.s\n"
+      "smin z22.s, p2/M, z22.s, z6.s\n"
+      "smin z23.s, p2/M, z23.s, z6.s\n"
+      "smin z24.s, p2/M, z24.s, z6.s\n"
+      "uzp1 z20.h, z20.h, z21.h\n"
+      "smax z22.s, p2/M, z22.s, z5.s\n"
+      "smax z23.s, p2/M, z23.s, z5.s\n"
+      "smax z24.s, p2/M, z24.s, z5.s\n"
+      "smin z25.s, p2/M, z25.s, z6.s\n"
+      ".inst 0x4482881a  // srshl z26.s, p2/M, z26.s, z0.s\n"
+      "uzp1 z21.h, z22.h, z23.h\n"
+      ".inst 0x4482881b  // srshl z27.s, p2/M, z27.s, z0.s\n"
+      "uzp1 z20.b, z20.b, z21.b\n"
+      "st1b { z20.b }, p1, [x25]\n"
+      "add z26.s, z26.s, z4.s\n"
+      "addvl x25, x25, #1\n"
+      "add z27.s, z27.s, z4.s\n"
+      "smax z25.s, p2/M, z25.s, z5.s\n"
+      "smin z26.s, p2/M, z26.s, z6.s\n"
+      "smin z27.s, p2/M, z27.s, z6.s\n"
+      "uzp1 z24.h, z24.h, z25.h\n"
+      "smax z26.s, p2/M, z26.s, z5.s\n"
+      "smax z27.s, p2/M, z27.s, z5.s\n"
+      "uzp1 z25.h, z26.h, z27.h\n"
+      "uzp1 z24.b, z24.b, z25.b\n"
+      "st1b { z24.b }, p1, [x23]\n"
+      "addvl x23, x23, #1\n"
+      "45:"  // Height 3: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x12, x12, x19\n"
+      "bgt 33b\n"
+      "b 62f\n"
+      "46:"  // Height 4
+      "mov z11.s, #0x0\n"
+      "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x10, %x[col_bias]\n"
+      "mov z12.s, #0x0\n"
+      "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "bic %x[flags], %x[flags], #0x80000000\n"
+      "mov z13.s, #0x0\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "mov z14.s, #0x0\n"
+      "mov z15.b, #0x1\n"
+      "tbz %x[flags], #2, 47f\n"
+      "ldr x9, [%x[output_ptr], #0x0]\n"
+      "ldr x25, [%x[output_ptr], #0x8]\n"
+      "add x9, x9, x19\n"
+      "ldr x23, [%x[output_ptr], #0x10]\n"
+      "ldr x21, [%x[output_ptr], #0x18]\n"
+      "add x25, x25, x19\n"
+      "add %x[output_ptr], %x[output_ptr], #0x20\n"
+      "add x23, x23, x19\n"
+      "add x21, x21, x19\n"
+      "b 48f\n"
+      "47:"  // Height 4: setup direct output
+      "mov x9, %x[output_ptr]\n"
+      "add x25, x9, x19\n"
+      "add x23, x25, x19\n"
+      "add x21, x23, x19\n"
+      "add %x[output_ptr], x21, x19\n"
+      "48:"  // Height 4: Column loop
+      "mov z16.s, #0x0\n"
+      "mov x19, #0x0\n"
+      "mov z17.s, #0x0\n"
+      "whilelt p1.b, x19, x12\n"
+      "mov z18.s, #0x0\n"
+      "mov z19.s, #0x0\n"
+      "mov z20.s, #0x0\n"
+      "mov z21.s, #0x0\n"
+      "mov z22.s, #0x0\n"
+      "mov z23.s, #0x0\n"
+      "mov z24.s, #0x0\n"
+      "mov z25.s, #0x0\n"
+      "mov z26.s, #0x0\n"
+      "mov z27.s, #0x0\n"
+      "mov z28.s, #0x0\n"
+      "mov z29.s, #0x0\n"
+      "mov z30.s, #0x0\n"
+      "mov z31.s, #0x0\n"
+      "49:"  // Height 4: setup done
+      "mov x28, #0x0\n"
+      "50:"  // Height 4: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w27, [x20, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 51f\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x24, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
+      "ldr x20, [x20, #0x18]\n"
+      "cbnz x28, 52f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "add x22, x22, x19\n"
+      "add x20, x20, x19\n"
+      "b 52f\n"
+      "51:"  // Height 4: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "add x24, x26, x19\n"
+      "add x22, x24, x19\n"
+      "add x20, x22, x19\n"
+      "52:"  // Height 4: input setup done
+      "cmp x27, #0x10\n"
+      "ble 55f\n"
+      "53:"  // Height 4: Multiply loop: Main loop head
+      "ld1b { z4.b }, p2/Z, [x11]\n"
+      "whilelt p0.b, XZR, x27\n"
+      "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "ld1rqb { z0.b }, p0/Z, [x26]\n"
+      "sdot z16.s, z4.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "sdot z17.s, z5.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "sdot z20.s, z4.b, z1.b[0]\n"
+      "ld1rqb { z3.b }, p0/Z, [x20]\n"
+      "add x22, x22, #0x10\n"
+      "sdot z24.s, z4.b, z2.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "add x20, x20, #0x10\n"
+      "sdot z21.s, z5.b, z1.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "sdot z25.s, z5.b, z2.b[0]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+      "sdot z28.s, z4.b, z3.b[0]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+      "sdot z29.s, z5.b, z3.b[0]\n"
+      "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+      "sdot z18.s, z6.b, z0.b[0]\n"
+      "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
+      "addvl x11, x11, #16\n"
+      "sdot z22.s, z6.b, z1.b[0]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+      "sdot z26.s, z6.b, z2.b[0]\n"
+      "sdot z30.s, z6.b, z3.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+      "sdot z19.s, z7.b, z0.b[0]\n"
+      "sdot z23.s, z7.b, z1.b[0]\n"
+      "sdot z27.s, z7.b, z2.b[0]\n"
+      "sdot z31.s, z7.b, z3.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+      "sdot z16.s, z8.b, z0.b[1]\n"
+      "sdot z20.s, z8.b, z1.b[1]\n"
+      "sdot z24.s, z8.b, z2.b[1]\n"
+      "sdot z28.s, z8.b, z3.b[1]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+      "sdot z17.s, z9.b, z0.b[1]\n"
+      "sdot z21.s, z9.b, z1.b[1]\n"
+      "sdot z25.s, z9.b, z2.b[1]\n"
+      "sdot z29.s, z9.b, z3.b[1]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+      "sdot z18.s, z10.b, z0.b[1]\n"
+      "sdot z22.s, z10.b, z1.b[1]\n"
+      "sdot z26.s, z10.b, z2.b[1]\n"
+      "sdot z30.s, z10.b, z3.b[1]\n"
+      "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+      "sdot z19.s, z4.b, z0.b[1]\n"
+      "sdot z23.s, z4.b, z1.b[1]\n"
+      "sdot z27.s, z4.b, z2.b[1]\n"
+      "sdot z31.s, z4.b, z3.b[1]\n"
+      "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+      "sdot z16.s, z5.b, z0.b[2]\n"
+      "sdot z20.s, z5.b, z1.b[2]\n"
+      "sdot z24.s, z5.b, z2.b[2]\n"
+      "sdot z28.s, z5.b, z3.b[2]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+      "sdot z17.s, z6.b, z0.b[2]\n"
+      "sdot z21.s, z6.b, z1.b[2]\n"
+      "sdot z25.s, z6.b, z2.b[2]\n"
+      "sdot z29.s, z6.b, z3.b[2]\n"
+      "sdot z18.s, z7.b, z0.b[2]\n"
+      "sdot z22.s, z7.b, z1.b[2]\n"
+      "sdot z26.s, z7.b, z2.b[2]\n"
+      "sdot z30.s, z7.b, z3.b[2]\n"
+      "sdot z19.s, z8.b, z0.b[2]\n"
+      "sdot z23.s, z8.b, z1.b[2]\n"
+      "sdot z27.s, z8.b, z2.b[2]\n"
+      "sdot z31.s, z8.b, z3.b[2]\n"
+      "sdot z16.s, z9.b, z0.b[3]\n"
+      "sdot z20.s, z9.b, z1.b[3]\n"
+      "sdot z24.s, z9.b, z2.b[3]\n"
+      "sdot z28.s, z9.b, z3.b[3]\n"
+      "sdot z17.s, z10.b, z0.b[3]\n"
+      "sdot z21.s, z10.b, z1.b[3]\n"
+      "sdot z25.s, z10.b, z2.b[3]\n"
+      "sdot z29.s, z10.b, z3.b[3]\n"
+      "sdot z18.s, z4.b, z0.b[3]\n"
+      "sdot z22.s, z4.b, z1.b[3]\n"
+      "sdot z26.s, z4.b, z2.b[3]\n"
+      "sdot z30.s, z4.b, z3.b[3]\n"
+      "sdot z19.s, z5.b, z0.b[3]\n"
+      "sdot z23.s, z5.b, z1.b[3]\n"
+      "sdot z27.s, z5.b, z2.b[3]\n"
+      "sdot z31.s, z5.b, z3.b[3]\n"
+      "tbnz %x[flags], #31, 54f\n"
+      "sdot z11.s, z0.b, z15.b\n"
+      "sdot z12.s, z1.b, z15.b\n"
+      "sdot z13.s, z2.b, z15.b\n"
+      "sdot z14.s, z3.b, z15.b\n"
+      "54:"  // Height 4: Multiply loop: unique 7: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "sub x27, x27, #0x10\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "cmp x27, #0x10\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "bgt 53b\n"
+      "55:"  // Height 4: Multiply loop: Single iteration only
+      "ld1b { z6.b }, p2/Z, [x11]\n"
+      "whilelt p0.b, XZR, x27\n"
+      "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "subs x27, x27, #0x4\n"
+      "ld1rqb { z0.b }, p0/Z, [x26]\n"
+      "sdot z16.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "sdot z17.s, z7.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "sdot z20.s, z6.b, z1.b[0]\n"
+      "ld1rqb { z3.b }, p0/Z, [x20]\n"
+      "add x22, x22, #0x10\n"
+      "sdot z24.s, z6.b, z2.b[0]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "add x20, x20, #0x10\n"
+      "sdot z21.s, z7.b, z1.b[0]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "addvl x11, x11, #4\n"
+      "sdot z28.s, z6.b, z3.b[0]\n"
+      "sdot z25.s, z7.b, z2.b[0]\n"
+      "sdot z29.s, z7.b, z3.b[0]\n"
+      "sdot z18.s, z8.b, z0.b[0]\n"
+      "sdot z22.s, z8.b, z1.b[0]\n"
+      "sdot z26.s, z8.b, z2.b[0]\n"
+      "sdot z30.s, z8.b, z3.b[0]\n"
+      "sdot z19.s, z9.b, z0.b[0]\n"
+      "sdot z23.s, z9.b, z1.b[0]\n"
+      "sdot z27.s, z9.b, z2.b[0]\n"
+      "sdot z31.s, z9.b, z3.b[0]\n"
+      "ble 56f\n"
+      "ld1b { z10.b }, p2/Z, [x11]\n"
+      "sdot z16.s, z10.b, z0.b[1]\n"
+      "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "subs x27, x27, #0x4\n"
+      "sdot z20.s, z10.b, z1.b[1]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "sdot z24.s, z10.b, z2.b[1]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "addvl x11, x11, #4\n"
+      "sdot z28.s, z10.b, z3.b[1]\n"
+      "sdot z17.s, z4.b, z0.b[1]\n"
+      "sdot z21.s, z4.b, z1.b[1]\n"
+      "sdot z25.s, z4.b, z2.b[1]\n"
+      "sdot z29.s, z4.b, z3.b[1]\n"
+      "sdot z18.s, z5.b, z0.b[1]\n"
+      "sdot z22.s, z5.b, z1.b[1]\n"
+      "sdot z26.s, z5.b, z2.b[1]\n"
+      "sdot z30.s, z5.b, z3.b[1]\n"
+      "sdot z19.s, z6.b, z0.b[1]\n"
+      "sdot z23.s, z6.b, z1.b[1]\n"
+      "sdot z27.s, z6.b, z2.b[1]\n"
+      "sdot z31.s, z6.b, z3.b[1]\n"
+      "ble 56f\n"
+      "ld1b { z7.b }, p2/Z, [x11]\n"
+      "sdot z16.s, z7.b, z0.b[2]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "subs x27, x27, #0x4\n"
+      "sdot z20.s, z7.b, z1.b[2]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "sdot z24.s, z7.b, z2.b[2]\n"
+      "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "addvl x11, x11, #4\n"
+      "sdot z28.s, z7.b, z3.b[2]\n"
+      "sdot z17.s, z8.b, z0.b[2]\n"
+      "sdot z21.s, z8.b, z1.b[2]\n"
+      "sdot z25.s, z8.b, z2.b[2]\n"
+      "sdot z29.s, z8.b, z3.b[2]\n"
+      "sdot z18.s, z9.b, z0.b[2]\n"
+      "sdot z22.s, z9.b, z1.b[2]\n"
+      "sdot z26.s, z9.b, z2.b[2]\n"
+      "sdot z30.s, z9.b, z3.b[2]\n"
+      "sdot z19.s, z10.b, z0.b[2]\n"
+      "sdot z23.s, z10.b, z1.b[2]\n"
+      "sdot z27.s, z10.b, z2.b[2]\n"
+      "sdot z31.s, z10.b, z3.b[2]\n"
+      "ble 56f\n"
+      "ld1b { z4.b }, p2/Z, [x11]\n"
+      "sdot z16.s, z4.b, z0.b[3]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "sdot z20.s, z4.b, z1.b[3]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "sdot z24.s, z4.b, z2.b[3]\n"
+      "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "addvl x11, x11, #4\n"
+      "sdot z28.s, z4.b, z3.b[3]\n"
+      "sdot z17.s, z5.b, z0.b[3]\n"
+      "sdot z21.s, z5.b, z1.b[3]\n"
+      "sdot z25.s, z5.b, z2.b[3]\n"
+      "sdot z29.s, z5.b, z3.b[3]\n"
+      "sdot z18.s, z6.b, z0.b[3]\n"
+      "sdot z22.s, z6.b, z1.b[3]\n"
+      "sdot z26.s, z6.b, z2.b[3]\n"
+      "sdot z30.s, z6.b, z3.b[3]\n"
+      "sdot z19.s, z7.b, z0.b[3]\n"
+      "sdot z23.s, z7.b, z1.b[3]\n"
+      "sdot z27.s, z7.b, z2.b[3]\n"
+      "sdot z31.s, z7.b, z3.b[3]\n"
+      "56:"  // Height 4: Multiply loop: multiply skip
+      "tbnz %x[flags], #31, 57f\n"
+      "sdot z11.s, z0.b, z15.b\n"
+      "sdot z12.s, z1.b, z15.b\n"
+      "sdot z13.s, z2.b, z15.b\n"
+      "sdot z14.s, z3.b, z15.b\n"
+      "57:"  // Height 4: Multiply loop: unique 8: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "add x28, x28, #0x1\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x28, x19\n"
+      "bne 50b\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "prfm pstl1keep, [x21, #0x0]\n"
+      "tbnz %x[flags], #31, 58f\n"
+      "add x19, %x[qp], %[b_offset]\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      "neg z4.s, p2/M, z4.s\n"
+      "mov x20, #0x4\n"
+      "mov x19, #0x4\n"
+      "whilelt p0.s, XZR, x20\n"
+      "saddv d11, p0, z11.s\n"
+      "whilelt p0.s, XZR, x19\n"
+      "saddv d12, p0, z12.s\n"
+      "mov x19, #0x4\n"
+      "mov z11.s, z11.s[0]\n"
+      "whilelt p0.s, XZR, x19\n"
+      "mov x19, #0x4\n"
+      "mov z12.s, z12.s[0]\n"
+      "saddv d13, p0, z13.s\n"
+      "whilelt p0.s, XZR, x19\n"
+      "mul z11.s, p2/M, z11.s, z4.s\n"
+      "saddv d14, p0, z14.s\n"
+      "mul z12.s, p2/M, z12.s, z4.s\n"
+      "mov z13.s, z13.s[0]\n"
+      "mul z13.s, p2/M, z13.s, z4.s\n"
+      "mov z14.s, z14.s[0]\n"
+      "mul z14.s, p2/M, z14.s, z4.s\n"
+      "58:"  // Height 4: skip row sum fixup
+      "add z16.s, z16.s, z11.s\n"
+      "ld1w { z0.s }, p2/Z, [x10]\n"
+      "orr %x[flags], %x[flags], #0x80000000\n"
+      "add z17.s, z17.s, z11.s\n"
+      "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "add z18.s, z18.s, z11.s\n"
+      "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+      "add x19, %x[qp], %[per_layer_mul]\n"
+      "add z19.s, z19.s, z11.s\n"
+      "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+      "addvl x10, x10, #4\n"
+      "add z20.s, z20.s, z12.s\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      "add z21.s, z21.s, z12.s\n"
+      "add z22.s, z22.s, z12.s\n"
+      "add z23.s, z23.s, z12.s\n"
+      "add z24.s, z24.s, z13.s\n"
+      "add z25.s, z25.s, z13.s\n"
+      "add z26.s, z26.s, z13.s\n"
+      "add z27.s, z27.s, z13.s\n"
+      "add z28.s, z28.s, z14.s\n"
+      "add z29.s, z29.s, z14.s\n"
+      "add z30.s, z30.s, z14.s\n"
+      "add z31.s, z31.s, z14.s\n"
+      "add z16.s, z16.s, z0.s\n"
+      "add z17.s, z17.s, z1.s\n"
+      "add z18.s, z18.s, z2.s\n"
+      "add z19.s, z19.s, z3.s\n"
+      "add z20.s, z20.s, z0.s\n"
+      "add z21.s, z21.s, z1.s\n"
+      "add z22.s, z22.s, z2.s\n"
+      "add z23.s, z23.s, z3.s\n"
+      "add z24.s, z24.s, z0.s\n"
+      "add z25.s, z25.s, z1.s\n"
+      "add z26.s, z26.s, z2.s\n"
+      "add z27.s, z27.s, z3.s\n"
+      "add z28.s, z28.s, z0.s\n"
+      "ld1rw { z0.s }, p2/Z, [x20]\n"
+      "add z29.s, z29.s, z1.s\n"
+      "add z30.s, z30.s, z2.s\n"
+      "add z31.s, z31.s, z3.s\n"
+      ".inst 0x04a47610  // sqrdmulh z16.s, z16.s, z4.s\n"
+      ".inst 0x04a47631  // sqrdmulh z17.s, z17.s, z4.s\n"
+      ".inst 0x04a47652  // sqrdmulh z18.s, z18.s, z4.s\n"
+      ".inst 0x04a47673  // sqrdmulh z19.s, z19.s, z4.s\n"
+      ".inst 0x04a47694  // sqrdmulh z20.s, z20.s, z4.s\n"
+      ".inst 0x04a476b5  // sqrdmulh z21.s, z21.s, z4.s\n"
+      ".inst 0x04a476d6  // sqrdmulh z22.s, z22.s, z4.s\n"
+      ".inst 0x04a476f7  // sqrdmulh z23.s, z23.s, z4.s\n"
+      ".inst 0x04a47718  // sqrdmulh z24.s, z24.s, z4.s\n"
+      ".inst 0x04a47739  // sqrdmulh z25.s, z25.s, z4.s\n"
+      ".inst 0x04a4775a  // sqrdmulh z26.s, z26.s, z4.s\n"
+      ".inst 0x04a4777b  // sqrdmulh z27.s, z27.s, z4.s\n"
+      ".inst 0x04a4779c  // sqrdmulh z28.s, z28.s, z4.s\n"
+      ".inst 0x04a477bd  // sqrdmulh z29.s, z29.s, z4.s\n"
+      ".inst 0x04a477de  // sqrdmulh z30.s, z30.s, z4.s\n"
+      ".inst 0x04a477ff  // sqrdmulh z31.s, z31.s, z4.s\n"
+      "tbz %x[flags], #5, 59f\n"
+      "and z4.d, z16.d, z0.d\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "and z5.d, z17.d, z0.d\n"
+      "and z6.d, z18.d, z0.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "and z7.d, z19.d, z0.d\n"
+      "and z8.d, z20.d, z0.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "and z9.d, z21.d, z0.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "sqadd z16.s, z16.s, z4.s\n"
+      "and z10.d, z22.d, z0.d\n"
+      "asr z8.s, z8.s, #0x1f\n"
+      "and z4.d, z23.d, z0.d\n"
+      "asr z9.s, z9.s, #0x1f\n"
+      "sqadd z17.s, z17.s, z5.s\n"
+      "asr z10.s, z10.s, #0x1f\n"
+      "sqadd z18.s, z18.s, z6.s\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "and z5.d, z24.d, z0.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "sqadd z19.s, z19.s, z7.s\n"
+      "sqadd z20.s, z20.s, z8.s\n"
+      "sqadd z21.s, z21.s, z9.s\n"
+      "sqadd z22.s, z22.s, z10.s\n"
+      "sqadd z23.s, z23.s, z4.s\n"
+      "and z6.d, z25.d, z0.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z24.s, z24.s, z5.s\n"
+      "and z7.d, z26.d, z0.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "and z8.d, z27.d, z0.d\n"
+      "and z9.d, z28.d, z0.d\n"
+      "asr z8.s, z8.s, #0x1f\n"
+      "sqadd z25.s, z25.s, z6.s\n"
+      "and z10.d, z29.d, z0.d\n"
+      "asr z9.s, z9.s, #0x1f\n"
+      "and z4.d, z30.d, z0.d\n"
+      "asr z10.s, z10.s, #0x1f\n"
+      "sqadd z26.s, z26.s, z7.s\n"
+      "and z5.d, z31.d, z0.d\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "sqadd z27.s, z27.s, z8.s\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "sqadd z28.s, z28.s, z9.s\n"
+      "sqadd z29.s, z29.s, z10.s\n"
+      "sqadd z30.s, z30.s, z4.s\n"
+      "sqadd z31.s, z31.s, z5.s\n"
+      "59:"  // Height 4: no shift correction
+      ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
+      "add x19, %x[qp], %[c_offset]\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      ".inst 0x44828811  // srshl z17.s, p2/M, z17.s, z0.s\n"
+      "add x19, %x[qp], %[minval]\n"
+      ".inst 0x44828812  // srshl z18.s, p2/M, z18.s, z0.s\n"
+      "ld1rw { z5.s }, p2/Z, [x19]\n"
+      "add x19, %x[qp], %[maxval]\n"
+      ".inst 0x44828813  // srshl z19.s, p2/M, z19.s, z0.s\n"
+      "ld1rw { z6.s }, p2/Z, [x19]\n"
+      ".inst 0x44828814  // srshl z20.s, p2/M, z20.s, z0.s\n"
+      "add z16.s, z16.s, z4.s\n"
+      "add z17.s, z17.s, z4.s\n"
+      "add z18.s, z18.s, z4.s\n"
+      "add z19.s, z19.s, z4.s\n"
+      "add z20.s, z20.s, z4.s\n"
+      "smin z16.s, p2/M, z16.s, z6.s\n"
+      "smin z17.s, p2/M, z17.s, z6.s\n"
+      "smin z18.s, p2/M, z18.s, z6.s\n"
+      "smin z19.s, p2/M, z19.s, z6.s\n"
+      "smax z16.s, p2/M, z16.s, z5.s\n"
+      "smax z17.s, p2/M, z17.s, z5.s\n"
+      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "smax z19.s, p2/M, z19.s, z5.s\n"
+      "smin z20.s, p2/M, z20.s, z6.s\n"
+      "uzp1 z16.h, z16.h, z17.h\n"
+      ".inst 0x44828815  // srshl z21.s, p2/M, z21.s, z0.s\n"
+      "uzp1 z17.h, z18.h, z19.h\n"
+      "smax z20.s, p2/M, z20.s, z5.s\n"
+      "uzp1 z16.b, z16.b, z17.b\n"
+      "st1b { z16.b }, p1, [x9]\n"
+      "add z21.s, z21.s, z4.s\n"
+      "addvl x9, x9, #1\n"
+      ".inst 0x44828816  // srshl z22.s, p2/M, z22.s, z0.s\n"
+      ".inst 0x44828817  // srshl z23.s, p2/M, z23.s, z0.s\n"
+      ".inst 0x44828818  // srshl z24.s, p2/M, z24.s, z0.s\n"
+      "smin z21.s, p2/M, z21.s, z6.s\n"
+      ".inst 0x44828819  // srshl z25.s, p2/M, z25.s, z0.s\n"
+      "add z22.s, z22.s, z4.s\n"
+      "add z23.s, z23.s, z4.s\n"
+      "add z24.s, z24.s, z4.s\n"
+      "add z25.s, z25.s, z4.s\n"
+      "smax z21.s, p2/M, z21.s, z5.s\n"
+      "smin z22.s, p2/M, z22.s, z6.s\n"
+      "smin z23.s, p2/M, z23.s, z6.s\n"
+      "smin z24.s, p2/M, z24.s, z6.s\n"
+      "uzp1 z20.h, z20.h, z21.h\n"
+      "smax z22.s, p2/M, z22.s, z5.s\n"
+      "smax z23.s, p2/M, z23.s, z5.s\n"
+      "smax z24.s, p2/M, z24.s, z5.s\n"
+      "smin z25.s, p2/M, z25.s, z6.s\n"
+      ".inst 0x4482881a  // srshl z26.s, p2/M, z26.s, z0.s\n"
+      "uzp1 z21.h, z22.h, z23.h\n"
+      ".inst 0x4482881b  // srshl z27.s, p2/M, z27.s, z0.s\n"
+      "uzp1 z20.b, z20.b, z21.b\n"
+      "st1b { z20.b }, p1, [x25]\n"
+      "add z26.s, z26.s, z4.s\n"
+      "addvl x25, x25, #1\n"
+      "add z27.s, z27.s, z4.s\n"
+      "smax z25.s, p2/M, z25.s, z5.s\n"
+      ".inst 0x4482881c  // srshl z28.s, p2/M, z28.s, z0.s\n"
+      "smin z26.s, p2/M, z26.s, z6.s\n"
+      "smin z27.s, p2/M, z27.s, z6.s\n"
+      "uzp1 z24.h, z24.h, z25.h\n"
+      "add z28.s, z28.s, z4.s\n"
+      "smax z26.s, p2/M, z26.s, z5.s\n"
+      "smax z27.s, p2/M, z27.s, z5.s\n"
+      "smin z28.s, p2/M, z28.s, z6.s\n"
+      ".inst 0x4482881d  // srshl z29.s, p2/M, z29.s, z0.s\n"
+      ".inst 0x4482881e  // srshl z30.s, p2/M, z30.s, z0.s\n"
+      "uzp1 z25.h, z26.h, z27.h\n"
+      "smax z28.s, p2/M, z28.s, z5.s\n"
+      "add z29.s, z29.s, z4.s\n"
+      "add z30.s, z30.s, z4.s\n"
+      "uzp1 z24.b, z24.b, z25.b\n"
+      "st1b { z24.b }, p1, [x23]\n"
+      "smin z29.s, p2/M, z29.s, z6.s\n"
+      "addvl x23, x23, #1\n"
+      "smin z30.s, p2/M, z30.s, z6.s\n"
+      ".inst 0x4482881f  // srshl z31.s, p2/M, z31.s, z0.s\n"
+      "smax z29.s, p2/M, z29.s, z5.s\n"
+      "add z31.s, z31.s, z4.s\n"
+      "smax z30.s, p2/M, z30.s, z5.s\n"
+      "uzp1 z28.h, z28.h, z29.h\n"
+      "smin z31.s, p2/M, z31.s, z6.s\n"
+      "smax z31.s, p2/M, z31.s, z5.s\n"
+      "uzp1 z29.h, z30.h, z31.h\n"
+      "uzp1 z28.b, z28.b, z29.b\n"
+      "st1b { z28.b }, p1, [x21]\n"
+      "addvl x21, x21, #1\n"
+      "60:"  // Height 4: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x12, x12, x19\n"
+      "bgt 48b\n"
+      "subs %x[M], %x[M], #0x4\n"
+      "beq 62f\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "tbz %x[flags], #3, 61f\n"
+      "add x20, x20, #0x4\n"
+      "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "b 1b\n"
+      "61:"  // Update direct input
+      "mov x19, #0x4\n"
+      "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+      "b 1b\n"
+      "62:"  // Exit
+
+      : [M] "+r" (M), [flags] "+r" (flags), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+      : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+      : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+    );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp
new file mode 100644
index 0000000000..d8562898aa
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __ARM_FEATURE_SVE
+
+#include "../std_transforms_sve.hpp"
+
+#define ARGLIST  /* kernel parameter types; names below are those used by the definition in generic.cpp */ \
+   unsigned int, const unsigned int *, /* num_strings, string_lengths */ \
+   IndirectInputArg<int8_t>, /* A_arg */ \
+   size_t, size_t, /* M, N */ \
+   const int8_t *, /* B_ptr */ \
+   IndirectOutputArg<int8_t>, /* output_arg */ \
+   const Requantize32 *, const int32_t *, unsigned int /* qp, col_bias, col_base */
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations (defined in sve_hybrid_s8qs_dot_6x4VL/generic.cpp)
+void sve_hybrid_s8qs_dot_6x4VL( ARGLIST );
+
+class cls_sve_hybrid_s8qs_dot_6x4VL // Descriptor bundling the kernel's types, blocking parameters, transforms and entry point
+{
+public:
+    typedef int8_t operand_type; // Element type of the A and B inputs (both int8_t in ARGLIST)
+    typedef int8_t result_type; // Element type of the output (IndirectOutputArg<int8_t> in ARGLIST)
+
+    typedef void (*kern_type)( ARGLIST ); // Function-pointer type with the same signature as sve_hybrid_s8qs_dot_6x4VL
+
+    /* Kernel blocking parameters (the "6x4VL" in the name: height 6, width 4 vector lengths) */
+    static constexpr unsigned int out_height()
+    {
+        return 6;
+    }
+
+    static unsigned int out_width() // Not constexpr, unlike the other blocking parameters
+    {
+        return get_vector_length<int32_t>() * 4; // presumably int32_t lanes per SVE vector, times four vectors - TODO confirm
+    }
+
+    static constexpr unsigned int k_unroll()
+    {
+        return 4; // NOTE(review): presumably the K granularity of one sdot (four int8 products per lane) - confirm
+    }
+
+    static constexpr bool supports_accumulate()
+    {
+        return false; // This kernel reports that it does not support accumulation
+    }
+
+    StdTransformsSVE<operand_type, result_type, 6, 4, 4> transforms = {}; // 6, 4, 4 equal out_height(), the vector multiple in out_width() and k_unroll(); presumably must stay in sync
+
+    // Default to the generic kernel; the constructor below never replaces it
+    kern_type kernel=sve_hybrid_s8qs_dot_6x4VL;
+
+    cls_sve_hybrid_s8qs_dot_6x4VL(const CPUInfo *) // The CPUInfo pointer is accepted but unused
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
new file mode 100644
index 0000000000..4a4af6356c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
@@ -0,0 +1,2770 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void sve_hybrid_s8qs_dot_6x4VL (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+    size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg,
+    const Requantize32 *qp, const int32_t *col_bias, unsigned int col_base
+)
+{
+    struct KernelArgs {
+        const int32_t *multiplier_ptr = {};
+        const int32_t *shift_ptr = {};
+        unsigned int num_strings = {};
+        const unsigned int *string_lengths = {};
+        size_t N = {};
+        const int8_t *B_ptr = {};
+        size_t output_offset = {};
+        size_t input_initial_col = {};
+        size_t input_offset = {};
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    if (qp->per_channel_requant) {
+        flags |= 0x10;
+        ka.multiplier_ptr=qp->per_channel_muls + col_base;
+        ka.shift_ptr=qp->per_channel_right_shifts + col_base;
+    }
+    if (qp->c_offset > qp->minval) {
+        flags |= 0x20;
+    }
+    __asm__ __volatile__(
+      "ptrue p2.b\n"
+      "1:"  // Row loop
+      "cmp %x[M], #0x6\n"
+      "bge 71f\n"
+      "cmp %x[M], #0x4\n"
+      "bgt 57f\n"
+      "beq 43f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 29f\n"
+      "beq 15f\n"
+      "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+      "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+      "mov x16, %x[col_bias]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 2f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19\n"
+      "b 3f\n"
+      "2:"  // Height 1: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "3:"  // Height 1: Column loop
+      "mov z8.s, #0x0\n"
+      "mov x19, #0x0\n"
+      "mov z9.s, #0x0\n"
+      "whilelt p1.b, x19, x15\n"
+      "mov z10.s, #0x0\n"
+      "mov z11.s, #0x0\n"
+      "4:"  // Height 1: setup done
+      "mov x12, #0x0\n"
+      "5:"  // Height 1: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 6f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "cbnz x12, 7f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "b 7f\n"
+      "6:"  // Height 1: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "7:"  // Height 1: input setup done
+      "cmp x11, #0x10\n"
+      "ble 9f\n"
+      "8:"  // Height 1: Multiply loop: Main loop head
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "sub x11, x11, #0x10\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "sdot z8.s, z6.b, z0.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "add x10, x10, #0x10\n"
+      "sdot z9.s, z7.b, z0.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "cmp x11, #0x10\n"
+      "sdot z10.s, z6.b, z0.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "sdot z11.s, z7.b, z0.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[1]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n"
+      "addvl x14, x14, #16\n"
+      "sdot z10.s, z6.b, z0.b[1]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[2]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[2]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[2]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[2]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[3]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[3]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[3]\n"
+      "sdot z11.s, z7.b, z0.b[3]\n"
+      "bgt 8b\n"
+      "9:"  // Height 1: Multiply loop: Single iteration only
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "sdot z8.s, z6.b, z0.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "add x10, x10, #0x10\n"
+      "sdot z9.s, z7.b, z0.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[0]\n"
+      "sdot z11.s, z7.b, z0.b[0]\n"
+      "ble 10f\n"
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "sdot z9.s, z7.b, z0.b[1]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[1]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z11.s, z7.b, z0.b[1]\n"
+      "ble 10f\n"
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[2]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "sdot z9.s, z7.b, z0.b[2]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[2]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z11.s, z7.b, z0.b[2]\n"
+      "ble 10f\n"
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[3]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[3]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[3]\n"
+      "sdot z11.s, z7.b, z0.b[3]\n"
+      "10:"  // Height 1: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 5b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "ld1w { z0.s }, p2/Z, [x16]\n"
+      "add z8.s, z8.s, z0.s\n"
+      "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n"
+      "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n"
+      "add z9.s, z9.s, z1.s\n"
+      "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n"
+      "addvl x16, x16, #4\n"
+      "add z10.s, z10.s, z2.s\n"
+      "add z11.s, z11.s, z3.s\n"
+      "tbz %x[flags], #4, 11f\n"
+      "ld1w { z0.s }, p2/Z, [x17]\n"
+      "ld1w { z4.s }, p2/Z, [x8]\n"
+      "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n"
+      "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n"
+      "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n"
+      "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n"
+      "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n"
+      "addvl x17, x17, #4\n"
+      "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n"
+      "addvl x8, x8, #4\n"
+      "b 12f\n"
+      "11:"  // Height 1: per layer parameters
+      "add x19, %x[qp], %[per_layer_right_shift]\n"
+      "ld1rw { z0.s }, p2/Z, [x19]\n"
+      "mov z1.d, z0.d\n"
+      "add x19, %x[qp], %[per_layer_mul]\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      "mov z2.d, z0.d\n"
+      "mov z3.d, z0.d\n"
+      "mov z5.d, z4.d\n"
+      "mov z6.d, z4.d\n"
+      "mov z7.d, z4.d\n"
+      "12:"  // Height 1: parameters loaded
+      ".inst 0x04a47508  // sqrdmulh z8.s, z8.s, z4.s\n"
+      ".inst 0x04a57529  // sqrdmulh z9.s, z9.s, z5.s\n"
+      ".inst 0x04a6754a  // sqrdmulh z10.s, z10.s, z6.s\n"
+      ".inst 0x04a7756b  // sqrdmulh z11.s, z11.s, z7.s\n"
+      "tbz %x[flags], #5, 13f\n"
+      "and z4.d, z8.d, z0.d\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "and z5.d, z9.d, z1.d\n"
+      "and z6.d, z10.d, z2.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "and z7.d, z11.d, z3.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z8.s, z8.s, z4.s\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "sqadd z9.s, z9.s, z5.s\n"
+      "sqadd z10.s, z10.s, z6.s\n"
+      "sqadd z11.s, z11.s, z7.s\n"
+      "13:"  // Height 1: no shift correction
+      ".inst 0x44828808  // srshl z8.s, p2/M, z8.s, z0.s\n"
+      "add x19, %x[qp], %[c_offset]\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      ".inst 0x44828829  // srshl z9.s, p2/M, z9.s, z1.s\n"
+      "add x19, %x[qp], %[minval]\n"
+      ".inst 0x4482884a  // srshl z10.s, p2/M, z10.s, z2.s\n"
+      "ld1rw { z5.s }, p2/Z, [x19]\n"
+      "add x19, %x[qp], %[maxval]\n"
+      ".inst 0x4482886b  // srshl z11.s, p2/M, z11.s, z3.s\n"
+      "ld1rw { z6.s }, p2/Z, [x19]\n"
+      "add z8.s, z8.s, z4.s\n"
+      "add z9.s, z9.s, z4.s\n"
+      "add z10.s, z10.s, z4.s\n"
+      "add z11.s, z11.s, z4.s\n"
+      "smin z8.s, p2/M, z8.s, z6.s\n"
+      "smin z9.s, p2/M, z9.s, z6.s\n"
+      "smin z10.s, p2/M, z10.s, z6.s\n"
+      "smin z11.s, p2/M, z11.s, z6.s\n"
+      "smax z8.s, p2/M, z8.s, z5.s\n"
+      "smax z9.s, p2/M, z9.s, z5.s\n"
+      "smax z10.s, p2/M, z10.s, z5.s\n"
+      "smax z11.s, p2/M, z11.s, z5.s\n"
+      "uzp1 z8.h, z8.h, z9.h\n"
+      "uzp1 z9.h, z10.h, z11.h\n"
+      "uzp1 z8.b, z8.b, z9.b\n"
+      "st1b { z8.b }, p1, [x13]\n"
+      "addvl x13, x13, #1\n"
+      "14:"  // Height 1: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x15, x15, x19\n"
+      "bgt 3b\n"
+      "b 86f\n"
+      "15:"  // Height 2
+      "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+      "mov x16, %x[col_bias]\n"
+      "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 16f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "add x9, x9, x19\n"
+      "b 17f\n"
+      "16:"  // Height 2: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19\n"
+      "17:"  // Height 2: Column loop
+      "mov z8.s, #0x0\n"
+      "mov x19, #0x0\n"
+      "mov z9.s, #0x0\n"
+      "whilelt p1.b, x19, x15\n"
+      "mov z10.s, #0x0\n"
+      "mov z11.s, #0x0\n"
+      "mov z12.s, #0x0\n"
+      "mov z13.s, #0x0\n"
+      "mov z14.s, #0x0\n"
+      "mov z15.s, #0x0\n"
+      "18:"  // Height 2: setup done
+      "mov x12, #0x0\n"
+      "19:"  // Height 2: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 20f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "cbnz x12, 21f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "b 21f\n"
+      "20:"  // Height 2: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "21:"  // Height 2: input setup done
+      "cmp x11, #0x10\n"
+      "ble 23f\n"
+      "22:"  // Height 2: Multiply loop: Main loop head
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "sub x11, x11, #0x10\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "sdot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "sdot z9.s, z7.b, z0.b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x28, x28, #0x10\n"
+      "sdot z12.s, z6.b, z1.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "cmp x11, #0x10\n"
+      "sdot z13.s, z7.b, z1.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "sdot z10.s, z6.b, z0.b[0]\n"
+      "sdot z14.s, z6.b, z1.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[0]\n"
+      "sdot z15.s, z7.b, z1.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[1]\n"
+      "sdot z12.s, z6.b, z1.b[1]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[1]\n"
+      "sdot z13.s, z7.b, z1.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n"
+      "addvl x14, x14, #16\n"
+      "sdot z10.s, z6.b, z0.b[1]\n"
+      "sdot z14.s, z6.b, z1.b[1]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[1]\n"
+      "sdot z15.s, z7.b, z1.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[2]\n"
+      "sdot z12.s, z6.b, z1.b[2]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[2]\n"
+      "sdot z13.s, z7.b, z1.b[2]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[2]\n"
+      "sdot z14.s, z6.b, z1.b[2]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[2]\n"
+      "sdot z15.s, z7.b, z1.b[2]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[3]\n"
+      "sdot z12.s, z6.b, z1.b[3]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[3]\n"
+      "sdot z13.s, z7.b, z1.b[3]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[3]\n"
+      "sdot z14.s, z6.b, z1.b[3]\n"
+      "sdot z11.s, z7.b, z0.b[3]\n"
+      "sdot z15.s, z7.b, z1.b[3]\n"
+      "bgt 22b\n"
+      "23:"  // Height 2: Multiply loop: Single iteration only
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "sdot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "sdot z9.s, z7.b, z0.b[0]\n"
+      "add x28, x28, #0x10\n"
+      "sdot z12.s, z6.b, z1.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "sdot z13.s, z7.b, z1.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[0]\n"
+      "sdot z14.s, z6.b, z1.b[0]\n"
+      "sdot z11.s, z7.b, z0.b[0]\n"
+      "sdot z15.s, z7.b, z1.b[0]\n"
+      "ble 24f\n"
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "sdot z12.s, z6.b, z1.b[1]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[1]\n"
+      "sdot z13.s, z7.b, z1.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[1]\n"
+      "sdot z14.s, z6.b, z1.b[1]\n"
+      "sdot z11.s, z7.b, z0.b[1]\n"
+      "sdot z15.s, z7.b, z1.b[1]\n"
+      "ble 24f\n"
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[2]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "sdot z12.s, z6.b, z1.b[2]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[2]\n"
+      "sdot z13.s, z7.b, z1.b[2]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[2]\n"
+      "sdot z14.s, z6.b, z1.b[2]\n"
+      "sdot z11.s, z7.b, z0.b[2]\n"
+      "sdot z15.s, z7.b, z1.b[2]\n"
+      "ble 24f\n"
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[3]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "sdot z12.s, z6.b, z1.b[3]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[3]\n"
+      "sdot z13.s, z7.b, z1.b[3]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[3]\n"
+      "sdot z14.s, z6.b, z1.b[3]\n"
+      "sdot z11.s, z7.b, z0.b[3]\n"
+      "sdot z15.s, z7.b, z1.b[3]\n"
+      "24:"  // Height 2: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 19b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "ld1w { z0.s }, p2/Z, [x16]\n"
+      "add z8.s, z8.s, z0.s\n"
+      "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n"
+      "add z12.s, z12.s, z0.s\n"
+      "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n"
+      "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n"
+      "add z9.s, z9.s, z1.s\n"
+      "addvl x16, x16, #4\n"
+      "add z13.s, z13.s, z1.s\n"
+      "add z10.s, z10.s, z2.s\n"
+      "add z11.s, z11.s, z3.s\n"
+      "add z14.s, z14.s, z2.s\n"
+      "add z15.s, z15.s, z3.s\n"
+      "tbz %x[flags], #4, 25f\n"
+      "ld1w { z0.s }, p2/Z, [x17]\n"
+      "ld1w { z4.s }, p2/Z, [x8]\n"
+      "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n"
+      "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n"
+      "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n"
+      "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n"
+      "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n"
+      "addvl x17, x17, #4\n"
+      "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n"
+      "addvl x8, x8, #4\n"
+      "b 26f\n"
+      "25:"  // Height 2: per layer parameters
+      "add x19, %x[qp], %[per_layer_right_shift]\n"
+      "ld1rw { z0.s }, p2/Z, [x19]\n"
+      "mov z1.d, z0.d\n"
+      "add x19, %x[qp], %[per_layer_mul]\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      "mov z2.d, z0.d\n"
+      "mov z3.d, z0.d\n"
+      "mov z5.d, z4.d\n"
+      "mov z6.d, z4.d\n"
+      "mov z7.d, z4.d\n"
+      "26:"  // Height 2: parameters loaded
+      ".inst 0x04a47508  // sqrdmulh z8.s, z8.s, z4.s\n"
+      ".inst 0x04a57529  // sqrdmulh z9.s, z9.s, z5.s\n"
+      ".inst 0x04a6754a  // sqrdmulh z10.s, z10.s, z6.s\n"
+      ".inst 0x04a7756b  // sqrdmulh z11.s, z11.s, z7.s\n"
+      ".inst 0x04a4758c  // sqrdmulh z12.s, z12.s, z4.s\n"
+      ".inst 0x04a575ad  // sqrdmulh z13.s, z13.s, z5.s\n"
+      ".inst 0x04a675ce  // sqrdmulh z14.s, z14.s, z6.s\n"
+      ".inst 0x04a775ef  // sqrdmulh z15.s, z15.s, z7.s\n"
+      "tbz %x[flags], #5, 27f\n"
+      "and z4.d, z8.d, z0.d\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "and z5.d, z9.d, z1.d\n"
+      "and z6.d, z10.d, z2.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "and z7.d, z11.d, z3.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z8.s, z8.s, z4.s\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "and z4.d, z12.d, z0.d\n"
+      "sqadd z9.s, z9.s, z5.s\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "sqadd z10.s, z10.s, z6.s\n"
+      "and z5.d, z13.d, z1.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "sqadd z11.s, z11.s, z7.s\n"
+      "and z6.d, z14.d, z2.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z12.s, z12.s, z4.s\n"
+      "and z7.d, z15.d, z3.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "sqadd z13.s, z13.s, z5.s\n"
+      "sqadd z14.s, z14.s, z6.s\n"
+      "sqadd z15.s, z15.s, z7.s\n"
+      "27:"  // Height 2: no shift correction
+      ".inst 0x44828808  // srshl z8.s, p2/M, z8.s, z0.s\n"
+      "add x19, %x[qp], %[c_offset]\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      ".inst 0x44828829  // srshl z9.s, p2/M, z9.s, z1.s\n"
+      "add x19, %x[qp], %[minval]\n"
+      ".inst 0x4482884a  // srshl z10.s, p2/M, z10.s, z2.s\n"
+      "ld1rw { z5.s }, p2/Z, [x19]\n"
+      "add x19, %x[qp], %[maxval]\n"
+      ".inst 0x4482886b  // srshl z11.s, p2/M, z11.s, z3.s\n"
+      "ld1rw { z6.s }, p2/Z, [x19]\n"
+      ".inst 0x4482880c  // srshl z12.s, p2/M, z12.s, z0.s\n"
+      "add z8.s, z8.s, z4.s\n"
+      "add z9.s, z9.s, z4.s\n"
+      "add z10.s, z10.s, z4.s\n"
+      "add z11.s, z11.s, z4.s\n"
+      "add z12.s, z12.s, z4.s\n"
+      "smin z8.s, p2/M, z8.s, z6.s\n"
+      "smin z9.s, p2/M, z9.s, z6.s\n"
+      "smin z10.s, p2/M, z10.s, z6.s\n"
+      "smin z11.s, p2/M, z11.s, z6.s\n"
+      "smax z8.s, p2/M, z8.s, z5.s\n"
+      "smax z9.s, p2/M, z9.s, z5.s\n"
+      "smax z10.s, p2/M, z10.s, z5.s\n"
+      "smax z11.s, p2/M, z11.s, z5.s\n"
+      "smin z12.s, p2/M, z12.s, z6.s\n"
+      "uzp1 z8.h, z8.h, z9.h\n"
+      ".inst 0x4482882d  // srshl z13.s, p2/M, z13.s, z1.s\n"
+      "uzp1 z9.h, z10.h, z11.h\n"
+      "smax z12.s, p2/M, z12.s, z5.s\n"
+      "uzp1 z8.b, z8.b, z9.b\n"
+      "st1b { z8.b }, p1, [x13]\n"
+      "add z13.s, z13.s, z4.s\n"
+      "addvl x13, x13, #1\n"
+      ".inst 0x4482884e  // srshl z14.s, p2/M, z14.s, z2.s\n"
+      ".inst 0x4482886f  // srshl z15.s, p2/M, z15.s, z3.s\n"
+      "smin z13.s, p2/M, z13.s, z6.s\n"
+      "add z14.s, z14.s, z4.s\n"
+      "add z15.s, z15.s, z4.s\n"
+      "smax z13.s, p2/M, z13.s, z5.s\n"
+      "smin z14.s, p2/M, z14.s, z6.s\n"
+      "smin z15.s, p2/M, z15.s, z6.s\n"
+      "uzp1 z12.h, z12.h, z13.h\n"
+      "smax z14.s, p2/M, z14.s, z5.s\n"
+      "smax z15.s, p2/M, z15.s, z5.s\n"
+      "uzp1 z13.h, z14.h, z15.h\n"
+      "uzp1 z12.b, z12.b, z13.b\n"
+      "st1b { z12.b }, p1, [x9]\n"
+      "addvl x9, x9, #1\n"
+      "28:"  // Height 2: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x15, x15, x19\n"
+      "bgt 17b\n"
+      "b 86f\n"
+      "29:"  // Height 3
+      "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+      "mov x16, %x[col_bias]\n"
+      "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 30f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19\n"
+      "add x27, x27, x19\n"
+      "b 31f\n"
+      "30:"  // Height 3: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19\n"
+      "add x27, x9, x19\n"
+      "31:"  // Height 3: Column loop
+      "mov z8.s, #0x0\n"
+      "mov x19, #0x0\n"
+      "mov z9.s, #0x0\n"
+      "whilelt p1.b, x19, x15\n"
+      "mov z10.s, #0x0\n"
+      "mov z11.s, #0x0\n"
+      "mov z12.s, #0x0\n"
+      "mov z13.s, #0x0\n"
+      "mov z14.s, #0x0\n"
+      "mov z15.s, #0x0\n"
+      "mov z16.s, #0x0\n"
+      "mov z17.s, #0x0\n"
+      "mov z18.s, #0x0\n"
+      "mov z19.s, #0x0\n"
+      "32:"  // Height 3: setup done
+      "mov x12, #0x0\n"
+      "33:"  // Height 3: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 34f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "cbnz x12, 35f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "add x26, x26, x19\n"
+      "b 35f\n"
+      "34:"  // Height 3: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "add x26, x28, x19\n"
+      "35:"  // Height 3: input setup done
+      "cmp x11, #0x10\n"
+      "ble 37f\n"
+      "36:"  // Height 3: Multiply loop: Main loop head
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "sub x11, x11, #0x10\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "sdot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "sdot z9.s, z7.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "sdot z12.s, z6.b, z1.b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      "sdot z16.s, z6.b, z2.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "cmp x11, #0x10\n"
+      "sdot z13.s, z7.b, z1.b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "sdot z17.s, z7.b, z2.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "sdot z10.s, z6.b, z0.b[0]\n"
+      "sdot z14.s, z6.b, z1.b[0]\n"
+      "sdot z18.s, z6.b, z2.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[0]\n"
+      "sdot z15.s, z7.b, z1.b[0]\n"
+      "sdot z19.s, z7.b, z2.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[1]\n"
+      "sdot z12.s, z6.b, z1.b[1]\n"
+      "sdot z16.s, z6.b, z2.b[1]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[1]\n"
+      "sdot z13.s, z7.b, z1.b[1]\n"
+      "sdot z17.s, z7.b, z2.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n"
+      "addvl x14, x14, #16\n"
+      "sdot z10.s, z6.b, z0.b[1]\n"
+      "sdot z14.s, z6.b, z1.b[1]\n"
+      "sdot z18.s, z6.b, z2.b[1]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[1]\n"
+      "sdot z15.s, z7.b, z1.b[1]\n"
+      "sdot z19.s, z7.b, z2.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[2]\n"
+      "sdot z12.s, z6.b, z1.b[2]\n"
+      "sdot z16.s, z6.b, z2.b[2]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[2]\n"
+      "sdot z13.s, z7.b, z1.b[2]\n"
+      "sdot z17.s, z7.b, z2.b[2]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[2]\n"
+      "sdot z14.s, z6.b, z1.b[2]\n"
+      "sdot z18.s, z6.b, z2.b[2]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[2]\n"
+      "sdot z15.s, z7.b, z1.b[2]\n"
+      "sdot z19.s, z7.b, z2.b[2]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[3]\n"
+      "sdot z12.s, z6.b, z1.b[3]\n"
+      "sdot z16.s, z6.b, z2.b[3]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[3]\n"
+      "sdot z13.s, z7.b, z1.b[3]\n"
+      "sdot z17.s, z7.b, z2.b[3]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[3]\n"
+      "sdot z14.s, z6.b, z1.b[3]\n"
+      "sdot z18.s, z6.b, z2.b[3]\n"
+      "sdot z11.s, z7.b, z0.b[3]\n"
+      "sdot z15.s, z7.b, z1.b[3]\n"
+      "sdot z19.s, z7.b, z2.b[3]\n"
+      "bgt 36b\n"
+      "37:"  // Height 3: Multiply loop: Single iteration only
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "sdot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "sdot z9.s, z7.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "sdot z12.s, z6.b, z1.b[0]\n"
+      "add x26, x26, #0x10\n"
+      "sdot z13.s, z7.b, z1.b[0]\n"
+      "sdot z16.s, z6.b, z2.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "sdot z17.s, z7.b, z2.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[0]\n"
+      "sdot z14.s, z6.b, z1.b[0]\n"
+      "sdot z18.s, z6.b, z2.b[0]\n"
+      "sdot z11.s, z7.b, z0.b[0]\n"
+      "sdot z15.s, z7.b, z1.b[0]\n"
+      "sdot z19.s, z7.b, z2.b[0]\n"
+      "ble 38f\n"
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "sdot z12.s, z6.b, z1.b[1]\n"
+      "sdot z16.s, z6.b, z2.b[1]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[1]\n"
+      "sdot z13.s, z7.b, z1.b[1]\n"
+      "sdot z17.s, z7.b, z2.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[1]\n"
+      "sdot z14.s, z6.b, z1.b[1]\n"
+      "sdot z18.s, z6.b, z2.b[1]\n"
+      "sdot z11.s, z7.b, z0.b[1]\n"
+      "sdot z15.s, z7.b, z1.b[1]\n"
+      "sdot z19.s, z7.b, z2.b[1]\n"
+      "ble 38f\n"
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[2]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "sdot z12.s, z6.b, z1.b[2]\n"
+      "sdot z16.s, z6.b, z2.b[2]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[2]\n"
+      "sdot z13.s, z7.b, z1.b[2]\n"
+      "sdot z17.s, z7.b, z2.b[2]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[2]\n"
+      "sdot z14.s, z6.b, z1.b[2]\n"
+      "sdot z18.s, z6.b, z2.b[2]\n"
+      "sdot z11.s, z7.b, z0.b[2]\n"
+      "sdot z15.s, z7.b, z1.b[2]\n"
+      "sdot z19.s, z7.b, z2.b[2]\n"
+      "ble 38f\n"
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[3]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "sdot z12.s, z6.b, z1.b[3]\n"
+      "sdot z16.s, z6.b, z2.b[3]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[3]\n"
+      "sdot z13.s, z7.b, z1.b[3]\n"
+      "sdot z17.s, z7.b, z2.b[3]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[3]\n"
+      "sdot z14.s, z6.b, z1.b[3]\n"
+      "sdot z18.s, z6.b, z2.b[3]\n"
+      "sdot z11.s, z7.b, z0.b[3]\n"
+      "sdot z15.s, z7.b, z1.b[3]\n"
+      "sdot z19.s, z7.b, z2.b[3]\n"
+      "38:"  // Height 3: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 33b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "ld1w { z0.s }, p2/Z, [x16]\n"
+      "add z8.s, z8.s, z0.s\n"
+      "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n"
+      "add z12.s, z12.s, z0.s\n"
+      "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n"
+      "add z16.s, z16.s, z0.s\n"
+      "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n"
+      "addvl x16, x16, #4\n"
+      "add z9.s, z9.s, z1.s\n"
+      "add z13.s, z13.s, z1.s\n"
+      "add z10.s, z10.s, z2.s\n"
+      "add z11.s, z11.s, z3.s\n"
+      "add z14.s, z14.s, z2.s\n"
+      "add z15.s, z15.s, z3.s\n"
+      "add z17.s, z17.s, z1.s\n"
+      "add z18.s, z18.s, z2.s\n"
+      "add z19.s, z19.s, z3.s\n"
+      "tbz %x[flags], #4, 39f\n"
+      "ld1w { z0.s }, p2/Z, [x17]\n"
+      "ld1w { z4.s }, p2/Z, [x8]\n"
+      "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n"
+      "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n"
+      "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n"
+      "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n"
+      "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n"
+      "addvl x17, x17, #4\n"
+      "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n"
+      "addvl x8, x8, #4\n"
+      "b 40f\n"
+      "39:"  // Height 3: per layer parameters
+      "add x19, %x[qp], %[per_layer_right_shift]\n"
+      "ld1rw { z0.s }, p2/Z, [x19]\n"
+      "mov z1.d, z0.d\n"
+      "add x19, %x[qp], %[per_layer_mul]\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      "mov z2.d, z0.d\n"
+      "mov z3.d, z0.d\n"
+      "mov z5.d, z4.d\n"
+      "mov z6.d, z4.d\n"
+      "mov z7.d, z4.d\n"
+      "40:"  // Height 3: parameters loaded
+      ".inst 0x04a47508  // sqrdmulh z8.s, z8.s, z4.s\n"
+      ".inst 0x04a57529  // sqrdmulh z9.s, z9.s, z5.s\n"
+      ".inst 0x04a6754a  // sqrdmulh z10.s, z10.s, z6.s\n"
+      ".inst 0x04a7756b  // sqrdmulh z11.s, z11.s, z7.s\n"
+      ".inst 0x04a4758c  // sqrdmulh z12.s, z12.s, z4.s\n"
+      ".inst 0x04a575ad  // sqrdmulh z13.s, z13.s, z5.s\n"
+      ".inst 0x04a675ce  // sqrdmulh z14.s, z14.s, z6.s\n"
+      ".inst 0x04a775ef  // sqrdmulh z15.s, z15.s, z7.s\n"
+      ".inst 0x04a47610  // sqrdmulh z16.s, z16.s, z4.s\n"
+      ".inst 0x04a57631  // sqrdmulh z17.s, z17.s, z5.s\n"
+      ".inst 0x04a67652  // sqrdmulh z18.s, z18.s, z6.s\n"
+      ".inst 0x04a77673  // sqrdmulh z19.s, z19.s, z7.s\n"
+      "tbz %x[flags], #5, 41f\n"
+      "and z4.d, z8.d, z0.d\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "and z5.d, z9.d, z1.d\n"
+      "and z6.d, z10.d, z2.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "and z7.d, z11.d, z3.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z8.s, z8.s, z4.s\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "and z4.d, z12.d, z0.d\n"
+      "sqadd z9.s, z9.s, z5.s\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "sqadd z10.s, z10.s, z6.s\n"
+      "and z5.d, z13.d, z1.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "sqadd z11.s, z11.s, z7.s\n"
+      "and z6.d, z14.d, z2.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z12.s, z12.s, z4.s\n"
+      "and z7.d, z15.d, z3.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "sqadd z13.s, z13.s, z5.s\n"
+      "and z4.d, z16.d, z0.d\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "sqadd z14.s, z14.s, z6.s\n"
+      "and z5.d, z17.d, z1.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "sqadd z15.s, z15.s, z7.s\n"
+      "and z6.d, z18.d, z2.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z16.s, z16.s, z4.s\n"
+      "and z7.d, z19.d, z3.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "sqadd z17.s, z17.s, z5.s\n"
+      "sqadd z18.s, z18.s, z6.s\n"
+      "sqadd z19.s, z19.s, z7.s\n"
+      "41:"  // Height 3: no shift correction
+      ".inst 0x44828808  // srshl z8.s, p2/M, z8.s, z0.s\n"
+      "add x19, %x[qp], %[c_offset]\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      ".inst 0x44828829  // srshl z9.s, p2/M, z9.s, z1.s\n"
+      "add x19, %x[qp], %[minval]\n"
+      ".inst 0x4482884a  // srshl z10.s, p2/M, z10.s, z2.s\n"
+      "ld1rw { z5.s }, p2/Z, [x19]\n"
+      "add x19, %x[qp], %[maxval]\n"
+      ".inst 0x4482886b  // srshl z11.s, p2/M, z11.s, z3.s\n"
+      "ld1rw { z6.s }, p2/Z, [x19]\n"
+      ".inst 0x4482880c  // srshl z12.s, p2/M, z12.s, z0.s\n"
+      "add z8.s, z8.s, z4.s\n"
+      "add z9.s, z9.s, z4.s\n"
+      "add z10.s, z10.s, z4.s\n"
+      "add z11.s, z11.s, z4.s\n"
+      "add z12.s, z12.s, z4.s\n"
+      "smin z8.s, p2/M, z8.s, z6.s\n"
+      "smin z9.s, p2/M, z9.s, z6.s\n"
+      "smin z10.s, p2/M, z10.s, z6.s\n"
+      "smin z11.s, p2/M, z11.s, z6.s\n"
+      "smax z8.s, p2/M, z8.s, z5.s\n"
+      "smax z9.s, p2/M, z9.s, z5.s\n"
+      "smax z10.s, p2/M, z10.s, z5.s\n"
+      "smax z11.s, p2/M, z11.s, z5.s\n"
+      "smin z12.s, p2/M, z12.s, z6.s\n"
+      "uzp1 z8.h, z8.h, z9.h\n"
+      ".inst 0x4482882d  // srshl z13.s, p2/M, z13.s, z1.s\n"
+      "uzp1 z9.h, z10.h, z11.h\n"
+      "smax z12.s, p2/M, z12.s, z5.s\n"
+      "uzp1 z8.b, z8.b, z9.b\n"
+      "st1b { z8.b }, p1, [x13]\n"
+      "add z13.s, z13.s, z4.s\n"
+      "addvl x13, x13, #1\n"
+      ".inst 0x4482884e  // srshl z14.s, p2/M, z14.s, z2.s\n"
+      ".inst 0x4482886f  // srshl z15.s, p2/M, z15.s, z3.s\n"
+      ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
+      "smin z13.s, p2/M, z13.s, z6.s\n"
+      ".inst 0x44828831  // srshl z17.s, p2/M, z17.s, z1.s\n"
+      "add z14.s, z14.s, z4.s\n"
+      "add z15.s, z15.s, z4.s\n"
+      "add z16.s, z16.s, z4.s\n"
+      "add z17.s, z17.s, z4.s\n"
+      "smax z13.s, p2/M, z13.s, z5.s\n"
+      "smin z14.s, p2/M, z14.s, z6.s\n"
+      "smin z15.s, p2/M, z15.s, z6.s\n"
+      "smin z16.s, p2/M, z16.s, z6.s\n"
+      "uzp1 z12.h, z12.h, z13.h\n"
+      "smax z14.s, p2/M, z14.s, z5.s\n"
+      "smax z15.s, p2/M, z15.s, z5.s\n"
+      "smax z16.s, p2/M, z16.s, z5.s\n"
+      "smin z17.s, p2/M, z17.s, z6.s\n"
+      ".inst 0x44828852  // srshl z18.s, p2/M, z18.s, z2.s\n"
+      "uzp1 z13.h, z14.h, z15.h\n"
+      ".inst 0x44828873  // srshl z19.s, p2/M, z19.s, z3.s\n"
+      "uzp1 z12.b, z12.b, z13.b\n"
+      "st1b { z12.b }, p1, [x9]\n"
+      "add z18.s, z18.s, z4.s\n"
+      "addvl x9, x9, #1\n"
+      "add z19.s, z19.s, z4.s\n"
+      "smax z17.s, p2/M, z17.s, z5.s\n"
+      "smin z18.s, p2/M, z18.s, z6.s\n"
+      "smin z19.s, p2/M, z19.s, z6.s\n"
+      "uzp1 z16.h, z16.h, z17.h\n"
+      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "smax z19.s, p2/M, z19.s, z5.s\n"
+      "uzp1 z17.h, z18.h, z19.h\n"
+      "uzp1 z16.b, z16.b, z17.b\n"
+      "st1b { z16.b }, p1, [x27]\n"
+      "addvl x27, x27, #1\n"
+      "42:"  // Height 3: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x15, x15, x19\n"
+      "bgt 31b\n"
+      "b 86f\n"
+      "43:"  // Height 4
+      "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+      "mov x16, %x[col_bias]\n"
+      "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 44f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "add x27, x27, x19\n"
+      "add x25, x25, x19\n"
+      "b 45f\n"
+      "44:"  // Height 4: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19\n"
+      "add x27, x9, x19\n"
+      "add x25, x27, x19\n"
+      "45:"  // Height 4: Column loop
+      "mov z8.s, #0x0\n"
+      "mov x19, #0x0\n"
+      "mov z9.s, #0x0\n"
+      "whilelt p1.b, x19, x15\n"
+      "mov z10.s, #0x0\n"
+      "mov z11.s, #0x0\n"
+      "mov z12.s, #0x0\n"
+      "mov z13.s, #0x0\n"
+      "mov z14.s, #0x0\n"
+      "mov z15.s, #0x0\n"
+      "mov z16.s, #0x0\n"
+      "mov z17.s, #0x0\n"
+      "mov z18.s, #0x0\n"
+      "mov z19.s, #0x0\n"
+      "mov z20.s, #0x0\n"
+      "mov z21.s, #0x0\n"
+      "mov z22.s, #0x0\n"
+      "mov z23.s, #0x0\n"
+      "46:"  // Height 4: setup done
+      "mov x12, #0x0\n"
+      "47:"  // Height 4: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 48f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "cbnz x12, 49f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "b 49f\n"
+      "48:"  // Height 4: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "add x26, x28, x19\n"
+      "add x24, x26, x19\n"
+      "49:"  // Height 4: input setup done
+      "cmp x11, #0x10\n"
+      "ble 51f\n"
+      "50:"  // Height 4: Multiply loop: Main loop head
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "sub x11, x11, #0x10\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "sdot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "sdot z9.s, z7.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "sdot z12.s, z6.b, z1.b[0]\n"
+      "ld1rqb { z3.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "sdot z16.s, z6.b, z2.b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x24, x24, #0x10\n"
+      "sdot z13.s, z7.b, z1.b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "cmp x11, #0x10\n"
+      "sdot z20.s, z6.b, z3.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "sdot z17.s, z7.b, z2.b[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "sdot z21.s, z7.b, z3.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "sdot z10.s, z6.b, z0.b[0]\n"
+      "sdot z14.s, z6.b, z1.b[0]\n"
+      "sdot z18.s, z6.b, z2.b[0]\n"
+      "sdot z22.s, z6.b, z3.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[0]\n"
+      "sdot z15.s, z7.b, z1.b[0]\n"
+      "sdot z19.s, z7.b, z2.b[0]\n"
+      "sdot z23.s, z7.b, z3.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[1]\n"
+      "sdot z12.s, z6.b, z1.b[1]\n"
+      "sdot z16.s, z6.b, z2.b[1]\n"
+      "sdot z20.s, z6.b, z3.b[1]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[1]\n"
+      "sdot z13.s, z7.b, z1.b[1]\n"
+      "sdot z17.s, z7.b, z2.b[1]\n"
+      "sdot z21.s, z7.b, z3.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n"
+      "addvl x14, x14, #16\n"
+      "sdot z10.s, z6.b, z0.b[1]\n"
+      "sdot z14.s, z6.b, z1.b[1]\n"
+      "sdot z18.s, z6.b, z2.b[1]\n"
+      "sdot z22.s, z6.b, z3.b[1]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[1]\n"
+      "sdot z15.s, z7.b, z1.b[1]\n"
+      "sdot z19.s, z7.b, z2.b[1]\n"
+      "sdot z23.s, z7.b, z3.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[2]\n"
+      "sdot z12.s, z6.b, z1.b[2]\n"
+      "sdot z16.s, z6.b, z2.b[2]\n"
+      "sdot z20.s, z6.b, z3.b[2]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[2]\n"
+      "sdot z13.s, z7.b, z1.b[2]\n"
+      "sdot z17.s, z7.b, z2.b[2]\n"
+      "sdot z21.s, z7.b, z3.b[2]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[2]\n"
+      "sdot z14.s, z6.b, z1.b[2]\n"
+      "sdot z18.s, z6.b, z2.b[2]\n"
+      "sdot z22.s, z6.b, z3.b[2]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[2]\n"
+      "sdot z15.s, z7.b, z1.b[2]\n"
+      "sdot z19.s, z7.b, z2.b[2]\n"
+      "sdot z23.s, z7.b, z3.b[2]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[3]\n"
+      "sdot z12.s, z6.b, z1.b[3]\n"
+      "sdot z16.s, z6.b, z2.b[3]\n"
+      "sdot z20.s, z6.b, z3.b[3]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[3]\n"
+      "sdot z13.s, z7.b, z1.b[3]\n"
+      "sdot z17.s, z7.b, z2.b[3]\n"
+      "sdot z21.s, z7.b, z3.b[3]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[3]\n"
+      "sdot z14.s, z6.b, z1.b[3]\n"
+      "sdot z18.s, z6.b, z2.b[3]\n"
+      "sdot z22.s, z6.b, z3.b[3]\n"
+      "sdot z11.s, z7.b, z0.b[3]\n"
+      "sdot z15.s, z7.b, z1.b[3]\n"
+      "sdot z19.s, z7.b, z2.b[3]\n"
+      "sdot z23.s, z7.b, z3.b[3]\n"
+      "bgt 50b\n"
+      "51:"  // Height 4: Multiply loop: Single iteration only
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "sdot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "sdot z9.s, z7.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "sdot z12.s, z6.b, z1.b[0]\n"
+      "ld1rqb { z3.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "sdot z16.s, z6.b, z2.b[0]\n"
+      "add x24, x24, #0x10\n"
+      "sdot z13.s, z7.b, z1.b[0]\n"
+      "sdot z17.s, z7.b, z2.b[0]\n"
+      "sdot z20.s, z6.b, z3.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "sdot z21.s, z7.b, z3.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[0]\n"
+      "sdot z14.s, z6.b, z1.b[0]\n"
+      "sdot z18.s, z6.b, z2.b[0]\n"
+      "sdot z22.s, z6.b, z3.b[0]\n"
+      "sdot z11.s, z7.b, z0.b[0]\n"
+      "sdot z15.s, z7.b, z1.b[0]\n"
+      "sdot z19.s, z7.b, z2.b[0]\n"
+      "sdot z23.s, z7.b, z3.b[0]\n"
+      "ble 52f\n"
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "sdot z12.s, z6.b, z1.b[1]\n"
+      "sdot z16.s, z6.b, z2.b[1]\n"
+      "sdot z20.s, z6.b, z3.b[1]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[1]\n"
+      "sdot z13.s, z7.b, z1.b[1]\n"
+      "sdot z17.s, z7.b, z2.b[1]\n"
+      "sdot z21.s, z7.b, z3.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[1]\n"
+      "sdot z14.s, z6.b, z1.b[1]\n"
+      "sdot z18.s, z6.b, z2.b[1]\n"
+      "sdot z22.s, z6.b, z3.b[1]\n"
+      "sdot z11.s, z7.b, z0.b[1]\n"
+      "sdot z15.s, z7.b, z1.b[1]\n"
+      "sdot z19.s, z7.b, z2.b[1]\n"
+      "sdot z23.s, z7.b, z3.b[1]\n"
+      "ble 52f\n"
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[2]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "sdot z12.s, z6.b, z1.b[2]\n"
+      "sdot z16.s, z6.b, z2.b[2]\n"
+      "sdot z20.s, z6.b, z3.b[2]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[2]\n"
+      "sdot z13.s, z7.b, z1.b[2]\n"
+      "sdot z17.s, z7.b, z2.b[2]\n"
+      "sdot z21.s, z7.b, z3.b[2]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[2]\n"
+      "sdot z14.s, z6.b, z1.b[2]\n"
+      "sdot z18.s, z6.b, z2.b[2]\n"
+      "sdot z22.s, z6.b, z3.b[2]\n"
+      "sdot z11.s, z7.b, z0.b[2]\n"
+      "sdot z15.s, z7.b, z1.b[2]\n"
+      "sdot z19.s, z7.b, z2.b[2]\n"
+      "sdot z23.s, z7.b, z3.b[2]\n"
+      "ble 52f\n"
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[3]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "sdot z12.s, z6.b, z1.b[3]\n"
+      "sdot z16.s, z6.b, z2.b[3]\n"
+      "sdot z20.s, z6.b, z3.b[3]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[3]\n"
+      "sdot z13.s, z7.b, z1.b[3]\n"
+      "sdot z17.s, z7.b, z2.b[3]\n"
+      "sdot z21.s, z7.b, z3.b[3]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[3]\n"
+      "sdot z14.s, z6.b, z1.b[3]\n"
+      "sdot z18.s, z6.b, z2.b[3]\n"
+      "sdot z22.s, z6.b, z3.b[3]\n"
+      "sdot z11.s, z7.b, z0.b[3]\n"
+      "sdot z15.s, z7.b, z1.b[3]\n"
+      "sdot z19.s, z7.b, z2.b[3]\n"
+      "sdot z23.s, z7.b, z3.b[3]\n"
+      "52:"  // Height 4: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 47b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "ld1w { z0.s }, p2/Z, [x16]\n"
+      "add z8.s, z8.s, z0.s\n"
+      "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n"
+      "add z12.s, z12.s, z0.s\n"
+      "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n"
+      "add z16.s, z16.s, z0.s\n"
+      "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n"
+      "addvl x16, x16, #4\n"
+      "add z9.s, z9.s, z1.s\n"
+      "add z13.s, z13.s, z1.s\n"
+      "add z10.s, z10.s, z2.s\n"
+      "add z11.s, z11.s, z3.s\n"
+      "add z14.s, z14.s, z2.s\n"
+      "add z15.s, z15.s, z3.s\n"
+      "add z17.s, z17.s, z1.s\n"
+      "add z18.s, z18.s, z2.s\n"
+      "add z19.s, z19.s, z3.s\n"
+      "add z20.s, z20.s, z0.s\n"
+      "add z21.s, z21.s, z1.s\n"
+      "add z22.s, z22.s, z2.s\n"
+      "add z23.s, z23.s, z3.s\n"
+      "tbz %x[flags], #4, 53f\n"
+      "ld1w { z0.s }, p2/Z, [x17]\n"
+      "ld1w { z4.s }, p2/Z, [x8]\n"
+      "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n"
+      "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n"
+      "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n"
+      "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n"
+      "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n"
+      "addvl x17, x17, #4\n"
+      "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n"
+      "addvl x8, x8, #4\n"
+      "b 54f\n"
+      "53:"  // Height 4: per layer parameters
+      "add x19, %x[qp], %[per_layer_right_shift]\n"
+      "ld1rw { z0.s }, p2/Z, [x19]\n"
+      "mov z1.d, z0.d\n"
+      "add x19, %x[qp], %[per_layer_mul]\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      "mov z2.d, z0.d\n"
+      "mov z3.d, z0.d\n"
+      "mov z5.d, z4.d\n"
+      "mov z6.d, z4.d\n"
+      "mov z7.d, z4.d\n"
+      "54:"  // Height 4: parameters loaded
+      ".inst 0x04a47508  // sqrdmulh z8.s, z8.s, z4.s\n"
+      ".inst 0x04a57529  // sqrdmulh z9.s, z9.s, z5.s\n"
+      ".inst 0x04a6754a  // sqrdmulh z10.s, z10.s, z6.s\n"
+      ".inst 0x04a7756b  // sqrdmulh z11.s, z11.s, z7.s\n"
+      ".inst 0x04a4758c  // sqrdmulh z12.s, z12.s, z4.s\n"
+      ".inst 0x04a575ad  // sqrdmulh z13.s, z13.s, z5.s\n"
+      ".inst 0x04a675ce  // sqrdmulh z14.s, z14.s, z6.s\n"
+      ".inst 0x04a775ef  // sqrdmulh z15.s, z15.s, z7.s\n"
+      ".inst 0x04a47610  // sqrdmulh z16.s, z16.s, z4.s\n"
+      ".inst 0x04a57631  // sqrdmulh z17.s, z17.s, z5.s\n"
+      ".inst 0x04a67652  // sqrdmulh z18.s, z18.s, z6.s\n"
+      ".inst 0x04a77673  // sqrdmulh z19.s, z19.s, z7.s\n"
+      ".inst 0x04a47694  // sqrdmulh z20.s, z20.s, z4.s\n"
+      ".inst 0x04a576b5  // sqrdmulh z21.s, z21.s, z5.s\n"
+      ".inst 0x04a676d6  // sqrdmulh z22.s, z22.s, z6.s\n"
+      ".inst 0x04a776f7  // sqrdmulh z23.s, z23.s, z7.s\n"
+      "tbz %x[flags], #5, 55f\n"
+      "and z4.d, z8.d, z0.d\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "and z5.d, z9.d, z1.d\n"
+      "and z6.d, z10.d, z2.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "and z7.d, z11.d, z3.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z8.s, z8.s, z4.s\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "and z4.d, z12.d, z0.d\n"
+      "sqadd z9.s, z9.s, z5.s\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "sqadd z10.s, z10.s, z6.s\n"
+      "and z5.d, z13.d, z1.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "sqadd z11.s, z11.s, z7.s\n"
+      "and z6.d, z14.d, z2.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z12.s, z12.s, z4.s\n"
+      "and z7.d, z15.d, z3.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "sqadd z13.s, z13.s, z5.s\n"
+      "and z4.d, z16.d, z0.d\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "sqadd z14.s, z14.s, z6.s\n"
+      "and z5.d, z17.d, z1.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "sqadd z15.s, z15.s, z7.s\n"
+      "and z6.d, z18.d, z2.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z16.s, z16.s, z4.s\n"
+      "and z7.d, z19.d, z3.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "sqadd z17.s, z17.s, z5.s\n"
+      "and z4.d, z20.d, z0.d\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "sqadd z18.s, z18.s, z6.s\n"
+      "and z5.d, z21.d, z1.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "sqadd z19.s, z19.s, z7.s\n"
+      "and z6.d, z22.d, z2.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z20.s, z20.s, z4.s\n"
+      "and z7.d, z23.d, z3.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "sqadd z21.s, z21.s, z5.s\n"
+      "sqadd z22.s, z22.s, z6.s\n"
+      "sqadd z23.s, z23.s, z7.s\n"
+      "55:"  // Height 4: no shift correction
+      ".inst 0x44828808  // srshl z8.s, p2/M, z8.s, z0.s\n"
+      "add x19, %x[qp], %[c_offset]\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      ".inst 0x44828829  // srshl z9.s, p2/M, z9.s, z1.s\n"
+      "add x19, %x[qp], %[minval]\n"
+      ".inst 0x4482884a  // srshl z10.s, p2/M, z10.s, z2.s\n"
+      "ld1rw { z5.s }, p2/Z, [x19]\n"
+      "add x19, %x[qp], %[maxval]\n"
+      ".inst 0x4482886b  // srshl z11.s, p2/M, z11.s, z3.s\n"
+      "ld1rw { z6.s }, p2/Z, [x19]\n"
+      ".inst 0x4482880c  // srshl z12.s, p2/M, z12.s, z0.s\n"
+      "add z8.s, z8.s, z4.s\n"
+      "add z9.s, z9.s, z4.s\n"
+      "add z10.s, z10.s, z4.s\n"
+      "add z11.s, z11.s, z4.s\n"
+      "add z12.s, z12.s, z4.s\n"
+      "smin z8.s, p2/M, z8.s, z6.s\n"
+      "smin z9.s, p2/M, z9.s, z6.s\n"
+      "smin z10.s, p2/M, z10.s, z6.s\n"
+      "smin z11.s, p2/M, z11.s, z6.s\n"
+      "smax z8.s, p2/M, z8.s, z5.s\n"
+      "smax z9.s, p2/M, z9.s, z5.s\n"
+      "smax z10.s, p2/M, z10.s, z5.s\n"
+      "smax z11.s, p2/M, z11.s, z5.s\n"
+      "smin z12.s, p2/M, z12.s, z6.s\n"
+      "uzp1 z8.h, z8.h, z9.h\n"
+      ".inst 0x4482882d  // srshl z13.s, p2/M, z13.s, z1.s\n"
+      "uzp1 z9.h, z10.h, z11.h\n"
+      "smax z12.s, p2/M, z12.s, z5.s\n"
+      "uzp1 z8.b, z8.b, z9.b\n"
+      "st1b { z8.b }, p1, [x13]\n"
+      "add z13.s, z13.s, z4.s\n"
+      "addvl x13, x13, #1\n"
+      ".inst 0x4482884e  // srshl z14.s, p2/M, z14.s, z2.s\n"
+      ".inst 0x4482886f  // srshl z15.s, p2/M, z15.s, z3.s\n"
+      ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
+      "smin z13.s, p2/M, z13.s, z6.s\n"
+      ".inst 0x44828831  // srshl z17.s, p2/M, z17.s, z1.s\n"
+      "add z14.s, z14.s, z4.s\n"
+      "add z15.s, z15.s, z4.s\n"
+      "add z16.s, z16.s, z4.s\n"
+      "add z17.s, z17.s, z4.s\n"
+      "smax z13.s, p2/M, z13.s, z5.s\n"
+      "smin z14.s, p2/M, z14.s, z6.s\n"
+      "smin z15.s, p2/M, z15.s, z6.s\n"
+      "smin z16.s, p2/M, z16.s, z6.s\n"
+      "uzp1 z12.h, z12.h, z13.h\n"
+      "smax z14.s, p2/M, z14.s, z5.s\n"
+      "smax z15.s, p2/M, z15.s, z5.s\n"
+      "smax z16.s, p2/M, z16.s, z5.s\n"
+      "smin z17.s, p2/M, z17.s, z6.s\n"
+      ".inst 0x44828852  // srshl z18.s, p2/M, z18.s, z2.s\n"
+      "uzp1 z13.h, z14.h, z15.h\n"
+      ".inst 0x44828873  // srshl z19.s, p2/M, z19.s, z3.s\n"
+      "uzp1 z12.b, z12.b, z13.b\n"
+      "st1b { z12.b }, p1, [x9]\n"
+      "add z18.s, z18.s, z4.s\n"
+      "addvl x9, x9, #1\n"
+      "add z19.s, z19.s, z4.s\n"
+      "smax z17.s, p2/M, z17.s, z5.s\n"
+      ".inst 0x44828814  // srshl z20.s, p2/M, z20.s, z0.s\n"
+      "smin z18.s, p2/M, z18.s, z6.s\n"
+      "smin z19.s, p2/M, z19.s, z6.s\n"
+      "uzp1 z16.h, z16.h, z17.h\n"
+      "add z20.s, z20.s, z4.s\n"
+      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "smax z19.s, p2/M, z19.s, z5.s\n"
+      "smin z20.s, p2/M, z20.s, z6.s\n"
+      ".inst 0x44828835  // srshl z21.s, p2/M, z21.s, z1.s\n"
+      ".inst 0x44828856  // srshl z22.s, p2/M, z22.s, z2.s\n"
+      "uzp1 z17.h, z18.h, z19.h\n"
+      "smax z20.s, p2/M, z20.s, z5.s\n"
+      "add z21.s, z21.s, z4.s\n"
+      "add z22.s, z22.s, z4.s\n"
+      "uzp1 z16.b, z16.b, z17.b\n"
+      "st1b { z16.b }, p1, [x27]\n"
+      "smin z21.s, p2/M, z21.s, z6.s\n"
+      "addvl x27, x27, #1\n"
+      "smin z22.s, p2/M, z22.s, z6.s\n"
+      ".inst 0x44828877  // srshl z23.s, p2/M, z23.s, z3.s\n"
+      "smax z21.s, p2/M, z21.s, z5.s\n"
+      "add z23.s, z23.s, z4.s\n"
+      "smax z22.s, p2/M, z22.s, z5.s\n"
+      "uzp1 z20.h, z20.h, z21.h\n"
+      "smin z23.s, p2/M, z23.s, z6.s\n"
+      "smax z23.s, p2/M, z23.s, z5.s\n"
+      "uzp1 z21.h, z22.h, z23.h\n"
+      "uzp1 z20.b, z20.b, z21.b\n"
+      "st1b { z20.b }, p1, [x25]\n"
+      "addvl x25, x25, #1\n"
+      "56:"  // Height 4: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x15, x15, x19\n"
+      "bgt 45b\n"
+      "b 86f\n"
+      "57:"  // Height 5
+      "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+      "mov x16, %x[col_bias]\n"
+      "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 58f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "ldr x23, [%x[output_ptr], #0x20]\n"
+      "add x27, x27, x19\n"
+      "add x25, x25, x19\n"
+      "add x23, x23, x19\n"
+      "b 59f\n"
+      "58:"  // Height 5: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19\n"
+      "add x27, x9, x19\n"
+      "add x25, x27, x19\n"
+      "add x23, x25, x19\n"
+      "59:"  // Height 5: Column loop
+      "mov z8.s, #0x0\n"
+      "mov x19, #0x0\n"
+      "mov z9.s, #0x0\n"
+      "whilelt p1.b, x19, x15\n"
+      "mov z10.s, #0x0\n"
+      "mov z11.s, #0x0\n"
+      "mov z12.s, #0x0\n"
+      "mov z13.s, #0x0\n"
+      "mov z14.s, #0x0\n"
+      "mov z15.s, #0x0\n"
+      "mov z16.s, #0x0\n"
+      "mov z17.s, #0x0\n"
+      "mov z18.s, #0x0\n"
+      "mov z19.s, #0x0\n"
+      "mov z20.s, #0x0\n"
+      "mov z21.s, #0x0\n"
+      "mov z22.s, #0x0\n"
+      "mov z23.s, #0x0\n"
+      "mov z24.s, #0x0\n"
+      "mov z25.s, #0x0\n"
+      "mov z26.s, #0x0\n"
+      "mov z27.s, #0x0\n"
+      "60:"  // Height 5: setup done
+      "mov x12, #0x0\n"
+      "61:"  // Height 5: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 62f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "cbnz x12, 63f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "add x22, x22, x19\n"
+      "b 63f\n"
+      "62:"  // Height 5: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "add x26, x28, x19\n"
+      "add x24, x26, x19\n"
+      "add x22, x24, x19\n"
+      "63:"  // Height 5: input setup done
+      "cmp x11, #0x10\n"
+      "ble 65f\n"
+      "64:"  // Height 5: Multiply loop: Main loop head
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "sub x11, x11, #0x10\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "sdot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "sdot z9.s, z7.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "sdot z12.s, z6.b, z1.b[0]\n"
+      "ld1rqb { z3.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "sdot z16.s, z6.b, z2.b[0]\n"
+      "ld1rqb { z4.b }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "sdot z13.s, z7.b, z1.b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x22, x22, #0x10\n"
+      "sdot z20.s, z6.b, z3.b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "cmp x11, #0x10\n"
+      "sdot z24.s, z6.b, z4.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "sdot z17.s, z7.b, z2.b[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "sdot z21.s, z7.b, z3.b[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "sdot z25.s, z7.b, z4.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[0]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "sdot z14.s, z6.b, z1.b[0]\n"
+      "sdot z18.s, z6.b, z2.b[0]\n"
+      "sdot z22.s, z6.b, z3.b[0]\n"
+      "sdot z26.s, z6.b, z4.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[0]\n"
+      "sdot z15.s, z7.b, z1.b[0]\n"
+      "sdot z19.s, z7.b, z2.b[0]\n"
+      "sdot z23.s, z7.b, z3.b[0]\n"
+      "sdot z27.s, z7.b, z4.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[1]\n"
+      "sdot z12.s, z6.b, z1.b[1]\n"
+      "sdot z16.s, z6.b, z2.b[1]\n"
+      "sdot z20.s, z6.b, z3.b[1]\n"
+      "sdot z24.s, z6.b, z4.b[1]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[1]\n"
+      "sdot z13.s, z7.b, z1.b[1]\n"
+      "sdot z17.s, z7.b, z2.b[1]\n"
+      "sdot z21.s, z7.b, z3.b[1]\n"
+      "sdot z25.s, z7.b, z4.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n"
+      "addvl x14, x14, #16\n"
+      "sdot z10.s, z6.b, z0.b[1]\n"
+      "sdot z14.s, z6.b, z1.b[1]\n"
+      "sdot z18.s, z6.b, z2.b[1]\n"
+      "sdot z22.s, z6.b, z3.b[1]\n"
+      "sdot z26.s, z6.b, z4.b[1]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[1]\n"
+      "sdot z15.s, z7.b, z1.b[1]\n"
+      "sdot z19.s, z7.b, z2.b[1]\n"
+      "sdot z23.s, z7.b, z3.b[1]\n"
+      "sdot z27.s, z7.b, z4.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[2]\n"
+      "sdot z12.s, z6.b, z1.b[2]\n"
+      "sdot z16.s, z6.b, z2.b[2]\n"
+      "sdot z20.s, z6.b, z3.b[2]\n"
+      "sdot z24.s, z6.b, z4.b[2]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[2]\n"
+      "sdot z13.s, z7.b, z1.b[2]\n"
+      "sdot z17.s, z7.b, z2.b[2]\n"
+      "sdot z21.s, z7.b, z3.b[2]\n"
+      "sdot z25.s, z7.b, z4.b[2]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[2]\n"
+      "sdot z14.s, z6.b, z1.b[2]\n"
+      "sdot z18.s, z6.b, z2.b[2]\n"
+      "sdot z22.s, z6.b, z3.b[2]\n"
+      "sdot z26.s, z6.b, z4.b[2]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[2]\n"
+      "sdot z15.s, z7.b, z1.b[2]\n"
+      "sdot z19.s, z7.b, z2.b[2]\n"
+      "sdot z23.s, z7.b, z3.b[2]\n"
+      "sdot z27.s, z7.b, z4.b[2]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[3]\n"
+      "sdot z12.s, z6.b, z1.b[3]\n"
+      "sdot z16.s, z6.b, z2.b[3]\n"
+      "sdot z20.s, z6.b, z3.b[3]\n"
+      "sdot z24.s, z6.b, z4.b[3]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[3]\n"
+      "sdot z13.s, z7.b, z1.b[3]\n"
+      "sdot z17.s, z7.b, z2.b[3]\n"
+      "sdot z21.s, z7.b, z3.b[3]\n"
+      "sdot z25.s, z7.b, z4.b[3]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[3]\n"
+      "sdot z14.s, z6.b, z1.b[3]\n"
+      "sdot z18.s, z6.b, z2.b[3]\n"
+      "sdot z22.s, z6.b, z3.b[3]\n"
+      "sdot z26.s, z6.b, z4.b[3]\n"
+      "sdot z11.s, z7.b, z0.b[3]\n"
+      "sdot z15.s, z7.b, z1.b[3]\n"
+      "sdot z19.s, z7.b, z2.b[3]\n"
+      "sdot z23.s, z7.b, z3.b[3]\n"
+      "sdot z27.s, z7.b, z4.b[3]\n"
+      "bgt 64b\n"
+      "65:"  // Height 5: Multiply loop: Single iteration only
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "sdot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "sdot z9.s, z7.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "sdot z12.s, z6.b, z1.b[0]\n"
+      "ld1rqb { z3.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "sdot z16.s, z6.b, z2.b[0]\n"
+      "ld1rqb { z4.b }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "sdot z13.s, z7.b, z1.b[0]\n"
+      "add x22, x22, #0x10\n"
+      "sdot z17.s, z7.b, z2.b[0]\n"
+      "sdot z20.s, z6.b, z3.b[0]\n"
+      "sdot z24.s, z6.b, z4.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "sdot z21.s, z7.b, z3.b[0]\n"
+      "sdot z25.s, z7.b, z4.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[0]\n"
+      "sdot z14.s, z6.b, z1.b[0]\n"
+      "sdot z18.s, z6.b, z2.b[0]\n"
+      "sdot z22.s, z6.b, z3.b[0]\n"
+      "sdot z26.s, z6.b, z4.b[0]\n"
+      "sdot z11.s, z7.b, z0.b[0]\n"
+      "sdot z15.s, z7.b, z1.b[0]\n"
+      "sdot z19.s, z7.b, z2.b[0]\n"
+      "sdot z23.s, z7.b, z3.b[0]\n"
+      "sdot z27.s, z7.b, z4.b[0]\n"
+      "ble 66f\n"
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "sdot z12.s, z6.b, z1.b[1]\n"
+      "sdot z16.s, z6.b, z2.b[1]\n"
+      "sdot z20.s, z6.b, z3.b[1]\n"
+      "sdot z24.s, z6.b, z4.b[1]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[1]\n"
+      "sdot z13.s, z7.b, z1.b[1]\n"
+      "sdot z17.s, z7.b, z2.b[1]\n"
+      "sdot z21.s, z7.b, z3.b[1]\n"
+      "sdot z25.s, z7.b, z4.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[1]\n"
+      "sdot z14.s, z6.b, z1.b[1]\n"
+      "sdot z18.s, z6.b, z2.b[1]\n"
+      "sdot z22.s, z6.b, z3.b[1]\n"
+      "sdot z26.s, z6.b, z4.b[1]\n"
+      "sdot z11.s, z7.b, z0.b[1]\n"
+      "sdot z15.s, z7.b, z1.b[1]\n"
+      "sdot z19.s, z7.b, z2.b[1]\n"
+      "sdot z23.s, z7.b, z3.b[1]\n"
+      "sdot z27.s, z7.b, z4.b[1]\n"
+      "ble 66f\n"
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[2]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "sdot z12.s, z6.b, z1.b[2]\n"
+      "sdot z16.s, z6.b, z2.b[2]\n"
+      "sdot z20.s, z6.b, z3.b[2]\n"
+      "sdot z24.s, z6.b, z4.b[2]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[2]\n"
+      "sdot z13.s, z7.b, z1.b[2]\n"
+      "sdot z17.s, z7.b, z2.b[2]\n"
+      "sdot z21.s, z7.b, z3.b[2]\n"
+      "sdot z25.s, z7.b, z4.b[2]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[2]\n"
+      "sdot z14.s, z6.b, z1.b[2]\n"
+      "sdot z18.s, z6.b, z2.b[2]\n"
+      "sdot z22.s, z6.b, z3.b[2]\n"
+      "sdot z26.s, z6.b, z4.b[2]\n"
+      "sdot z11.s, z7.b, z0.b[2]\n"
+      "sdot z15.s, z7.b, z1.b[2]\n"
+      "sdot z19.s, z7.b, z2.b[2]\n"
+      "sdot z23.s, z7.b, z3.b[2]\n"
+      "sdot z27.s, z7.b, z4.b[2]\n"
+      "ble 66f\n"
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[3]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "sdot z12.s, z6.b, z1.b[3]\n"
+      "sdot z16.s, z6.b, z2.b[3]\n"
+      "sdot z20.s, z6.b, z3.b[3]\n"
+      "sdot z24.s, z6.b, z4.b[3]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[3]\n"
+      "sdot z13.s, z7.b, z1.b[3]\n"
+      "sdot z17.s, z7.b, z2.b[3]\n"
+      "sdot z21.s, z7.b, z3.b[3]\n"
+      "sdot z25.s, z7.b, z4.b[3]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[3]\n"
+      "sdot z14.s, z6.b, z1.b[3]\n"
+      "sdot z18.s, z6.b, z2.b[3]\n"
+      "sdot z22.s, z6.b, z3.b[3]\n"
+      "sdot z26.s, z6.b, z4.b[3]\n"
+      "sdot z11.s, z7.b, z0.b[3]\n"
+      "sdot z15.s, z7.b, z1.b[3]\n"
+      "sdot z19.s, z7.b, z2.b[3]\n"
+      "sdot z23.s, z7.b, z3.b[3]\n"
+      "sdot z27.s, z7.b, z4.b[3]\n"
+      "66:"  // Height 5: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 61b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "ld1w { z0.s }, p2/Z, [x16]\n"
+      "add z8.s, z8.s, z0.s\n"
+      "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n"
+      "add z12.s, z12.s, z0.s\n"
+      "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n"
+      "add z16.s, z16.s, z0.s\n"
+      "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n"
+      "addvl x16, x16, #4\n"
+      "add z9.s, z9.s, z1.s\n"
+      "add z13.s, z13.s, z1.s\n"
+      "add z10.s, z10.s, z2.s\n"
+      "add z11.s, z11.s, z3.s\n"
+      "add z14.s, z14.s, z2.s\n"
+      "add z15.s, z15.s, z3.s\n"
+      "add z17.s, z17.s, z1.s\n"
+      "add z18.s, z18.s, z2.s\n"
+      "add z19.s, z19.s, z3.s\n"
+      "add z20.s, z20.s, z0.s\n"
+      "add z21.s, z21.s, z1.s\n"
+      "add z22.s, z22.s, z2.s\n"
+      "add z23.s, z23.s, z3.s\n"
+      "add z24.s, z24.s, z0.s\n"
+      "add z25.s, z25.s, z1.s\n"
+      "add z26.s, z26.s, z2.s\n"
+      "add z27.s, z27.s, z3.s\n"
+      "tbz %x[flags], #4, 67f\n"
+      "ld1w { z0.s }, p2/Z, [x17]\n"
+      "ld1w { z4.s }, p2/Z, [x8]\n"
+      "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n"
+      "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n"
+      "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n"
+      "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n"
+      "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n"
+      "addvl x17, x17, #4\n"
+      "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n"
+      "addvl x8, x8, #4\n"
+      "b 68f\n"
+      "67:"  // Height 5: per layer parameters
+      "add x19, %x[qp], %[per_layer_right_shift]\n"
+      "ld1rw { z0.s }, p2/Z, [x19]\n"
+      "mov z1.d, z0.d\n"
+      "add x19, %x[qp], %[per_layer_mul]\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      "mov z2.d, z0.d\n"
+      "mov z3.d, z0.d\n"
+      "mov z5.d, z4.d\n"
+      "mov z6.d, z4.d\n"
+      "mov z7.d, z4.d\n"
+      "68:"  // Height 5: parameters loaded
+      ".inst 0x04a47508  // sqrdmulh z8.s, z8.s, z4.s\n"
+      ".inst 0x04a57529  // sqrdmulh z9.s, z9.s, z5.s\n"
+      ".inst 0x04a6754a  // sqrdmulh z10.s, z10.s, z6.s\n"
+      ".inst 0x04a7756b  // sqrdmulh z11.s, z11.s, z7.s\n"
+      ".inst 0x04a4758c  // sqrdmulh z12.s, z12.s, z4.s\n"
+      ".inst 0x04a575ad  // sqrdmulh z13.s, z13.s, z5.s\n"
+      ".inst 0x04a675ce  // sqrdmulh z14.s, z14.s, z6.s\n"
+      ".inst 0x04a775ef  // sqrdmulh z15.s, z15.s, z7.s\n"
+      ".inst 0x04a47610  // sqrdmulh z16.s, z16.s, z4.s\n"
+      ".inst 0x04a57631  // sqrdmulh z17.s, z17.s, z5.s\n"
+      ".inst 0x04a67652  // sqrdmulh z18.s, z18.s, z6.s\n"
+      ".inst 0x04a77673  // sqrdmulh z19.s, z19.s, z7.s\n"
+      ".inst 0x04a47694  // sqrdmulh z20.s, z20.s, z4.s\n"
+      ".inst 0x04a576b5  // sqrdmulh z21.s, z21.s, z5.s\n"
+      ".inst 0x04a676d6  // sqrdmulh z22.s, z22.s, z6.s\n"
+      ".inst 0x04a776f7  // sqrdmulh z23.s, z23.s, z7.s\n"
+      ".inst 0x04a47718  // sqrdmulh z24.s, z24.s, z4.s\n"
+      ".inst 0x04a57739  // sqrdmulh z25.s, z25.s, z5.s\n"
+      ".inst 0x04a6775a  // sqrdmulh z26.s, z26.s, z6.s\n"
+      ".inst 0x04a7777b  // sqrdmulh z27.s, z27.s, z7.s\n"
+      "tbz %x[flags], #5, 69f\n"
+      "and z4.d, z8.d, z0.d\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "and z5.d, z9.d, z1.d\n"
+      "and z6.d, z10.d, z2.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "and z7.d, z11.d, z3.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z8.s, z8.s, z4.s\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "and z4.d, z12.d, z0.d\n"
+      "sqadd z9.s, z9.s, z5.s\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "sqadd z10.s, z10.s, z6.s\n"
+      "and z5.d, z13.d, z1.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "sqadd z11.s, z11.s, z7.s\n"
+      "and z6.d, z14.d, z2.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z12.s, z12.s, z4.s\n"
+      "and z7.d, z15.d, z3.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "sqadd z13.s, z13.s, z5.s\n"
+      "and z4.d, z16.d, z0.d\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "sqadd z14.s, z14.s, z6.s\n"
+      "and z5.d, z17.d, z1.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "sqadd z15.s, z15.s, z7.s\n"
+      "and z6.d, z18.d, z2.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z16.s, z16.s, z4.s\n"
+      "and z7.d, z19.d, z3.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "sqadd z17.s, z17.s, z5.s\n"
+      "and z4.d, z20.d, z0.d\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "sqadd z18.s, z18.s, z6.s\n"
+      "and z5.d, z21.d, z1.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "sqadd z19.s, z19.s, z7.s\n"
+      "and z6.d, z22.d, z2.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z20.s, z20.s, z4.s\n"
+      "and z7.d, z23.d, z3.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "sqadd z21.s, z21.s, z5.s\n"
+      "and z4.d, z24.d, z0.d\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "sqadd z22.s, z22.s, z6.s\n"
+      "and z5.d, z25.d, z1.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "sqadd z23.s, z23.s, z7.s\n"
+      "and z6.d, z26.d, z2.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z24.s, z24.s, z4.s\n"
+      "and z7.d, z27.d, z3.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "sqadd z25.s, z25.s, z5.s\n"
+      "sqadd z26.s, z26.s, z6.s\n"
+      "sqadd z27.s, z27.s, z7.s\n"
+      "69:"  // Height 5: no shift correction
+      ".inst 0x44828808  // srshl z8.s, p2/M, z8.s, z0.s\n"
+      "add x19, %x[qp], %[c_offset]\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      ".inst 0x44828829  // srshl z9.s, p2/M, z9.s, z1.s\n"
+      "add x19, %x[qp], %[minval]\n"
+      ".inst 0x4482884a  // srshl z10.s, p2/M, z10.s, z2.s\n"
+      "ld1rw { z5.s }, p2/Z, [x19]\n"
+      "add x19, %x[qp], %[maxval]\n"
+      ".inst 0x4482886b  // srshl z11.s, p2/M, z11.s, z3.s\n"
+      "ld1rw { z6.s }, p2/Z, [x19]\n"
+      ".inst 0x4482880c  // srshl z12.s, p2/M, z12.s, z0.s\n"
+      "add z8.s, z8.s, z4.s\n"
+      "add z9.s, z9.s, z4.s\n"
+      "add z10.s, z10.s, z4.s\n"
+      "add z11.s, z11.s, z4.s\n"
+      "add z12.s, z12.s, z4.s\n"
+      "smin z8.s, p2/M, z8.s, z6.s\n"
+      "smin z9.s, p2/M, z9.s, z6.s\n"
+      "smin z10.s, p2/M, z10.s, z6.s\n"
+      "smin z11.s, p2/M, z11.s, z6.s\n"
+      "smax z8.s, p2/M, z8.s, z5.s\n"
+      "smax z9.s, p2/M, z9.s, z5.s\n"
+      "smax z10.s, p2/M, z10.s, z5.s\n"
+      "smax z11.s, p2/M, z11.s, z5.s\n"
+      "smin z12.s, p2/M, z12.s, z6.s\n"
+      "uzp1 z8.h, z8.h, z9.h\n"
+      ".inst 0x4482882d  // srshl z13.s, p2/M, z13.s, z1.s\n"
+      "uzp1 z9.h, z10.h, z11.h\n"
+      "smax z12.s, p2/M, z12.s, z5.s\n"
+      "uzp1 z8.b, z8.b, z9.b\n"
+      "st1b { z8.b }, p1, [x13]\n"
+      "add z13.s, z13.s, z4.s\n"
+      "addvl x13, x13, #1\n"
+      ".inst 0x4482884e  // srshl z14.s, p2/M, z14.s, z2.s\n"
+      ".inst 0x4482886f  // srshl z15.s, p2/M, z15.s, z3.s\n"
+      ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
+      "smin z13.s, p2/M, z13.s, z6.s\n"
+      ".inst 0x44828831  // srshl z17.s, p2/M, z17.s, z1.s\n"
+      "add z14.s, z14.s, z4.s\n"
+      "add z15.s, z15.s, z4.s\n"
+      "add z16.s, z16.s, z4.s\n"
+      "add z17.s, z17.s, z4.s\n"
+      "smax z13.s, p2/M, z13.s, z5.s\n"
+      "smin z14.s, p2/M, z14.s, z6.s\n"
+      "smin z15.s, p2/M, z15.s, z6.s\n"
+      "smin z16.s, p2/M, z16.s, z6.s\n"
+      "uzp1 z12.h, z12.h, z13.h\n"
+      "smax z14.s, p2/M, z14.s, z5.s\n"
+      "smax z15.s, p2/M, z15.s, z5.s\n"
+      "smax z16.s, p2/M, z16.s, z5.s\n"
+      "smin z17.s, p2/M, z17.s, z6.s\n"
+      ".inst 0x44828852  // srshl z18.s, p2/M, z18.s, z2.s\n"
+      "uzp1 z13.h, z14.h, z15.h\n"
+      ".inst 0x44828873  // srshl z19.s, p2/M, z19.s, z3.s\n"
+      "uzp1 z12.b, z12.b, z13.b\n"
+      "st1b { z12.b }, p1, [x9]\n"
+      "add z18.s, z18.s, z4.s\n"
+      "addvl x9, x9, #1\n"
+      "add z19.s, z19.s, z4.s\n"
+      "smax z17.s, p2/M, z17.s, z5.s\n"
+      ".inst 0x44828814  // srshl z20.s, p2/M, z20.s, z0.s\n"
+      "smin z18.s, p2/M, z18.s, z6.s\n"
+      "smin z19.s, p2/M, z19.s, z6.s\n"
+      "uzp1 z16.h, z16.h, z17.h\n"
+      "add z20.s, z20.s, z4.s\n"
+      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "smax z19.s, p2/M, z19.s, z5.s\n"
+      "smin z20.s, p2/M, z20.s, z6.s\n"
+      ".inst 0x44828835  // srshl z21.s, p2/M, z21.s, z1.s\n"
+      ".inst 0x44828856  // srshl z22.s, p2/M, z22.s, z2.s\n"
+      "uzp1 z17.h, z18.h, z19.h\n"
+      "smax z20.s, p2/M, z20.s, z5.s\n"
+      "add z21.s, z21.s, z4.s\n"
+      "add z22.s, z22.s, z4.s\n"
+      "uzp1 z16.b, z16.b, z17.b\n"
+      "st1b { z16.b }, p1, [x27]\n"
+      "smin z21.s, p2/M, z21.s, z6.s\n"
+      "addvl x27, x27, #1\n"
+      "smin z22.s, p2/M, z22.s, z6.s\n"
+      ".inst 0x44828877  // srshl z23.s, p2/M, z23.s, z3.s\n"
+      ".inst 0x44828818  // srshl z24.s, p2/M, z24.s, z0.s\n"
+      ".inst 0x44828839  // srshl z25.s, p2/M, z25.s, z1.s\n"
+      "smax z21.s, p2/M, z21.s, z5.s\n"
+      "add z23.s, z23.s, z4.s\n"
+      "add z24.s, z24.s, z4.s\n"
+      "add z25.s, z25.s, z4.s\n"
+      "uzp1 z20.h, z20.h, z21.h\n"
+      "smax z22.s, p2/M, z22.s, z5.s\n"
+      "smin z23.s, p2/M, z23.s, z6.s\n"
+      "smin z24.s, p2/M, z24.s, z6.s\n"
+      "smin z25.s, p2/M, z25.s, z6.s\n"
+      ".inst 0x4482885a  // srshl z26.s, p2/M, z26.s, z2.s\n"
+      "smax z23.s, p2/M, z23.s, z5.s\n"
+      "smax z24.s, p2/M, z24.s, z5.s\n"
+      "smax z25.s, p2/M, z25.s, z5.s\n"
+      "add z26.s, z26.s, z4.s\n"
+      "uzp1 z21.h, z22.h, z23.h\n"
+      ".inst 0x4482887b  // srshl z27.s, p2/M, z27.s, z3.s\n"
+      "uzp1 z24.h, z24.h, z25.h\n"
+      "uzp1 z20.b, z20.b, z21.b\n"
+      "st1b { z20.b }, p1, [x25]\n"
+      "add z27.s, z27.s, z4.s\n"
+      "addvl x25, x25, #1\n"
+      "smin z26.s, p2/M, z26.s, z6.s\n"
+      "smin z27.s, p2/M, z27.s, z6.s\n"
+      "smax z26.s, p2/M, z26.s, z5.s\n"
+      "smax z27.s, p2/M, z27.s, z5.s\n"
+      "uzp1 z25.h, z26.h, z27.h\n"
+      "uzp1 z24.b, z24.b, z25.b\n"
+      "st1b { z24.b }, p1, [x23]\n"
+      "addvl x23, x23, #1\n"
+      "70:"  // Height 5: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x15, x15, x19\n"
+      "bgt 59b\n"
+      "b 86f\n"
+      "71:"  // Height 6
+      "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+      "mov x16, %x[col_bias]\n"
+      "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 72f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "ldr x23, [%x[output_ptr], #0x20]\n"
+      "add x27, x27, x19\n"
+      "ldr x21, [%x[output_ptr], #0x28]\n"
+      "add %x[output_ptr], %x[output_ptr], #0x30\n"
+      "add x25, x25, x19\n"
+      "add x23, x23, x19\n"
+      "add x21, x21, x19\n"
+      "b 73f\n"
+      "72:"  // Height 6: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19\n"
+      "add x27, x9, x19\n"
+      "add x25, x27, x19\n"
+      "add x23, x25, x19\n"
+      "add x21, x23, x19\n"
+      "add %x[output_ptr], x21, x19\n"
+      "73:"  // Height 6: Column loop
+      "mov z8.s, #0x0\n"
+      "mov x19, #0x0\n"
+      "mov z9.s, #0x0\n"
+      "whilelt p1.b, x19, x15\n"
+      "mov z10.s, #0x0\n"
+      "mov z11.s, #0x0\n"
+      "mov z12.s, #0x0\n"
+      "mov z13.s, #0x0\n"
+      "mov z14.s, #0x0\n"
+      "mov z15.s, #0x0\n"
+      "mov z16.s, #0x0\n"
+      "mov z17.s, #0x0\n"
+      "mov z18.s, #0x0\n"
+      "mov z19.s, #0x0\n"
+      "mov z20.s, #0x0\n"
+      "mov z21.s, #0x0\n"
+      "mov z22.s, #0x0\n"
+      "mov z23.s, #0x0\n"
+      "mov z24.s, #0x0\n"
+      "mov z25.s, #0x0\n"
+      "mov z26.s, #0x0\n"
+      "mov z27.s, #0x0\n"
+      "mov z28.s, #0x0\n"
+      "mov z29.s, #0x0\n"
+      "mov z30.s, #0x0\n"
+      "mov z31.s, #0x0\n"
+      "74:"  // Height 6: setup done
+      "mov x12, #0x0\n"
+      "75:"  // Height 6: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 76f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x20, [x20, #0x28]\n"
+      "cbnz x12, 77f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "add x22, x22, x19\n"
+      "add x20, x20, x19\n"
+      "b 77f\n"
+      "76:"  // Height 6: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "add x26, x28, x19\n"
+      "add x24, x26, x19\n"
+      "add x22, x24, x19\n"
+      "add x20, x22, x19\n"
+      "77:"  // Height 6: input setup done
+      "cmp x11, #0x10\n"
+      "ble 79f\n"
+      "78:"  // Height 6: Multiply loop: Main loop head
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "sub x11, x11, #0x10\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "sdot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "sdot z9.s, z7.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "sdot z12.s, z6.b, z1.b[0]\n"
+      "ld1rqb { z3.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "sdot z16.s, z6.b, z2.b[0]\n"
+      "ld1rqb { z4.b }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "sdot z13.s, z7.b, z1.b[0]\n"
+      "ld1rqb { z5.b }, p0/Z, [x20]\n"
+      "add x22, x22, #0x10\n"
+      "sdot z20.s, z6.b, z3.b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x20, x20, #0x10\n"
+      "sdot z24.s, z6.b, z4.b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "cmp x11, #0x10\n"
+      "sdot z28.s, z6.b, z5.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "sdot z17.s, z7.b, z2.b[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "sdot z21.s, z7.b, z3.b[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "sdot z25.s, z7.b, z4.b[0]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "sdot z29.s, z7.b, z5.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[0]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "sdot z14.s, z6.b, z1.b[0]\n"
+      "sdot z18.s, z6.b, z2.b[0]\n"
+      "sdot z22.s, z6.b, z3.b[0]\n"
+      "sdot z26.s, z6.b, z4.b[0]\n"
+      "sdot z30.s, z6.b, z5.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[0]\n"
+      "sdot z15.s, z7.b, z1.b[0]\n"
+      "sdot z19.s, z7.b, z2.b[0]\n"
+      "sdot z23.s, z7.b, z3.b[0]\n"
+      "sdot z27.s, z7.b, z4.b[0]\n"
+      "sdot z31.s, z7.b, z5.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[1]\n"
+      "sdot z12.s, z6.b, z1.b[1]\n"
+      "sdot z16.s, z6.b, z2.b[1]\n"
+      "sdot z20.s, z6.b, z3.b[1]\n"
+      "sdot z24.s, z6.b, z4.b[1]\n"
+      "sdot z28.s, z6.b, z5.b[1]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[1]\n"
+      "sdot z13.s, z7.b, z1.b[1]\n"
+      "sdot z17.s, z7.b, z2.b[1]\n"
+      "sdot z21.s, z7.b, z3.b[1]\n"
+      "sdot z25.s, z7.b, z4.b[1]\n"
+      "sdot z29.s, z7.b, z5.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n"
+      "addvl x14, x14, #16\n"
+      "sdot z10.s, z6.b, z0.b[1]\n"
+      "sdot z14.s, z6.b, z1.b[1]\n"
+      "sdot z18.s, z6.b, z2.b[1]\n"
+      "sdot z22.s, z6.b, z3.b[1]\n"
+      "sdot z26.s, z6.b, z4.b[1]\n"
+      "sdot z30.s, z6.b, z5.b[1]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[1]\n"
+      "sdot z15.s, z7.b, z1.b[1]\n"
+      "sdot z19.s, z7.b, z2.b[1]\n"
+      "sdot z23.s, z7.b, z3.b[1]\n"
+      "sdot z27.s, z7.b, z4.b[1]\n"
+      "sdot z31.s, z7.b, z5.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[2]\n"
+      "sdot z12.s, z6.b, z1.b[2]\n"
+      "sdot z16.s, z6.b, z2.b[2]\n"
+      "sdot z20.s, z6.b, z3.b[2]\n"
+      "sdot z24.s, z6.b, z4.b[2]\n"
+      "sdot z28.s, z6.b, z5.b[2]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[2]\n"
+      "sdot z13.s, z7.b, z1.b[2]\n"
+      "sdot z17.s, z7.b, z2.b[2]\n"
+      "sdot z21.s, z7.b, z3.b[2]\n"
+      "sdot z25.s, z7.b, z4.b[2]\n"
+      "sdot z29.s, z7.b, z5.b[2]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[2]\n"
+      "sdot z14.s, z6.b, z1.b[2]\n"
+      "sdot z18.s, z6.b, z2.b[2]\n"
+      "sdot z22.s, z6.b, z3.b[2]\n"
+      "sdot z26.s, z6.b, z4.b[2]\n"
+      "sdot z30.s, z6.b, z5.b[2]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[2]\n"
+      "sdot z15.s, z7.b, z1.b[2]\n"
+      "sdot z19.s, z7.b, z2.b[2]\n"
+      "sdot z23.s, z7.b, z3.b[2]\n"
+      "sdot z27.s, z7.b, z4.b[2]\n"
+      "sdot z31.s, z7.b, z5.b[2]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[3]\n"
+      "sdot z12.s, z6.b, z1.b[3]\n"
+      "sdot z16.s, z6.b, z2.b[3]\n"
+      "sdot z20.s, z6.b, z3.b[3]\n"
+      "sdot z24.s, z6.b, z4.b[3]\n"
+      "sdot z28.s, z6.b, z5.b[3]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[3]\n"
+      "sdot z13.s, z7.b, z1.b[3]\n"
+      "sdot z17.s, z7.b, z2.b[3]\n"
+      "sdot z21.s, z7.b, z3.b[3]\n"
+      "sdot z25.s, z7.b, z4.b[3]\n"
+      "sdot z29.s, z7.b, z5.b[3]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[3]\n"
+      "sdot z14.s, z6.b, z1.b[3]\n"
+      "sdot z18.s, z6.b, z2.b[3]\n"
+      "sdot z22.s, z6.b, z3.b[3]\n"
+      "sdot z26.s, z6.b, z4.b[3]\n"
+      "sdot z30.s, z6.b, z5.b[3]\n"
+      "sdot z11.s, z7.b, z0.b[3]\n"
+      "sdot z15.s, z7.b, z1.b[3]\n"
+      "sdot z19.s, z7.b, z2.b[3]\n"
+      "sdot z23.s, z7.b, z3.b[3]\n"
+      "sdot z27.s, z7.b, z4.b[3]\n"
+      "sdot z31.s, z7.b, z5.b[3]\n"
+      "bgt 78b\n"
+      "79:"  // Height 6: Multiply loop: Single iteration only
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "sdot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "sdot z9.s, z7.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "sdot z12.s, z6.b, z1.b[0]\n"
+      "ld1rqb { z3.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "sdot z16.s, z6.b, z2.b[0]\n"
+      "ld1rqb { z4.b }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "sdot z13.s, z7.b, z1.b[0]\n"
+      "ld1rqb { z5.b }, p0/Z, [x20]\n"
+      "add x22, x22, #0x10\n"
+      "sdot z20.s, z6.b, z3.b[0]\n"
+      "add x20, x20, #0x10\n"
+      "sdot z17.s, z7.b, z2.b[0]\n"
+      "sdot z24.s, z6.b, z4.b[0]\n"
+      "sdot z28.s, z6.b, z5.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "sdot z21.s, z7.b, z3.b[0]\n"
+      "sdot z25.s, z7.b, z4.b[0]\n"
+      "sdot z29.s, z7.b, z5.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[0]\n"
+      "sdot z14.s, z6.b, z1.b[0]\n"
+      "sdot z18.s, z6.b, z2.b[0]\n"
+      "sdot z22.s, z6.b, z3.b[0]\n"
+      "sdot z26.s, z6.b, z4.b[0]\n"
+      "sdot z30.s, z6.b, z5.b[0]\n"
+      "sdot z11.s, z7.b, z0.b[0]\n"
+      "sdot z15.s, z7.b, z1.b[0]\n"
+      "sdot z19.s, z7.b, z2.b[0]\n"
+      "sdot z23.s, z7.b, z3.b[0]\n"
+      "sdot z27.s, z7.b, z4.b[0]\n"
+      "sdot z31.s, z7.b, z5.b[0]\n"
+      "ble 80f\n"
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "sdot z12.s, z6.b, z1.b[1]\n"
+      "sdot z16.s, z6.b, z2.b[1]\n"
+      "sdot z20.s, z6.b, z3.b[1]\n"
+      "sdot z24.s, z6.b, z4.b[1]\n"
+      "sdot z28.s, z6.b, z5.b[1]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[1]\n"
+      "sdot z13.s, z7.b, z1.b[1]\n"
+      "sdot z17.s, z7.b, z2.b[1]\n"
+      "sdot z21.s, z7.b, z3.b[1]\n"
+      "sdot z25.s, z7.b, z4.b[1]\n"
+      "sdot z29.s, z7.b, z5.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[1]\n"
+      "sdot z14.s, z6.b, z1.b[1]\n"
+      "sdot z18.s, z6.b, z2.b[1]\n"
+      "sdot z22.s, z6.b, z3.b[1]\n"
+      "sdot z26.s, z6.b, z4.b[1]\n"
+      "sdot z30.s, z6.b, z5.b[1]\n"
+      "sdot z11.s, z7.b, z0.b[1]\n"
+      "sdot z15.s, z7.b, z1.b[1]\n"
+      "sdot z19.s, z7.b, z2.b[1]\n"
+      "sdot z23.s, z7.b, z3.b[1]\n"
+      "sdot z27.s, z7.b, z4.b[1]\n"
+      "sdot z31.s, z7.b, z5.b[1]\n"
+      "ble 80f\n"
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[2]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "sdot z12.s, z6.b, z1.b[2]\n"
+      "sdot z16.s, z6.b, z2.b[2]\n"
+      "sdot z20.s, z6.b, z3.b[2]\n"
+      "sdot z24.s, z6.b, z4.b[2]\n"
+      "sdot z28.s, z6.b, z5.b[2]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[2]\n"
+      "sdot z13.s, z7.b, z1.b[2]\n"
+      "sdot z17.s, z7.b, z2.b[2]\n"
+      "sdot z21.s, z7.b, z3.b[2]\n"
+      "sdot z25.s, z7.b, z4.b[2]\n"
+      "sdot z29.s, z7.b, z5.b[2]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[2]\n"
+      "sdot z14.s, z6.b, z1.b[2]\n"
+      "sdot z18.s, z6.b, z2.b[2]\n"
+      "sdot z22.s, z6.b, z3.b[2]\n"
+      "sdot z26.s, z6.b, z4.b[2]\n"
+      "sdot z30.s, z6.b, z5.b[2]\n"
+      "sdot z11.s, z7.b, z0.b[2]\n"
+      "sdot z15.s, z7.b, z1.b[2]\n"
+      "sdot z19.s, z7.b, z2.b[2]\n"
+      "sdot z23.s, z7.b, z3.b[2]\n"
+      "sdot z27.s, z7.b, z4.b[2]\n"
+      "sdot z31.s, z7.b, z5.b[2]\n"
+      "ble 80f\n"
+      "ld1b { z6.b }, p2/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[3]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+      "sdot z12.s, z6.b, z1.b[3]\n"
+      "sdot z16.s, z6.b, z2.b[3]\n"
+      "sdot z20.s, z6.b, z3.b[3]\n"
+      "sdot z24.s, z6.b, z4.b[3]\n"
+      "sdot z28.s, z6.b, z5.b[3]\n"
+      "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[3]\n"
+      "sdot z13.s, z7.b, z1.b[3]\n"
+      "sdot z17.s, z7.b, z2.b[3]\n"
+      "sdot z21.s, z7.b, z3.b[3]\n"
+      "sdot z25.s, z7.b, z4.b[3]\n"
+      "sdot z29.s, z7.b, z5.b[3]\n"
+      "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[3]\n"
+      "sdot z14.s, z6.b, z1.b[3]\n"
+      "sdot z18.s, z6.b, z2.b[3]\n"
+      "sdot z22.s, z6.b, z3.b[3]\n"
+      "sdot z26.s, z6.b, z4.b[3]\n"
+      "sdot z30.s, z6.b, z5.b[3]\n"
+      "sdot z11.s, z7.b, z0.b[3]\n"
+      "sdot z15.s, z7.b, z1.b[3]\n"
+      "sdot z19.s, z7.b, z2.b[3]\n"
+      "sdot z23.s, z7.b, z3.b[3]\n"
+      "sdot z27.s, z7.b, z4.b[3]\n"
+      "sdot z31.s, z7.b, z5.b[3]\n"
+      "80:"  // Height 6: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 75b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "prfm pstl1keep, [x21, #0x0]\n"
+      "ld1w { z0.s }, p2/Z, [x16]\n"
+      "add z8.s, z8.s, z0.s\n"
+      "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n"
+      "add z12.s, z12.s, z0.s\n"
+      "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n"
+      "add z16.s, z16.s, z0.s\n"
+      "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n"
+      "addvl x16, x16, #4\n"
+      "add z9.s, z9.s, z1.s\n"
+      "add z13.s, z13.s, z1.s\n"
+      "add z10.s, z10.s, z2.s\n"
+      "add z11.s, z11.s, z3.s\n"
+      "add z14.s, z14.s, z2.s\n"
+      "add z15.s, z15.s, z3.s\n"
+      "add z17.s, z17.s, z1.s\n"
+      "add z18.s, z18.s, z2.s\n"
+      "add z19.s, z19.s, z3.s\n"
+      "add z20.s, z20.s, z0.s\n"
+      "add z21.s, z21.s, z1.s\n"
+      "add z22.s, z22.s, z2.s\n"
+      "add z23.s, z23.s, z3.s\n"
+      "add z24.s, z24.s, z0.s\n"
+      "add z25.s, z25.s, z1.s\n"
+      "add z26.s, z26.s, z2.s\n"
+      "add z27.s, z27.s, z3.s\n"
+      "add z28.s, z28.s, z0.s\n"
+      "add z29.s, z29.s, z1.s\n"
+      "add z30.s, z30.s, z2.s\n"
+      "add z31.s, z31.s, z3.s\n"
+      "tbz %x[flags], #4, 81f\n"
+      "ld1w { z0.s }, p2/Z, [x17]\n"
+      "ld1w { z4.s }, p2/Z, [x8]\n"
+      "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n"
+      "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n"
+      "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n"
+      "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n"
+      "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n"
+      "addvl x17, x17, #4\n"
+      "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n"
+      "addvl x8, x8, #4\n"
+      "b 82f\n"
+      "81:"  // Height 6: per layer parameters
+      "add x19, %x[qp], %[per_layer_right_shift]\n"
+      "ld1rw { z0.s }, p2/Z, [x19]\n"
+      "mov z1.d, z0.d\n"
+      "add x19, %x[qp], %[per_layer_mul]\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      "mov z2.d, z0.d\n"
+      "mov z3.d, z0.d\n"
+      "mov z5.d, z4.d\n"
+      "mov z6.d, z4.d\n"
+      "mov z7.d, z4.d\n"
+      "82:"  // Height 6: parameters loaded
+      ".inst 0x04a47508  // sqrdmulh z8.s, z8.s, z4.s\n"
+      ".inst 0x04a57529  // sqrdmulh z9.s, z9.s, z5.s\n"
+      ".inst 0x04a6754a  // sqrdmulh z10.s, z10.s, z6.s\n"
+      ".inst 0x04a7756b  // sqrdmulh z11.s, z11.s, z7.s\n"
+      ".inst 0x04a4758c  // sqrdmulh z12.s, z12.s, z4.s\n"
+      ".inst 0x04a575ad  // sqrdmulh z13.s, z13.s, z5.s\n"
+      ".inst 0x04a675ce  // sqrdmulh z14.s, z14.s, z6.s\n"
+      ".inst 0x04a775ef  // sqrdmulh z15.s, z15.s, z7.s\n"
+      ".inst 0x04a47610  // sqrdmulh z16.s, z16.s, z4.s\n"
+      ".inst 0x04a57631  // sqrdmulh z17.s, z17.s, z5.s\n"
+      ".inst 0x04a67652  // sqrdmulh z18.s, z18.s, z6.s\n"
+      ".inst 0x04a77673  // sqrdmulh z19.s, z19.s, z7.s\n"
+      ".inst 0x04a47694  // sqrdmulh z20.s, z20.s, z4.s\n"
+      ".inst 0x04a576b5  // sqrdmulh z21.s, z21.s, z5.s\n"
+      ".inst 0x04a676d6  // sqrdmulh z22.s, z22.s, z6.s\n"
+      ".inst 0x04a776f7  // sqrdmulh z23.s, z23.s, z7.s\n"
+      ".inst 0x04a47718  // sqrdmulh z24.s, z24.s, z4.s\n"
+      ".inst 0x04a57739  // sqrdmulh z25.s, z25.s, z5.s\n"
+      ".inst 0x04a6775a  // sqrdmulh z26.s, z26.s, z6.s\n"
+      ".inst 0x04a7777b  // sqrdmulh z27.s, z27.s, z7.s\n"
+      ".inst 0x04a4779c  // sqrdmulh z28.s, z28.s, z4.s\n"
+      ".inst 0x04a577bd  // sqrdmulh z29.s, z29.s, z5.s\n"
+      ".inst 0x04a677de  // sqrdmulh z30.s, z30.s, z6.s\n"
+      ".inst 0x04a777ff  // sqrdmulh z31.s, z31.s, z7.s\n"
+      "tbz %x[flags], #5, 83f\n"
+      "and z4.d, z8.d, z0.d\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "and z5.d, z9.d, z1.d\n"
+      "and z6.d, z10.d, z2.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "and z7.d, z11.d, z3.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z8.s, z8.s, z4.s\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "and z4.d, z12.d, z0.d\n"
+      "sqadd z9.s, z9.s, z5.s\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "sqadd z10.s, z10.s, z6.s\n"
+      "and z5.d, z13.d, z1.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "sqadd z11.s, z11.s, z7.s\n"
+      "and z6.d, z14.d, z2.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z12.s, z12.s, z4.s\n"
+      "and z7.d, z15.d, z3.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "sqadd z13.s, z13.s, z5.s\n"
+      "and z4.d, z16.d, z0.d\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "sqadd z14.s, z14.s, z6.s\n"
+      "and z5.d, z17.d, z1.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "sqadd z15.s, z15.s, z7.s\n"
+      "and z6.d, z18.d, z2.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z16.s, z16.s, z4.s\n"
+      "and z7.d, z19.d, z3.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "sqadd z17.s, z17.s, z5.s\n"
+      "and z4.d, z20.d, z0.d\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "sqadd z18.s, z18.s, z6.s\n"
+      "and z5.d, z21.d, z1.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "sqadd z19.s, z19.s, z7.s\n"
+      "and z6.d, z22.d, z2.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z20.s, z20.s, z4.s\n"
+      "and z7.d, z23.d, z3.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "sqadd z21.s, z21.s, z5.s\n"
+      "and z4.d, z24.d, z0.d\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "sqadd z22.s, z22.s, z6.s\n"
+      "and z5.d, z25.d, z1.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "sqadd z23.s, z23.s, z7.s\n"
+      "and z6.d, z26.d, z2.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z24.s, z24.s, z4.s\n"
+      "and z7.d, z27.d, z3.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "sqadd z25.s, z25.s, z5.s\n"
+      "and z4.d, z28.d, z0.d\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "sqadd z26.s, z26.s, z6.s\n"
+      "and z5.d, z29.d, z1.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "sqadd z27.s, z27.s, z7.s\n"
+      "and z6.d, z30.d, z2.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z28.s, z28.s, z4.s\n"
+      "and z7.d, z31.d, z3.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "sqadd z29.s, z29.s, z5.s\n"
+      "sqadd z30.s, z30.s, z6.s\n"
+      "sqadd z31.s, z31.s, z7.s\n"
+      "83:"  // Height 6: no shift correction
+      ".inst 0x44828808  // srshl z8.s, p2/M, z8.s, z0.s\n"
+      "add x19, %x[qp], %[c_offset]\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      ".inst 0x44828829  // srshl z9.s, p2/M, z9.s, z1.s\n"
+      "add x19, %x[qp], %[minval]\n"
+      ".inst 0x4482884a  // srshl z10.s, p2/M, z10.s, z2.s\n"
+      "ld1rw { z5.s }, p2/Z, [x19]\n"
+      "add x19, %x[qp], %[maxval]\n"
+      ".inst 0x4482886b  // srshl z11.s, p2/M, z11.s, z3.s\n"
+      "ld1rw { z6.s }, p2/Z, [x19]\n"
+      ".inst 0x4482880c  // srshl z12.s, p2/M, z12.s, z0.s\n"
+      "add z8.s, z8.s, z4.s\n"
+      "add z9.s, z9.s, z4.s\n"
+      "add z10.s, z10.s, z4.s\n"
+      "add z11.s, z11.s, z4.s\n"
+      "add z12.s, z12.s, z4.s\n"
+      "smin z8.s, p2/M, z8.s, z6.s\n"
+      "smin z9.s, p2/M, z9.s, z6.s\n"
+      "smin z10.s, p2/M, z10.s, z6.s\n"
+      "smin z11.s, p2/M, z11.s, z6.s\n"
+      "smax z8.s, p2/M, z8.s, z5.s\n"
+      "smax z9.s, p2/M, z9.s, z5.s\n"
+      "smax z10.s, p2/M, z10.s, z5.s\n"
+      "smax z11.s, p2/M, z11.s, z5.s\n"
+      "smin z12.s, p2/M, z12.s, z6.s\n"
+      "uzp1 z8.h, z8.h, z9.h\n"
+      ".inst 0x4482882d  // srshl z13.s, p2/M, z13.s, z1.s\n"
+      "uzp1 z9.h, z10.h, z11.h\n"
+      "smax z12.s, p2/M, z12.s, z5.s\n"
+      "uzp1 z8.b, z8.b, z9.b\n"
+      "st1b { z8.b }, p1, [x13]\n"
+      "add z13.s, z13.s, z4.s\n"
+      "addvl x13, x13, #1\n"
+      ".inst 0x4482884e  // srshl z14.s, p2/M, z14.s, z2.s\n"
+      ".inst 0x4482886f  // srshl z15.s, p2/M, z15.s, z3.s\n"
+      ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
+      "smin z13.s, p2/M, z13.s, z6.s\n"
+      ".inst 0x44828831  // srshl z17.s, p2/M, z17.s, z1.s\n"
+      "add z14.s, z14.s, z4.s\n"
+      "add z15.s, z15.s, z4.s\n"
+      "add z16.s, z16.s, z4.s\n"
+      "add z17.s, z17.s, z4.s\n"
+      "smax z13.s, p2/M, z13.s, z5.s\n"
+      "smin z14.s, p2/M, z14.s, z6.s\n"
+      "smin z15.s, p2/M, z15.s, z6.s\n"
+      "smin z16.s, p2/M, z16.s, z6.s\n"
+      "uzp1 z12.h, z12.h, z13.h\n"
+      "smax z14.s, p2/M, z14.s, z5.s\n"
+      "smax z15.s, p2/M, z15.s, z5.s\n"
+      "smax z16.s, p2/M, z16.s, z5.s\n"
+      "smin z17.s, p2/M, z17.s, z6.s\n"
+      ".inst 0x44828852  // srshl z18.s, p2/M, z18.s, z2.s\n"
+      "uzp1 z13.h, z14.h, z15.h\n"
+      ".inst 0x44828873  // srshl z19.s, p2/M, z19.s, z3.s\n"
+      "uzp1 z12.b, z12.b, z13.b\n"
+      "st1b { z12.b }, p1, [x9]\n"
+      "add z18.s, z18.s, z4.s\n"
+      "addvl x9, x9, #1\n"
+      "add z19.s, z19.s, z4.s\n"
+      "smax z17.s, p2/M, z17.s, z5.s\n"
+      ".inst 0x44828814  // srshl z20.s, p2/M, z20.s, z0.s\n"
+      "smin z18.s, p2/M, z18.s, z6.s\n"
+      "smin z19.s, p2/M, z19.s, z6.s\n"
+      "uzp1 z16.h, z16.h, z17.h\n"
+      "add z20.s, z20.s, z4.s\n"
+      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "smax z19.s, p2/M, z19.s, z5.s\n"
+      "smin z20.s, p2/M, z20.s, z6.s\n"
+      ".inst 0x44828835  // srshl z21.s, p2/M, z21.s, z1.s\n"
+      ".inst 0x44828856  // srshl z22.s, p2/M, z22.s, z2.s\n"
+      "uzp1 z17.h, z18.h, z19.h\n"
+      "smax z20.s, p2/M, z20.s, z5.s\n"
+      "add z21.s, z21.s, z4.s\n"
+      "add z22.s, z22.s, z4.s\n"
+      "uzp1 z16.b, z16.b, z17.b\n"
+      "st1b { z16.b }, p1, [x27]\n"
+      "smin z21.s, p2/M, z21.s, z6.s\n"
+      "addvl x27, x27, #1\n"
+      "smin z22.s, p2/M, z22.s, z6.s\n"
+      ".inst 0x44828877  // srshl z23.s, p2/M, z23.s, z3.s\n"
+      ".inst 0x44828818  // srshl z24.s, p2/M, z24.s, z0.s\n"
+      ".inst 0x44828839  // srshl z25.s, p2/M, z25.s, z1.s\n"
+      "smax z21.s, p2/M, z21.s, z5.s\n"
+      "add z23.s, z23.s, z4.s\n"
+      "add z24.s, z24.s, z4.s\n"
+      "add z25.s, z25.s, z4.s\n"
+      "uzp1 z20.h, z20.h, z21.h\n"
+      "smax z22.s, p2/M, z22.s, z5.s\n"
+      "smin z23.s, p2/M, z23.s, z6.s\n"
+      "smin z24.s, p2/M, z24.s, z6.s\n"
+      "smin z25.s, p2/M, z25.s, z6.s\n"
+      ".inst 0x4482885a  // srshl z26.s, p2/M, z26.s, z2.s\n"
+      "smax z23.s, p2/M, z23.s, z5.s\n"
+      "smax z24.s, p2/M, z24.s, z5.s\n"
+      "smax z25.s, p2/M, z25.s, z5.s\n"
+      "add z26.s, z26.s, z4.s\n"
+      "uzp1 z21.h, z22.h, z23.h\n"
+      ".inst 0x4482887b  // srshl z27.s, p2/M, z27.s, z3.s\n"
+      "uzp1 z24.h, z24.h, z25.h\n"
+      "uzp1 z20.b, z20.b, z21.b\n"
+      "st1b { z20.b }, p1, [x25]\n"
+      "add z27.s, z27.s, z4.s\n"
+      "addvl x25, x25, #1\n"
+      "smin z26.s, p2/M, z26.s, z6.s\n"
+      ".inst 0x4482881c  // srshl z28.s, p2/M, z28.s, z0.s\n"
+      ".inst 0x4482883d  // srshl z29.s, p2/M, z29.s, z1.s\n"
+      "smin z27.s, p2/M, z27.s, z6.s\n"
+      ".inst 0x4482885e  // srshl z30.s, p2/M, z30.s, z2.s\n"
+      "add z28.s, z28.s, z4.s\n"
+      "add z29.s, z29.s, z4.s\n"
+      "smax z26.s, p2/M, z26.s, z5.s\n"
+      "add z30.s, z30.s, z4.s\n"
+      "smax z27.s, p2/M, z27.s, z5.s\n"
+      "smin z28.s, p2/M, z28.s, z6.s\n"
+      "smin z29.s, p2/M, z29.s, z6.s\n"
+      "smin z30.s, p2/M, z30.s, z6.s\n"
+      "uzp1 z25.h, z26.h, z27.h\n"
+      "smax z28.s, p2/M, z28.s, z5.s\n"
+      "uzp1 z24.b, z24.b, z25.b\n"
+      "st1b { z24.b }, p1, [x23]\n"
+      "smax z29.s, p2/M, z29.s, z5.s\n"
+      "addvl x23, x23, #1\n"
+      "smax z30.s, p2/M, z30.s, z5.s\n"
+      ".inst 0x4482887f  // srshl z31.s, p2/M, z31.s, z3.s\n"
+      "uzp1 z28.h, z28.h, z29.h\n"
+      "add z31.s, z31.s, z4.s\n"
+      "smin z31.s, p2/M, z31.s, z6.s\n"
+      "smax z31.s, p2/M, z31.s, z5.s\n"
+      "uzp1 z29.h, z30.h, z31.h\n"
+      "uzp1 z28.b, z28.b, z29.b\n"
+      "st1b { z28.b }, p1, [x21]\n"
+      "addvl x21, x21, #1\n"
+      "84:"  // Height 6: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x15, x15, x19\n"
+      "bgt 73b\n"
+      "subs %x[M], %x[M], #0x6\n"
+      "beq 86f\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "tbz %x[flags], #3, 85f\n"
+      "add x20, x20, #0x6\n"
+      "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "b 1b\n"
+      "85:"  // Update direct input
+      "mov x19, #0x6\n"
+      "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+      "b 1b\n"
+      "86:"  // Exit
+
+      : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+      : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+      : "cc", "memory", "p0", "p1", "p2", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+    );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp
deleted file mode 100644
index b30b8845a6..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp
+++ /dev/null
@@ -1,2137 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include <cstdint>
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation , bool accumulate) {
-    const int K_stride = ((K + 3) / 4) * 4;
-    const long loops_count = ((K + 16) / 32) - 1;
-    K -= loops_count * 32;
-    const long regs_count = (K / 16) - 1;
-    K -= (regs_count + 1) * 16;
-    const long leftovers = K;
-    const long blocks_count = (K + 3) / 4;
-
-    int rows_to_compute;
-
-    for (int y=0; y<M; y+=rows_to_compute) {
-        const int8_t * const a_ptr0_base = A + (y * lda);
-        const unsigned long ldab = lda * sizeof(int8_t);
-
-        int32_t *c_ptr0 = C + (y * ldc);
-
-        rows_to_compute = M-y;
-        if (rows_to_compute > 4) {
-            if (rows_to_compute % 4) {
-                rows_to_compute = 4 - 1;
-            } else {
-                rows_to_compute = 4;
-            }
-        }
-
-        for (int x0=0; x0<N; x0+=(4 * get_vector_length<int32_t>())) {
-            const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<int32_t>()));
-            long loops = loops_count;
-            long regs = regs_count;
-            long temp = 0;
-            long blocks = blocks_count;
-            const int8_t *a_ptr0 = a_ptr0_base;
-            const int8_t *b_ptr0 = B + (K_stride * x0);
-            const unsigned long ldcb = ldc * sizeof(int32_t);
-
-            switch(rows_to_compute) {
-                case 1:
-                    __asm __volatile (
-                        "whilelt p6.b, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.b\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "mov z16.s, #0\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                        "mov z17.s, #0\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "mov z18.s, #0\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "mov z19.s, #0\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "sdot z16.s, z12.b, z4.b[3]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "sdot z17.s, z13.b, z4.b[3]\n"
-                        "sdot z18.s, z14.b, z4.b[3]\n"
-                        "sdot z19.s, z15.b, z4.b[3]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "sdot z16.s, z12.b, z4.b[3]\n"
-                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
-                        "sdot z17.s, z13.b, z4.b[3]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "sdot z18.s, z14.b, z4.b[3]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "sdot z19.s, z15.b, z4.b[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "sdot z16.s, z12.b, z4.b[3]\n"
-                        "sdot z17.s, z13.b, z4.b[3]\n"
-                        "sdot z18.s, z14.b, z4.b[3]\n"
-                        "sdot z19.s, z15.b, z4.b[3]\n"
-                        "5:\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
-                    );
-                    break;
-                case 2:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "whilelt p6.b, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.b\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "mov z16.s, #0\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                        "mov z17.s, #0\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                        "mov z18.s, #0\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "mov z19.s, #0\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "mov z20.s, #0\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "mov z21.s, #0\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z22.s, #0\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "mov z23.s, #0\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1w z20.s, p0/z, [c_ptr1]\n"
-                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
-                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "sdot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "sdot z23.s, z11.b, z1.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "sdot z21.s, z13.b, z1.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "sdot z21.s, z9.b, z1.b[2]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "sdot z20.s, z12.b, z1.b[3]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "sdot z21.s, z13.b, z1.b[3]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "sdot z22.s, z14.b, z1.b[3]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "sdot z23.s, z15.b, z1.b[3]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
-                        "sdot z20.s, z8.b, z5.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
-                        "sdot z21.s, z9.b, z5.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "sdot z23.s, z11.b, z5.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "sdot z20.s, z12.b, z5.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
-                        "sdot z21.s, z13.b, z5.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "sdot z22.s, z14.b, z5.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "sdot z23.s, z15.b, z5.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "sdot z20.s, z8.b, z5.b[2]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
-                        "sdot z21.s, z9.b, z5.b[2]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "sdot z22.s, z10.b, z5.b[2]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "sdot z23.s, z11.b, z5.b[2]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "sdot z16.s, z12.b, z4.b[3]\n"
-                        "sdot z20.s, z12.b, z5.b[3]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "sdot z17.s, z13.b, z4.b[3]\n"
-                        "sdot z21.s, z13.b, z5.b[3]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "sdot z18.s, z14.b, z4.b[3]\n"
-                        "sdot z22.s, z14.b, z5.b[3]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "sdot z19.s, z15.b, z4.b[3]\n"
-                        "sdot z23.s, z15.b, z5.b[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "sdot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "sdot z23.s, z11.b, z1.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z21.s, z13.b, z1.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "sdot z21.s, z9.b, z1.b[2]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "sdot z20.s, z12.b, z1.b[3]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "sdot z21.s, z13.b, z1.b[3]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "sdot z22.s, z14.b, z1.b[3]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
-                        "sdot z23.s, z15.b, z1.b[3]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
-                        "sdot z20.s, z8.b, z5.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "sdot z21.s, z9.b, z5.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "sdot z23.s, z11.b, z5.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "sdot z20.s, z12.b, z5.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
-                        "sdot z21.s, z13.b, z5.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "sdot z22.s, z14.b, z5.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "sdot z23.s, z15.b, z5.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "sdot z20.s, z8.b, z5.b[2]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
-                        "sdot z21.s, z9.b, z5.b[2]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "sdot z22.s, z10.b, z5.b[2]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "sdot z23.s, z11.b, z5.b[2]\n"
-                        "sdot z16.s, z12.b, z4.b[3]\n"
-                        "sdot z20.s, z12.b, z5.b[3]\n"
-                        "sdot z17.s, z13.b, z4.b[3]\n"
-                        "sdot z21.s, z13.b, z5.b[3]\n"
-                        "sdot z18.s, z14.b, z4.b[3]\n"
-                        "sdot z22.s, z14.b, z5.b[3]\n"
-                        "sdot z19.s, z15.b, z4.b[3]\n"
-                        "sdot z23.s, z15.b, z5.b[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "sdot z20.s, z8.b, z1.b[0]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "sdot z23.s, z11.b, z1.b[0]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z21.s, z13.b, z1.b[1]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "sdot z21.s, z9.b, z1.b[2]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "sdot z20.s, z12.b, z1.b[3]\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "sdot z21.s, z13.b, z1.b[3]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "sdot z22.s, z14.b, z1.b[3]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "sdot z23.s, z15.b, z1.b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "sdot z20.s, z8.b, z1.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "sdot z23.s, z11.b, z1.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z21.s, z13.b, z1.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "sdot z21.s, z9.b, z1.b[2]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "sdot z20.s, z12.b, z1.b[3]\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "sdot z21.s, z13.b, z1.b[3]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "sdot z22.s, z14.b, z1.b[3]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "sdot z23.s, z15.b, z1.b[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "sdot z20.s, z8.b, z5.b[0]\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
-                        "sdot z21.s, z9.b, z5.b[0]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "sdot z23.s, z11.b, z5.b[0]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "sdot z20.s, z12.b, z5.b[1]\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
-                        "sdot z21.s, z13.b, z5.b[1]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "sdot z22.s, z14.b, z5.b[1]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "sdot z23.s, z15.b, z5.b[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "sdot z20.s, z8.b, z5.b[2]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
-                        "sdot z21.s, z9.b, z5.b[2]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "sdot z22.s, z10.b, z5.b[2]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "sdot z23.s, z11.b, z5.b[2]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "sdot z16.s, z12.b, z4.b[3]\n"
-                        "sdot z20.s, z12.b, z5.b[3]\n"
-                        "sdot z17.s, z13.b, z4.b[3]\n"
-                        "sdot z21.s, z13.b, z5.b[3]\n"
-                        "sdot z18.s, z14.b, z4.b[3]\n"
-                        "sdot z22.s, z14.b, z5.b[3]\n"
-                        "sdot z19.s, z15.b, z4.b[3]\n"
-                        "sdot z23.s, z15.b, z5.b[3]\n"
-                        "5:\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "st1w z20.s, p0, [c_ptr1]\n"
-                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
-                    );
-                    break;
-                case 3:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "whilelt p6.b, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.b\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "mov z16.s, #0\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                        "mov z17.s, #0\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                        "mov z18.s, #0\n"
-                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                        "mov z19.s, #0\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "mov z20.s, #0\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "mov z21.s, #0\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "mov z22.s, #0\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z23.s, #0\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "mov z24.s, #0\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "mov z25.s, #0\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "mov z26.s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "mov z27.s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1w z20.s, p0/z, [c_ptr1]\n"
-                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
-                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
-                        "ld1w z24.s, p0/z, [c_ptr2]\n"
-                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
-                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
-                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "sdot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "sdot z24.s, z8.b, z2.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "sdot z26.s, z10.b, z2.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "sdot z23.s, z11.b, z1.b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "sdot z27.s, z11.b, z2.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
-                        "sdot z24.s, z12.b, z2.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z21.s, z13.b, z1.b[1]\n"
-                        "sdot z25.s, z13.b, z2.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "sdot z26.s, z14.b, z2.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
-                        "sdot z27.s, z15.b, z2.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z24.s, z8.b, z2.b[2]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "sdot z21.s, z9.b, z1.b[2]\n"
-                        "sdot z25.s, z9.b, z2.b[2]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "sdot z26.s, z10.b, z2.b[2]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
-                        "sdot z27.s, z11.b, z2.b[2]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "sdot z20.s, z12.b, z1.b[3]\n"
-                        "sdot z24.s, z12.b, z2.b[3]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "sdot z21.s, z13.b, z1.b[3]\n"
-                        "sdot z25.s, z13.b, z2.b[3]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "sdot z22.s, z14.b, z1.b[3]\n"
-                        "sdot z26.s, z14.b, z2.b[3]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "sdot z23.s, z15.b, z1.b[3]\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
-                        "sdot z27.s, z15.b, z2.b[3]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
-                        "sdot z20.s, z8.b, z5.b[0]\n"
-                        "sdot z24.s, z8.b, z6.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
-                        "sdot z21.s, z9.b, z5.b[0]\n"
-                        "sdot z25.s, z9.b, z6.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
-                        "sdot z26.s, z10.b, z6.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "sdot z23.s, z11.b, z5.b[0]\n"
-                        "sdot z27.s, z11.b, z6.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "sdot z20.s, z12.b, z5.b[1]\n"
-                        "sdot z24.s, z12.b, z6.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
-                        "sdot z21.s, z13.b, z5.b[1]\n"
-                        "sdot z25.s, z13.b, z6.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "sdot z22.s, z14.b, z5.b[1]\n"
-                        "sdot z26.s, z14.b, z6.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "sdot z23.s, z15.b, z5.b[1]\n"
-                        "sdot z27.s, z15.b, z6.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "sdot z20.s, z8.b, z5.b[2]\n"
-                        "sdot z24.s, z8.b, z6.b[2]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "sdot z21.s, z9.b, z5.b[2]\n"
-                        "sdot z25.s, z9.b, z6.b[2]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "sdot z22.s, z10.b, z5.b[2]\n"
-                        "sdot z26.s, z10.b, z6.b[2]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "sdot z23.s, z11.b, z5.b[2]\n"
-                        "sdot z27.s, z11.b, z6.b[2]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "sdot z16.s, z12.b, z4.b[3]\n"
-                        "sdot z20.s, z12.b, z5.b[3]\n"
-                        "sdot z24.s, z12.b, z6.b[3]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "sdot z17.s, z13.b, z4.b[3]\n"
-                        "sdot z21.s, z13.b, z5.b[3]\n"
-                        "sdot z25.s, z13.b, z6.b[3]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "sdot z18.s, z14.b, z4.b[3]\n"
-                        "sdot z22.s, z14.b, z5.b[3]\n"
-                        "sdot z26.s, z14.b, z6.b[3]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "sdot z19.s, z15.b, z4.b[3]\n"
-                        "sdot z23.s, z15.b, z5.b[3]\n"
-                        "sdot z27.s, z15.b, z6.b[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "sdot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "sdot z24.s, z8.b, z2.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "sdot z26.s, z10.b, z2.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "sdot z23.s, z11.b, z1.b[0]\n"
-                        "sdot z27.s, z11.b, z2.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
-                        "sdot z24.s, z12.b, z2.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z21.s, z13.b, z1.b[1]\n"
-                        "sdot z25.s, z13.b, z2.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "sdot z26.s, z14.b, z2.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
-                        "sdot z27.s, z15.b, z2.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z24.s, z8.b, z2.b[2]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "sdot z21.s, z9.b, z1.b[2]\n"
-                        "sdot z25.s, z9.b, z2.b[2]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "sdot z26.s, z10.b, z2.b[2]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
-                        "sdot z27.s, z11.b, z2.b[2]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "sdot z20.s, z12.b, z1.b[3]\n"
-                        "sdot z24.s, z12.b, z2.b[3]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "sdot z21.s, z13.b, z1.b[3]\n"
-                        "sdot z25.s, z13.b, z2.b[3]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "sdot z22.s, z14.b, z1.b[3]\n"
-                        "sdot z26.s, z14.b, z2.b[3]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
-                        "sdot z23.s, z15.b, z1.b[3]\n"
-                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
-                        "sdot z27.s, z15.b, z2.b[3]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
-                        "sdot z20.s, z8.b, z5.b[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "sdot z24.s, z8.b, z6.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        "sdot z21.s, z9.b, z5.b[0]\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        "sdot z25.s, z9.b, z6.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
-                        "sdot z26.s, z10.b, z6.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "sdot z23.s, z11.b, z5.b[0]\n"
-                        "sdot z27.s, z11.b, z6.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "sdot z20.s, z12.b, z5.b[1]\n"
-                        "sdot z24.s, z12.b, z6.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
-                        "sdot z21.s, z13.b, z5.b[1]\n"
-                        "sdot z25.s, z13.b, z6.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "sdot z22.s, z14.b, z5.b[1]\n"
-                        "sdot z26.s, z14.b, z6.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "sdot z23.s, z15.b, z5.b[1]\n"
-                        "sdot z27.s, z15.b, z6.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "sdot z20.s, z8.b, z5.b[2]\n"
-                        "sdot z24.s, z8.b, z6.b[2]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
-                        "sdot z21.s, z9.b, z5.b[2]\n"
-                        "sdot z25.s, z9.b, z6.b[2]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "sdot z22.s, z10.b, z5.b[2]\n"
-                        "sdot z26.s, z10.b, z6.b[2]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "sdot z23.s, z11.b, z5.b[2]\n"
-                        "sdot z27.s, z11.b, z6.b[2]\n"
-                        "sdot z16.s, z12.b, z4.b[3]\n"
-                        "sdot z20.s, z12.b, z5.b[3]\n"
-                        "sdot z24.s, z12.b, z6.b[3]\n"
-                        "sdot z17.s, z13.b, z4.b[3]\n"
-                        "sdot z21.s, z13.b, z5.b[3]\n"
-                        "sdot z25.s, z13.b, z6.b[3]\n"
-                        "sdot z18.s, z14.b, z4.b[3]\n"
-                        "sdot z22.s, z14.b, z5.b[3]\n"
-                        "sdot z26.s, z14.b, z6.b[3]\n"
-                        "sdot z19.s, z15.b, z4.b[3]\n"
-                        "sdot z23.s, z15.b, z5.b[3]\n"
-                        "sdot z27.s, z15.b, z6.b[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "sdot z20.s, z8.b, z1.b[0]\n"
-                        "sdot z24.s, z8.b, z2.b[0]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "sdot z26.s, z10.b, z2.b[0]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "sdot z23.s, z11.b, z1.b[0]\n"
-                        "sdot z27.s, z11.b, z2.b[0]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
-                        "sdot z24.s, z12.b, z2.b[1]\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z21.s, z13.b, z1.b[1]\n"
-                        "sdot z25.s, z13.b, z2.b[1]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "sdot z26.s, z14.b, z2.b[1]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
-                        "sdot z27.s, z15.b, z2.b[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z24.s, z8.b, z2.b[2]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "sdot z21.s, z9.b, z1.b[2]\n"
-                        "sdot z25.s, z9.b, z2.b[2]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "sdot z26.s, z10.b, z2.b[2]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
-                        "sdot z27.s, z11.b, z2.b[2]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "sdot z20.s, z12.b, z1.b[3]\n"
-                        "sdot z24.s, z12.b, z2.b[3]\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "sdot z21.s, z13.b, z1.b[3]\n"
-                        "sdot z25.s, z13.b, z2.b[3]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "sdot z22.s, z14.b, z1.b[3]\n"
-                        "sdot z26.s, z14.b, z2.b[3]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "sdot z23.s, z15.b, z1.b[3]\n"
-                        "sdot z27.s, z15.b, z2.b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "sdot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
-                        "sdot z24.s, z8.b, z2.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "ld1rqb z6.b, p6/z, [a_ptr2]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        "sdot z26.s, z10.b, z2.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        "sdot z23.s, z11.b, z1.b[0]\n"
-                        "sdot z27.s, z11.b, z2.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
-                        "sdot z24.s, z12.b, z2.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z21.s, z13.b, z1.b[1]\n"
-                        "sdot z25.s, z13.b, z2.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "sdot z26.s, z14.b, z2.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
-                        "sdot z27.s, z15.b, z2.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z24.s, z8.b, z2.b[2]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "sdot z21.s, z9.b, z1.b[2]\n"
-                        "sdot z25.s, z9.b, z2.b[2]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "sdot z26.s, z10.b, z2.b[2]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
-                        "sdot z27.s, z11.b, z2.b[2]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "sdot z20.s, z12.b, z1.b[3]\n"
-                        "sdot z24.s, z12.b, z2.b[3]\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "sdot z21.s, z13.b, z1.b[3]\n"
-                        "sdot z25.s, z13.b, z2.b[3]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "sdot z22.s, z14.b, z1.b[3]\n"
-                        "sdot z26.s, z14.b, z2.b[3]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "sdot z23.s, z15.b, z1.b[3]\n"
-                        "sdot z27.s, z15.b, z2.b[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "sdot z20.s, z8.b, z5.b[0]\n"
-                        "sdot z24.s, z8.b, z6.b[0]\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
-                        "sdot z21.s, z9.b, z5.b[0]\n"
-                        "sdot z25.s, z9.b, z6.b[0]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
-                        "sdot z26.s, z10.b, z6.b[0]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "sdot z23.s, z11.b, z5.b[0]\n"
-                        "sdot z27.s, z11.b, z6.b[0]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "sdot z20.s, z12.b, z5.b[1]\n"
-                        "sdot z24.s, z12.b, z6.b[1]\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
-                        "sdot z21.s, z13.b, z5.b[1]\n"
-                        "sdot z25.s, z13.b, z6.b[1]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "sdot z22.s, z14.b, z5.b[1]\n"
-                        "sdot z26.s, z14.b, z6.b[1]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "sdot z23.s, z15.b, z5.b[1]\n"
-                        "sdot z27.s, z15.b, z6.b[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "sdot z20.s, z8.b, z5.b[2]\n"
-                        "sdot z24.s, z8.b, z6.b[2]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
-                        "sdot z21.s, z9.b, z5.b[2]\n"
-                        "sdot z25.s, z9.b, z6.b[2]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "sdot z22.s, z10.b, z5.b[2]\n"
-                        "sdot z26.s, z10.b, z6.b[2]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "sdot z23.s, z11.b, z5.b[2]\n"
-                        "sdot z27.s, z11.b, z6.b[2]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "sdot z16.s, z12.b, z4.b[3]\n"
-                        "sdot z20.s, z12.b, z5.b[3]\n"
-                        "sdot z24.s, z12.b, z6.b[3]\n"
-                        "sdot z17.s, z13.b, z4.b[3]\n"
-                        "sdot z21.s, z13.b, z5.b[3]\n"
-                        "sdot z25.s, z13.b, z6.b[3]\n"
-                        "sdot z18.s, z14.b, z4.b[3]\n"
-                        "sdot z22.s, z14.b, z5.b[3]\n"
-                        "sdot z26.s, z14.b, z6.b[3]\n"
-                        "sdot z19.s, z15.b, z4.b[3]\n"
-                        "sdot z23.s, z15.b, z5.b[3]\n"
-                        "sdot z27.s, z15.b, z6.b[3]\n"
-                        "5:\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "st1w z20.s, p0, [c_ptr1]\n"
-                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
-                        "st1w z24.s, p0, [c_ptr2]\n"
-                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
-                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
-                    );
-                    break;
-                default:
-                case 4:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "whilelt p6.b, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.b\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "mov z16.s, #0\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                        "mov z17.s, #0\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                        "mov z18.s, #0\n"
-                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                        "mov z19.s, #0\n"
-                        "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                        "mov z20.s, #0\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "mov z21.s, #0\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "mov z22.s, #0\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "mov z23.s, #0\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z24.s, #0\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "mov z25.s, #0\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "mov z26.s, #0\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "mov z27.s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "mov z28.s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "mov z29.s, #0\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "mov z30.s, #0\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "mov z31.s, #0\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1w z20.s, p0/z, [c_ptr1]\n"
-                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
-                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
-                        "ld1w z24.s, p0/z, [c_ptr2]\n"
-                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
-                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
-                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
-                        "ld1w z28.s, p0/z, [c_ptr3]\n"
-                        "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
-                        "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
-                        "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "sdot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "sdot z24.s, z8.b, z2.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "sdot z28.s, z8.b, z3.b[0]\n"
-                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "sdot z29.s, z9.b, z3.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "sdot z26.s, z10.b, z2.b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "sdot z30.s, z10.b, z3.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        "sdot z23.s, z11.b, z1.b[0]\n"
-                        "sdot z27.s, z11.b, z2.b[0]\n"
-                        "sdot z31.s, z11.b, z3.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
-                        "sdot z24.s, z12.b, z2.b[1]\n"
-                        "sdot z28.s, z12.b, z3.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z21.s, z13.b, z1.b[1]\n"
-                        "sdot z25.s, z13.b, z2.b[1]\n"
-                        "sdot z29.s, z13.b, z3.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "sdot z26.s, z14.b, z2.b[1]\n"
-                        "sdot z30.s, z14.b, z3.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
-                        "sdot z27.s, z15.b, z2.b[1]\n"
-                        "sdot z31.s, z15.b, z3.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z24.s, z8.b, z2.b[2]\n"
-                        "sdot z28.s, z8.b, z3.b[2]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "sdot z21.s, z9.b, z1.b[2]\n"
-                        "sdot z25.s, z9.b, z2.b[2]\n"
-                        "sdot z29.s, z9.b, z3.b[2]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "sdot z26.s, z10.b, z2.b[2]\n"
-                        "sdot z30.s, z10.b, z3.b[2]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
-                        "sdot z27.s, z11.b, z2.b[2]\n"
-                        "sdot z31.s, z11.b, z3.b[2]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "sdot z20.s, z12.b, z1.b[3]\n"
-                        "sdot z24.s, z12.b, z2.b[3]\n"
-                        "sdot z28.s, z12.b, z3.b[3]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "sdot z21.s, z13.b, z1.b[3]\n"
-                        "sdot z25.s, z13.b, z2.b[3]\n"
-                        "sdot z29.s, z13.b, z3.b[3]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "sdot z22.s, z14.b, z1.b[3]\n"
-                        "sdot z26.s, z14.b, z2.b[3]\n"
-                        "sdot z30.s, z14.b, z3.b[3]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "sdot z23.s, z15.b, z1.b[3]\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
-                        "sdot z27.s, z15.b, z2.b[3]\n"
-                        "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
-                        "sdot z31.s, z15.b, z3.b[3]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n"
-                        "sdot z20.s, z8.b, z5.b[0]\n"
-                        "sdot z24.s, z8.b, z6.b[0]\n"
-                        "sdot z28.s, z8.b, z7.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
-                        "sdot z21.s, z9.b, z5.b[0]\n"
-                        "sdot z25.s, z9.b, z6.b[0]\n"
-                        "sdot z29.s, z9.b, z7.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
-                        "sdot z26.s, z10.b, z6.b[0]\n"
-                        "sdot z30.s, z10.b, z7.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "sdot z23.s, z11.b, z5.b[0]\n"
-                        "sdot z27.s, z11.b, z6.b[0]\n"
-                        "sdot z31.s, z11.b, z7.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "sdot z20.s, z12.b, z5.b[1]\n"
-                        "sdot z24.s, z12.b, z6.b[1]\n"
-                        "sdot z28.s, z12.b, z7.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
-                        "sdot z21.s, z13.b, z5.b[1]\n"
-                        "sdot z25.s, z13.b, z6.b[1]\n"
-                        "sdot z29.s, z13.b, z7.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "sdot z22.s, z14.b, z5.b[1]\n"
-                        "sdot z26.s, z14.b, z6.b[1]\n"
-                        "sdot z30.s, z14.b, z7.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "sdot z23.s, z15.b, z5.b[1]\n"
-                        "sdot z27.s, z15.b, z6.b[1]\n"
-                        "sdot z31.s, z15.b, z7.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "sdot z20.s, z8.b, z5.b[2]\n"
-                        "sdot z24.s, z8.b, z6.b[2]\n"
-                        "sdot z28.s, z8.b, z7.b[2]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
-                        "sdot z21.s, z9.b, z5.b[2]\n"
-                        "sdot z25.s, z9.b, z6.b[2]\n"
-                        "sdot z29.s, z9.b, z7.b[2]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "sdot z22.s, z10.b, z5.b[2]\n"
-                        "sdot z26.s, z10.b, z6.b[2]\n"
-                        "sdot z30.s, z10.b, z7.b[2]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "sdot z23.s, z11.b, z5.b[2]\n"
-                        "sdot z27.s, z11.b, z6.b[2]\n"
-                        "sdot z31.s, z11.b, z7.b[2]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "sdot z16.s, z12.b, z4.b[3]\n"
-                        "sdot z20.s, z12.b, z5.b[3]\n"
-                        "sdot z24.s, z12.b, z6.b[3]\n"
-                        "sdot z28.s, z12.b, z7.b[3]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "sdot z17.s, z13.b, z4.b[3]\n"
-                        "sdot z21.s, z13.b, z5.b[3]\n"
-                        "sdot z25.s, z13.b, z6.b[3]\n"
-                        "sdot z29.s, z13.b, z7.b[3]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "sdot z18.s, z14.b, z4.b[3]\n"
-                        "sdot z22.s, z14.b, z5.b[3]\n"
-                        "sdot z26.s, z14.b, z6.b[3]\n"
-                        "sdot z30.s, z14.b, z7.b[3]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "sdot z19.s, z15.b, z4.b[3]\n"
-                        "sdot z23.s, z15.b, z5.b[3]\n"
-                        "sdot z27.s, z15.b, z6.b[3]\n"
-                        "sdot z31.s, z15.b, z7.b[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "sdot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "sdot z24.s, z8.b, z2.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "sdot z28.s, z8.b, z3.b[0]\n"
-                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "sdot z29.s, z9.b, z3.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "sdot z26.s, z10.b, z2.b[0]\n"
-                        "sdot z30.s, z10.b, z3.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "sdot z23.s, z11.b, z1.b[0]\n"
-                        "sdot z27.s, z11.b, z2.b[0]\n"
-                        "sdot z31.s, z11.b, z3.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
-                        "sdot z24.s, z12.b, z2.b[1]\n"
-                        "sdot z28.s, z12.b, z3.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z21.s, z13.b, z1.b[1]\n"
-                        "sdot z25.s, z13.b, z2.b[1]\n"
-                        "sdot z29.s, z13.b, z3.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "sdot z26.s, z14.b, z2.b[1]\n"
-                        "sdot z30.s, z14.b, z3.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
-                        "sdot z27.s, z15.b, z2.b[1]\n"
-                        "sdot z31.s, z15.b, z3.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z24.s, z8.b, z2.b[2]\n"
-                        "sdot z28.s, z8.b, z3.b[2]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "sdot z21.s, z9.b, z1.b[2]\n"
-                        "sdot z25.s, z9.b, z2.b[2]\n"
-                        "sdot z29.s, z9.b, z3.b[2]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "sdot z26.s, z10.b, z2.b[2]\n"
-                        "sdot z30.s, z10.b, z3.b[2]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
-                        "sdot z27.s, z11.b, z2.b[2]\n"
-                        "sdot z31.s, z11.b, z3.b[2]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "sdot z20.s, z12.b, z1.b[3]\n"
-                        "sdot z24.s, z12.b, z2.b[3]\n"
-                        "sdot z28.s, z12.b, z3.b[3]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "sdot z21.s, z13.b, z1.b[3]\n"
-                        "sdot z25.s, z13.b, z2.b[3]\n"
-                        "sdot z29.s, z13.b, z3.b[3]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "sdot z22.s, z14.b, z1.b[3]\n"
-                        "sdot z26.s, z14.b, z2.b[3]\n"
-                        "sdot z30.s, z14.b, z3.b[3]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
-                        "sdot z23.s, z15.b, z1.b[3]\n"
-                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
-                        "sdot z27.s, z15.b, z2.b[3]\n"
-                        "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
-                        "sdot z31.s, z15.b, z3.b[3]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
-                        "sdot z20.s, z8.b, z5.b[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "sdot z24.s, z8.b, z6.b[0]\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        "sdot z28.s, z8.b, z7.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        "sdot z21.s, z9.b, z5.b[0]\n"
-                        "addvl a_ptr3, a_ptr3, #2\n"
-                        "sdot z25.s, z9.b, z6.b[0]\n"
-                        "sdot z29.s, z9.b, z7.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
-                        "sdot z26.s, z10.b, z6.b[0]\n"
-                        "sdot z30.s, z10.b, z7.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "sdot z23.s, z11.b, z5.b[0]\n"
-                        "sdot z27.s, z11.b, z6.b[0]\n"
-                        "sdot z31.s, z11.b, z7.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "sdot z20.s, z12.b, z5.b[1]\n"
-                        "sdot z24.s, z12.b, z6.b[1]\n"
-                        "sdot z28.s, z12.b, z7.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
-                        "sdot z21.s, z13.b, z5.b[1]\n"
-                        "sdot z25.s, z13.b, z6.b[1]\n"
-                        "sdot z29.s, z13.b, z7.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "sdot z22.s, z14.b, z5.b[1]\n"
-                        "sdot z26.s, z14.b, z6.b[1]\n"
-                        "sdot z30.s, z14.b, z7.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "sdot z23.s, z15.b, z5.b[1]\n"
-                        "sdot z27.s, z15.b, z6.b[1]\n"
-                        "sdot z31.s, z15.b, z7.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "sdot z20.s, z8.b, z5.b[2]\n"
-                        "sdot z24.s, z8.b, z6.b[2]\n"
-                        "sdot z28.s, z8.b, z7.b[2]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
-                        "sdot z21.s, z9.b, z5.b[2]\n"
-                        "sdot z25.s, z9.b, z6.b[2]\n"
-                        "sdot z29.s, z9.b, z7.b[2]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "sdot z22.s, z10.b, z5.b[2]\n"
-                        "sdot z26.s, z10.b, z6.b[2]\n"
-                        "sdot z30.s, z10.b, z7.b[2]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "sdot z23.s, z11.b, z5.b[2]\n"
-                        "sdot z27.s, z11.b, z6.b[2]\n"
-                        "sdot z31.s, z11.b, z7.b[2]\n"
-                        "sdot z16.s, z12.b, z4.b[3]\n"
-                        "sdot z20.s, z12.b, z5.b[3]\n"
-                        "sdot z24.s, z12.b, z6.b[3]\n"
-                        "sdot z28.s, z12.b, z7.b[3]\n"
-                        "sdot z17.s, z13.b, z4.b[3]\n"
-                        "sdot z21.s, z13.b, z5.b[3]\n"
-                        "sdot z25.s, z13.b, z6.b[3]\n"
-                        "sdot z29.s, z13.b, z7.b[3]\n"
-                        "sdot z18.s, z14.b, z4.b[3]\n"
-                        "sdot z22.s, z14.b, z5.b[3]\n"
-                        "sdot z26.s, z14.b, z6.b[3]\n"
-                        "sdot z30.s, z14.b, z7.b[3]\n"
-                        "sdot z19.s, z15.b, z4.b[3]\n"
-                        "sdot z23.s, z15.b, z5.b[3]\n"
-                        "sdot z27.s, z15.b, z6.b[3]\n"
-                        "sdot z31.s, z15.b, z7.b[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "sdot z20.s, z8.b, z1.b[0]\n"
-                        "sdot z24.s, z8.b, z2.b[0]\n"
-                        "sdot z28.s, z8.b, z3.b[0]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "sdot z29.s, z9.b, z3.b[0]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "sdot z26.s, z10.b, z2.b[0]\n"
-                        "sdot z30.s, z10.b, z3.b[0]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "sdot z23.s, z11.b, z1.b[0]\n"
-                        "sdot z27.s, z11.b, z2.b[0]\n"
-                        "sdot z31.s, z11.b, z3.b[0]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
-                        "sdot z24.s, z12.b, z2.b[1]\n"
-                        "sdot z28.s, z12.b, z3.b[1]\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z21.s, z13.b, z1.b[1]\n"
-                        "sdot z25.s, z13.b, z2.b[1]\n"
-                        "sdot z29.s, z13.b, z3.b[1]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "sdot z26.s, z14.b, z2.b[1]\n"
-                        "sdot z30.s, z14.b, z3.b[1]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
-                        "sdot z27.s, z15.b, z2.b[1]\n"
-                        "sdot z31.s, z15.b, z3.b[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z24.s, z8.b, z2.b[2]\n"
-                        "sdot z28.s, z8.b, z3.b[2]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "sdot z21.s, z9.b, z1.b[2]\n"
-                        "sdot z25.s, z9.b, z2.b[2]\n"
-                        "sdot z29.s, z9.b, z3.b[2]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "sdot z26.s, z10.b, z2.b[2]\n"
-                        "sdot z30.s, z10.b, z3.b[2]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
-                        "sdot z27.s, z11.b, z2.b[2]\n"
-                        "sdot z31.s, z11.b, z3.b[2]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "sdot z20.s, z12.b, z1.b[3]\n"
-                        "sdot z24.s, z12.b, z2.b[3]\n"
-                        "sdot z28.s, z12.b, z3.b[3]\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "sdot z21.s, z13.b, z1.b[3]\n"
-                        "sdot z25.s, z13.b, z2.b[3]\n"
-                        "sdot z29.s, z13.b, z3.b[3]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "sdot z22.s, z14.b, z1.b[3]\n"
-                        "sdot z26.s, z14.b, z2.b[3]\n"
-                        "sdot z30.s, z14.b, z3.b[3]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "sdot z23.s, z15.b, z1.b[3]\n"
-                        "sdot z27.s, z15.b, z2.b[3]\n"
-                        "sdot z31.s, z15.b, z3.b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "sdot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
-                        "sdot z24.s, z8.b, z2.b[0]\n"
-                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
-                        "sdot z28.s, z8.b, z3.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z6.b, p6/z, [a_ptr2]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "ld1rqb z7.b, p6/z, [a_ptr3]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "sdot z29.s, z9.b, z3.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        "sdot z26.s, z10.b, z2.b[0]\n"
-                        "addvl a_ptr3, a_ptr3, #1\n"
-                        "sdot z30.s, z10.b, z3.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "sdot z23.s, z11.b, z1.b[0]\n"
-                        "sdot z27.s, z11.b, z2.b[0]\n"
-                        "sdot z31.s, z11.b, z3.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
-                        "sdot z24.s, z12.b, z2.b[1]\n"
-                        "sdot z28.s, z12.b, z3.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z21.s, z13.b, z1.b[1]\n"
-                        "sdot z25.s, z13.b, z2.b[1]\n"
-                        "sdot z29.s, z13.b, z3.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "sdot z26.s, z14.b, z2.b[1]\n"
-                        "sdot z30.s, z14.b, z3.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
-                        "sdot z27.s, z15.b, z2.b[1]\n"
-                        "sdot z31.s, z15.b, z3.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z24.s, z8.b, z2.b[2]\n"
-                        "sdot z28.s, z8.b, z3.b[2]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "sdot z21.s, z9.b, z1.b[2]\n"
-                        "sdot z25.s, z9.b, z2.b[2]\n"
-                        "sdot z29.s, z9.b, z3.b[2]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "sdot z26.s, z10.b, z2.b[2]\n"
-                        "sdot z30.s, z10.b, z3.b[2]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
-                        "sdot z27.s, z11.b, z2.b[2]\n"
-                        "sdot z31.s, z11.b, z3.b[2]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "sdot z20.s, z12.b, z1.b[3]\n"
-                        "sdot z24.s, z12.b, z2.b[3]\n"
-                        "sdot z28.s, z12.b, z3.b[3]\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "sdot z21.s, z13.b, z1.b[3]\n"
-                        "sdot z25.s, z13.b, z2.b[3]\n"
-                        "sdot z29.s, z13.b, z3.b[3]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "sdot z22.s, z14.b, z1.b[3]\n"
-                        "sdot z26.s, z14.b, z2.b[3]\n"
-                        "sdot z30.s, z14.b, z3.b[3]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "sdot z23.s, z15.b, z1.b[3]\n"
-                        "sdot z27.s, z15.b, z2.b[3]\n"
-                        "sdot z31.s, z15.b, z3.b[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "sdot z20.s, z8.b, z5.b[0]\n"
-                        "sdot z24.s, z8.b, z6.b[0]\n"
-                        "sdot z28.s, z8.b, z7.b[0]\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
-                        "sdot z21.s, z9.b, z5.b[0]\n"
-                        "sdot z25.s, z9.b, z6.b[0]\n"
-                        "sdot z29.s, z9.b, z7.b[0]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
-                        "sdot z26.s, z10.b, z6.b[0]\n"
-                        "sdot z30.s, z10.b, z7.b[0]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "sdot z23.s, z11.b, z5.b[0]\n"
-                        "sdot z27.s, z11.b, z6.b[0]\n"
-                        "sdot z31.s, z11.b, z7.b[0]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "sdot z20.s, z12.b, z5.b[1]\n"
-                        "sdot z24.s, z12.b, z6.b[1]\n"
-                        "sdot z28.s, z12.b, z7.b[1]\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
-                        "sdot z21.s, z13.b, z5.b[1]\n"
-                        "sdot z25.s, z13.b, z6.b[1]\n"
-                        "sdot z29.s, z13.b, z7.b[1]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "sdot z22.s, z14.b, z5.b[1]\n"
-                        "sdot z26.s, z14.b, z6.b[1]\n"
-                        "sdot z30.s, z14.b, z7.b[1]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "sdot z23.s, z15.b, z5.b[1]\n"
-                        "sdot z27.s, z15.b, z6.b[1]\n"
-                        "sdot z31.s, z15.b, z7.b[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "sdot z20.s, z8.b, z5.b[2]\n"
-                        "sdot z24.s, z8.b, z6.b[2]\n"
-                        "sdot z28.s, z8.b, z7.b[2]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
-                        "sdot z21.s, z9.b, z5.b[2]\n"
-                        "sdot z25.s, z9.b, z6.b[2]\n"
-                        "sdot z29.s, z9.b, z7.b[2]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "sdot z22.s, z10.b, z5.b[2]\n"
-                        "sdot z26.s, z10.b, z6.b[2]\n"
-                        "sdot z30.s, z10.b, z7.b[2]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "sdot z23.s, z11.b, z5.b[2]\n"
-                        "sdot z27.s, z11.b, z6.b[2]\n"
-                        "sdot z31.s, z11.b, z7.b[2]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "sdot z16.s, z12.b, z4.b[3]\n"
-                        "sdot z20.s, z12.b, z5.b[3]\n"
-                        "sdot z24.s, z12.b, z6.b[3]\n"
-                        "sdot z28.s, z12.b, z7.b[3]\n"
-                        "sdot z17.s, z13.b, z4.b[3]\n"
-                        "sdot z21.s, z13.b, z5.b[3]\n"
-                        "sdot z25.s, z13.b, z6.b[3]\n"
-                        "sdot z29.s, z13.b, z7.b[3]\n"
-                        "sdot z18.s, z14.b, z4.b[3]\n"
-                        "sdot z22.s, z14.b, z5.b[3]\n"
-                        "sdot z26.s, z14.b, z6.b[3]\n"
-                        "sdot z30.s, z14.b, z7.b[3]\n"
-                        "sdot z19.s, z15.b, z4.b[3]\n"
-                        "sdot z23.s, z15.b, z5.b[3]\n"
-                        "sdot z27.s, z15.b, z6.b[3]\n"
-                        "sdot z31.s, z15.b, z7.b[3]\n"
-                        "5:\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "st1w z20.s, p0, [c_ptr1]\n"
-                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
-                        "st1w z24.s, p0, [c_ptr2]\n"
-                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
-                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
-                        "st1w z28.s, p0, [c_ptr3]\n"
-                        "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
-                        "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
-                        "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
-                    );
-                    break;
-            }
-
-        }
-    }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
new file mode 100644
index 0000000000..1aebedb861
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __ARM_FEATURE_SVE
+
+#include "../std_transforms_sve.hpp"
+
+#define ARGLIST  \
+   unsigned int, const unsigned int *, /* num_strings, string_lengths */ \
+   IndirectInputArg<int8_t>, /* A_arg: direct or indirect input */ \
+   size_t, size_t, /* M, N */ \
+   const int8_t *, /* B_ptr */ \
+   IndirectOutputArg<int32_t>, /* output_arg: direct or indirect output */ \
+   const int32_t *, Activation, bool /* first two are unnamed in the generic definition; bool is accumulate */
+
+namespace arm_gemm
+{
+
+// Kernel implementation; the definition lives in sve_hybrid_s8s32_dot_6x4VL/generic.cpp
+void sve_hybrid_s8s32_dot_6x4VL( ARGLIST );
+
+class cls_sve_hybrid_s8s32_dot_6x4VL // Descriptor for the kernel: element types, blocking parameters and entry point
+{
+public:
+    typedef int8_t operand_type; // element type of the A and B inputs (see ARGLIST)
+    typedef int32_t result_type; // element type of the output (see ARGLIST)
+
+    typedef void (*kern_type)( ARGLIST ); // function-pointer type matching the kernel declaration above
+
+    /* Kernel blocking parameters (the "6x4VL" in the name: 6 rows by 4 vectors) */
+    static constexpr unsigned int out_height() // output rows per block
+    {
+        return 6;
+    }
+
+    static unsigned int out_width() // output columns per block; not constexpr because it calls get_vector_length
+    {
+        return get_vector_length<int32_t>() * 4; // four int32 vectors wide
+    }
+
+    static constexpr unsigned int k_unroll() // K granularity; presumably the 4 int8 values each sdot lane consumes - confirm
+    {
+        return 4;
+    }
+
+    static constexpr bool supports_accumulate() // the kernel takes an accumulate flag (final bool in ARGLIST)
+    {
+        return true;
+    }
+
+    StdTransformsSVE<operand_type, result_type, 6, 4, 4> transforms = {}; // NOTE(review): 6, 4, 4 appear to mirror out_height, the out_width multiplier and k_unroll - keep in sync
+
+    // Entry point used by callers; initialised to the generic implementation and never overridden in this class
+    kern_type kernel=sve_hybrid_s8s32_dot_6x4VL;
+
+    cls_sve_hybrid_s8s32_dot_6x4VL(const CPUInfo *) // CPUInfo is ignored: no CPU-specific variant is selected
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp
new file mode 100644
index 0000000000..cae9bf329f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp
@@ -0,0 +1,1904 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_hybrid_s8s32_dot_6x4VL (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+    size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int32_t> output_arg,
+    const int32_t *, Activation, bool accumulate
+)
+{
+    struct KernelArgs {
+        unsigned int num_strings = {};
+        const unsigned int *string_lengths = {};
+        size_t N = {};
+        const int8_t *B_ptr = {};
+        size_t output_offset = {};
+        size_t input_initial_col = {};
+        size_t input_offset = {};
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    if (accumulate) {
+        flags |= 0x1;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    __asm__ __volatile__(
+      "ptrue p5.b\n"
+      "1:"  // Row loop
+      "cmp %x[M], #0x6\n"
+      "bge 61f\n"
+      "cmp %x[M], #0x4\n"
+      "bgt 49f\n"
+      "beq 37f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 25f\n"
+      "beq 13f\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 2f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "b 3f\n"
+      "2:"  // Height 1: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "3:"  // Height 1: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p3.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p2.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p1.s, x19, x15\n"
+      "tbz %x[flags], #0, 4f\n"
+      "ld1w { z8.s }, p4/Z, [x13]\n"
+      "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+      "b 5f\n"
+      "4:"  // Height 1: no accumulate
+      "mov z8.s, #0x0\n"
+      "mov z9.s, #0x0\n"
+      "mov z10.s, #0x0\n"
+      "mov z11.s, #0x0\n"
+      "5:"  // Height 1: setup done
+      "mov x12, #0x0\n"
+      "6:"  // Height 1: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 7f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "cbnz x12, 8f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "b 8f\n"
+      "7:"  // Height 1: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "8:"  // Height 1: input setup done
+      "cmp x11, #0x10\n"
+      "ble 10f\n"
+      "9:"  // Height 1: Multiply loop: Main loop head
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "sub x11, x11, #0x10\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "sdot z8.s, z6.b, z0.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "add x10, x10, #0x10\n"
+      "sdot z9.s, z7.b, z0.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "cmp x11, #0x10\n"
+      "sdot z10.s, z6.b, z0.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "sdot z11.s, z7.b, z0.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+      "addvl x14, x14, #16\n"
+      "sdot z10.s, z6.b, z0.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[3]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[3]\n"
+      "sdot z11.s, z7.b, z0.b[3]\n"
+      "bgt 9b\n"
+      "10:"  // Height 1: Multiply loop: Single iteration only
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "sdot z8.s, z6.b, z0.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "add x10, x10, #0x10\n"
+      "sdot z9.s, z7.b, z0.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[0]\n"
+      "sdot z11.s, z7.b, z0.b[0]\n"
+      "ble 11f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "sdot z9.s, z7.b, z0.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[1]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z11.s, z7.b, z0.b[1]\n"
+      "ble 11f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "sdot z9.s, z7.b, z0.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[2]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z11.s, z7.b, z0.b[2]\n"
+      "ble 11f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[3]\n"
+      "sdot z11.s, z7.b, z0.b[3]\n"
+      "11:"  // Height 1: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 6b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "st1w { z8.s }, p4, [x13]\n"
+      "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+      "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+      "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "12:"  // Height 1: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x15, x15, x19\n"
+      "bgt 3b\n"
+      "b 74f\n"
+      "13:"  // Height 2
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 14f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "b 15f\n"
+      "14:"  // Height 2: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "15:"  // Height 2: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p3.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p2.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p1.s, x19, x15\n"
+      "tbz %x[flags], #0, 16f\n"
+      "ld1w { z8.s }, p4/Z, [x13]\n"
+      "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x9]\n"
+      "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+      "b 17f\n"
+      "16:"  // Height 2: no accumulate
+      "mov z8.s, #0x0\n"
+      "mov z9.s, #0x0\n"
+      "mov z10.s, #0x0\n"
+      "mov z11.s, #0x0\n"
+      "mov z12.s, #0x0\n"
+      "mov z13.s, #0x0\n"
+      "mov z14.s, #0x0\n"
+      "mov z15.s, #0x0\n"
+      "17:"  // Height 2: setup done
+      "mov x12, #0x0\n"
+      "18:"  // Height 2: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 19f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "cbnz x12, 20f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "b 20f\n"
+      "19:"  // Height 2: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "20:"  // Height 2: input setup done
+      "cmp x11, #0x10\n"
+      "ble 22f\n"
+      "21:"  // Height 2: Multiply loop: Main loop head
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "sub x11, x11, #0x10\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "sdot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "sdot z9.s, z7.b, z0.b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x28, x28, #0x10\n"
+      "sdot z12.s, z6.b, z1.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "cmp x11, #0x10\n"
+      "sdot z13.s, z7.b, z1.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "sdot z10.s, z6.b, z0.b[0]\n"
+      "sdot z14.s, z6.b, z1.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[0]\n"
+      "sdot z15.s, z7.b, z1.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[1]\n"
+      "sdot z12.s, z6.b, z1.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[1]\n"
+      "sdot z13.s, z7.b, z1.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+      "addvl x14, x14, #16\n"
+      "sdot z10.s, z6.b, z0.b[1]\n"
+      "sdot z14.s, z6.b, z1.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[1]\n"
+      "sdot z15.s, z7.b, z1.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[2]\n"
+      "sdot z12.s, z6.b, z1.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[2]\n"
+      "sdot z13.s, z7.b, z1.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[2]\n"
+      "sdot z14.s, z6.b, z1.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[2]\n"
+      "sdot z15.s, z7.b, z1.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[3]\n"
+      "sdot z12.s, z6.b, z1.b[3]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[3]\n"
+      "sdot z13.s, z7.b, z1.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[3]\n"
+      "sdot z14.s, z6.b, z1.b[3]\n"
+      "sdot z11.s, z7.b, z0.b[3]\n"
+      "sdot z15.s, z7.b, z1.b[3]\n"
+      "bgt 21b\n"
+      "22:"  // Height 2: Multiply loop: Single iteration only
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "sdot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "sdot z9.s, z7.b, z0.b[0]\n"
+      "add x28, x28, #0x10\n"
+      "sdot z12.s, z6.b, z1.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "sdot z13.s, z7.b, z1.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[0]\n"
+      "sdot z14.s, z6.b, z1.b[0]\n"
+      "sdot z11.s, z7.b, z0.b[0]\n"
+      "sdot z15.s, z7.b, z1.b[0]\n"
+      "ble 23f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "sdot z12.s, z6.b, z1.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[1]\n"
+      "sdot z13.s, z7.b, z1.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[1]\n"
+      "sdot z14.s, z6.b, z1.b[1]\n"
+      "sdot z11.s, z7.b, z0.b[1]\n"
+      "sdot z15.s, z7.b, z1.b[1]\n"
+      "ble 23f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "sdot z12.s, z6.b, z1.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[2]\n"
+      "sdot z13.s, z7.b, z1.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[2]\n"
+      "sdot z14.s, z6.b, z1.b[2]\n"
+      "sdot z11.s, z7.b, z0.b[2]\n"
+      "sdot z15.s, z7.b, z1.b[2]\n"
+      "ble 23f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "sdot z12.s, z6.b, z1.b[3]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[3]\n"
+      "sdot z13.s, z7.b, z1.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[3]\n"
+      "sdot z14.s, z6.b, z1.b[3]\n"
+      "sdot z11.s, z7.b, z0.b[3]\n"
+      "sdot z15.s, z7.b, z1.b[3]\n"
+      "23:"  // Height 2: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 18b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "st1w { z8.s }, p4, [x13]\n"
+      "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+      "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+      "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "st1w { z12.s }, p4, [x9]\n"
+      "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+      "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+      "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+      "addvl x9, x9, #4\n"
+      "24:"  // Height 2: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x15, x15, x19\n"
+      "bgt 15b\n"
+      "b 74f\n"
+      "25:"  // Height 3
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 26f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "add x27, x27, x19, LSL #2\n"
+      "b 27f\n"
+      "26:"  // Height 3: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "27:"  // Height 3: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p3.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p2.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p1.s, x19, x15\n"
+      "tbz %x[flags], #0, 28f\n"
+      "ld1w { z8.s }, p4/Z, [x13]\n"
+      "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x9]\n"
+      "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x27]\n"
+      "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+      "b 29f\n"
+      "28:"  // Height 3: no accumulate
+      "mov z8.s, #0x0\n"
+      "mov z9.s, #0x0\n"
+      "mov z10.s, #0x0\n"
+      "mov z11.s, #0x0\n"
+      "mov z12.s, #0x0\n"
+      "mov z13.s, #0x0\n"
+      "mov z14.s, #0x0\n"
+      "mov z15.s, #0x0\n"
+      "mov z16.s, #0x0\n"
+      "mov z17.s, #0x0\n"
+      "mov z18.s, #0x0\n"
+      "mov z19.s, #0x0\n"
+      "29:"  // Height 3: setup done
+      "mov x12, #0x0\n"
+      "30:"  // Height 3: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 31f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "cbnz x12, 32f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "add x26, x26, x19\n"
+      "b 32f\n"
+      "31:"  // Height 3: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "add x26, x28, x19\n"
+      "32:"  // Height 3: input setup done
+      "cmp x11, #0x10\n"
+      "ble 34f\n"
+      "33:"  // Height 3: Multiply loop: Main loop head
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "sub x11, x11, #0x10\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "sdot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "sdot z9.s, z7.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "sdot z12.s, z6.b, z1.b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      "sdot z16.s, z6.b, z2.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "cmp x11, #0x10\n"
+      "sdot z13.s, z7.b, z1.b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "sdot z17.s, z7.b, z2.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "sdot z10.s, z6.b, z0.b[0]\n"
+      "sdot z14.s, z6.b, z1.b[0]\n"
+      "sdot z18.s, z6.b, z2.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[0]\n"
+      "sdot z15.s, z7.b, z1.b[0]\n"
+      "sdot z19.s, z7.b, z2.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[1]\n"
+      "sdot z12.s, z6.b, z1.b[1]\n"
+      "sdot z16.s, z6.b, z2.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[1]\n"
+      "sdot z13.s, z7.b, z1.b[1]\n"
+      "sdot z17.s, z7.b, z2.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+      "addvl x14, x14, #16\n"
+      "sdot z10.s, z6.b, z0.b[1]\n"
+      "sdot z14.s, z6.b, z1.b[1]\n"
+      "sdot z18.s, z6.b, z2.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[1]\n"
+      "sdot z15.s, z7.b, z1.b[1]\n"
+      "sdot z19.s, z7.b, z2.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[2]\n"
+      "sdot z12.s, z6.b, z1.b[2]\n"
+      "sdot z16.s, z6.b, z2.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[2]\n"
+      "sdot z13.s, z7.b, z1.b[2]\n"
+      "sdot z17.s, z7.b, z2.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[2]\n"
+      "sdot z14.s, z6.b, z1.b[2]\n"
+      "sdot z18.s, z6.b, z2.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[2]\n"
+      "sdot z15.s, z7.b, z1.b[2]\n"
+      "sdot z19.s, z7.b, z2.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[3]\n"
+      "sdot z12.s, z6.b, z1.b[3]\n"
+      "sdot z16.s, z6.b, z2.b[3]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[3]\n"
+      "sdot z13.s, z7.b, z1.b[3]\n"
+      "sdot z17.s, z7.b, z2.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[3]\n"
+      "sdot z14.s, z6.b, z1.b[3]\n"
+      "sdot z18.s, z6.b, z2.b[3]\n"
+      "sdot z11.s, z7.b, z0.b[3]\n"
+      "sdot z15.s, z7.b, z1.b[3]\n"
+      "sdot z19.s, z7.b, z2.b[3]\n"
+      "bgt 33b\n"
+      "34:"  // Height 3: Multiply loop: Single iteration only
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "sdot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "sdot z9.s, z7.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "sdot z12.s, z6.b, z1.b[0]\n"
+      "add x26, x26, #0x10\n"
+      "sdot z13.s, z7.b, z1.b[0]\n"
+      "sdot z16.s, z6.b, z2.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "sdot z17.s, z7.b, z2.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[0]\n"
+      "sdot z14.s, z6.b, z1.b[0]\n"
+      "sdot z18.s, z6.b, z2.b[0]\n"
+      "sdot z11.s, z7.b, z0.b[0]\n"
+      "sdot z15.s, z7.b, z1.b[0]\n"
+      "sdot z19.s, z7.b, z2.b[0]\n"
+      "ble 35f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "sdot z12.s, z6.b, z1.b[1]\n"
+      "sdot z16.s, z6.b, z2.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[1]\n"
+      "sdot z13.s, z7.b, z1.b[1]\n"
+      "sdot z17.s, z7.b, z2.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[1]\n"
+      "sdot z14.s, z6.b, z1.b[1]\n"
+      "sdot z18.s, z6.b, z2.b[1]\n"
+      "sdot z11.s, z7.b, z0.b[1]\n"
+      "sdot z15.s, z7.b, z1.b[1]\n"
+      "sdot z19.s, z7.b, z2.b[1]\n"
+      "ble 35f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "sdot z12.s, z6.b, z1.b[2]\n"
+      "sdot z16.s, z6.b, z2.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[2]\n"
+      "sdot z13.s, z7.b, z1.b[2]\n"
+      "sdot z17.s, z7.b, z2.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[2]\n"
+      "sdot z14.s, z6.b, z1.b[2]\n"
+      "sdot z18.s, z6.b, z2.b[2]\n"
+      "sdot z11.s, z7.b, z0.b[2]\n"
+      "sdot z15.s, z7.b, z1.b[2]\n"
+      "sdot z19.s, z7.b, z2.b[2]\n"
+      "ble 35f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "sdot z12.s, z6.b, z1.b[3]\n"
+      "sdot z16.s, z6.b, z2.b[3]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[3]\n"
+      "sdot z13.s, z7.b, z1.b[3]\n"
+      "sdot z17.s, z7.b, z2.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[3]\n"
+      "sdot z14.s, z6.b, z1.b[3]\n"
+      "sdot z18.s, z6.b, z2.b[3]\n"
+      "sdot z11.s, z7.b, z0.b[3]\n"
+      "sdot z15.s, z7.b, z1.b[3]\n"
+      "sdot z19.s, z7.b, z2.b[3]\n"
+      "35:"  // Height 3: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 30b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "st1w { z8.s }, p4, [x13]\n"
+      "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+      "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+      "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "st1w { z12.s }, p4, [x9]\n"
+      "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+      "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+      "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+      "addvl x9, x9, #4\n"
+      "st1w { z16.s }, p4, [x27]\n"
+      "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+      "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+      "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+      "addvl x27, x27, #4\n"
+      "36:"  // Height 3: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x15, x15, x19\n"
+      "bgt 27b\n"
+      "b 74f\n"
+      "37:"  // Height 4
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 38f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "add x27, x27, x19, LSL #2\n"
+      "add x25, x25, x19, LSL #2\n"
+      "b 39f\n"
+      "38:"  // Height 4: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "39:"  // Height 4: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p3.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p2.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p1.s, x19, x15\n"
+      "tbz %x[flags], #0, 40f\n"
+      "ld1w { z8.s }, p4/Z, [x13]\n"
+      "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x9]\n"
+      "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x27]\n"
+      "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x25]\n"
+      "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+      "b 41f\n"
+      "40:"  // Height 4: no accumulate
+      "mov z8.s, #0x0\n"
+      "mov z9.s, #0x0\n"
+      "mov z10.s, #0x0\n"
+      "mov z11.s, #0x0\n"
+      "mov z12.s, #0x0\n"
+      "mov z13.s, #0x0\n"
+      "mov z14.s, #0x0\n"
+      "mov z15.s, #0x0\n"
+      "mov z16.s, #0x0\n"
+      "mov z17.s, #0x0\n"
+      "mov z18.s, #0x0\n"
+      "mov z19.s, #0x0\n"
+      "mov z20.s, #0x0\n"
+      "mov z21.s, #0x0\n"
+      "mov z22.s, #0x0\n"
+      "mov z23.s, #0x0\n"
+      "41:"  // Height 4: setup done
+      "mov x12, #0x0\n"
+      "42:"  // Height 4: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 43f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "cbnz x12, 44f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "b 44f\n"
+      "43:"  // Height 4: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "add x26, x28, x19\n"
+      "add x24, x26, x19\n"
+      "44:"  // Height 4: input setup done
+      "cmp x11, #0x10\n"
+      "ble 46f\n"
+      "45:"  // Height 4: Multiply loop: Main loop head
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "sub x11, x11, #0x10\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "sdot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "sdot z9.s, z7.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "sdot z12.s, z6.b, z1.b[0]\n"
+      "ld1rqb { z3.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "sdot z16.s, z6.b, z2.b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x24, x24, #0x10\n"
+      "sdot z13.s, z7.b, z1.b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "cmp x11, #0x10\n"
+      "sdot z20.s, z6.b, z3.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "sdot z17.s, z7.b, z2.b[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "sdot z21.s, z7.b, z3.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "sdot z10.s, z6.b, z0.b[0]\n"
+      "sdot z14.s, z6.b, z1.b[0]\n"
+      "sdot z18.s, z6.b, z2.b[0]\n"
+      "sdot z22.s, z6.b, z3.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[0]\n"
+      "sdot z15.s, z7.b, z1.b[0]\n"
+      "sdot z19.s, z7.b, z2.b[0]\n"
+      "sdot z23.s, z7.b, z3.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[1]\n"
+      "sdot z12.s, z6.b, z1.b[1]\n"
+      "sdot z16.s, z6.b, z2.b[1]\n"
+      "sdot z20.s, z6.b, z3.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[1]\n"
+      "sdot z13.s, z7.b, z1.b[1]\n"
+      "sdot z17.s, z7.b, z2.b[1]\n"
+      "sdot z21.s, z7.b, z3.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+      "addvl x14, x14, #16\n"
+      "sdot z10.s, z6.b, z0.b[1]\n"
+      "sdot z14.s, z6.b, z1.b[1]\n"
+      "sdot z18.s, z6.b, z2.b[1]\n"
+      "sdot z22.s, z6.b, z3.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[1]\n"
+      "sdot z15.s, z7.b, z1.b[1]\n"
+      "sdot z19.s, z7.b, z2.b[1]\n"
+      "sdot z23.s, z7.b, z3.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[2]\n"
+      "sdot z12.s, z6.b, z1.b[2]\n"
+      "sdot z16.s, z6.b, z2.b[2]\n"
+      "sdot z20.s, z6.b, z3.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[2]\n"
+      "sdot z13.s, z7.b, z1.b[2]\n"
+      "sdot z17.s, z7.b, z2.b[2]\n"
+      "sdot z21.s, z7.b, z3.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[2]\n"
+      "sdot z14.s, z6.b, z1.b[2]\n"
+      "sdot z18.s, z6.b, z2.b[2]\n"
+      "sdot z22.s, z6.b, z3.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[2]\n"
+      "sdot z15.s, z7.b, z1.b[2]\n"
+      "sdot z19.s, z7.b, z2.b[2]\n"
+      "sdot z23.s, z7.b, z3.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[3]\n"
+      "sdot z12.s, z6.b, z1.b[3]\n"
+      "sdot z16.s, z6.b, z2.b[3]\n"
+      "sdot z20.s, z6.b, z3.b[3]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[3]\n"
+      "sdot z13.s, z7.b, z1.b[3]\n"
+      "sdot z17.s, z7.b, z2.b[3]\n"
+      "sdot z21.s, z7.b, z3.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[3]\n"
+      "sdot z14.s, z6.b, z1.b[3]\n"
+      "sdot z18.s, z6.b, z2.b[3]\n"
+      "sdot z22.s, z6.b, z3.b[3]\n"
+      "sdot z11.s, z7.b, z0.b[3]\n"
+      "sdot z15.s, z7.b, z1.b[3]\n"
+      "sdot z19.s, z7.b, z2.b[3]\n"
+      "sdot z23.s, z7.b, z3.b[3]\n"
+      "bgt 45b\n"
+      "46:"  // Height 4: Multiply loop: Single iteration only
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "sdot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "sdot z9.s, z7.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "sdot z12.s, z6.b, z1.b[0]\n"
+      "ld1rqb { z3.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "sdot z16.s, z6.b, z2.b[0]\n"
+      "add x24, x24, #0x10\n"
+      "sdot z13.s, z7.b, z1.b[0]\n"
+      "sdot z17.s, z7.b, z2.b[0]\n"
+      "sdot z20.s, z6.b, z3.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "sdot z21.s, z7.b, z3.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[0]\n"
+      "sdot z14.s, z6.b, z1.b[0]\n"
+      "sdot z18.s, z6.b, z2.b[0]\n"
+      "sdot z22.s, z6.b, z3.b[0]\n"
+      "sdot z11.s, z7.b, z0.b[0]\n"
+      "sdot z15.s, z7.b, z1.b[0]\n"
+      "sdot z19.s, z7.b, z2.b[0]\n"
+      "sdot z23.s, z7.b, z3.b[0]\n"
+      "ble 47f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "sdot z12.s, z6.b, z1.b[1]\n"
+      "sdot z16.s, z6.b, z2.b[1]\n"
+      "sdot z20.s, z6.b, z3.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[1]\n"
+      "sdot z13.s, z7.b, z1.b[1]\n"
+      "sdot z17.s, z7.b, z2.b[1]\n"
+      "sdot z21.s, z7.b, z3.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[1]\n"
+      "sdot z14.s, z6.b, z1.b[1]\n"
+      "sdot z18.s, z6.b, z2.b[1]\n"
+      "sdot z22.s, z6.b, z3.b[1]\n"
+      "sdot z11.s, z7.b, z0.b[1]\n"
+      "sdot z15.s, z7.b, z1.b[1]\n"
+      "sdot z19.s, z7.b, z2.b[1]\n"
+      "sdot z23.s, z7.b, z3.b[1]\n"
+      "ble 47f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "sdot z12.s, z6.b, z1.b[2]\n"
+      "sdot z16.s, z6.b, z2.b[2]\n"
+      "sdot z20.s, z6.b, z3.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[2]\n"
+      "sdot z13.s, z7.b, z1.b[2]\n"
+      "sdot z17.s, z7.b, z2.b[2]\n"
+      "sdot z21.s, z7.b, z3.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[2]\n"
+      "sdot z14.s, z6.b, z1.b[2]\n"
+      "sdot z18.s, z6.b, z2.b[2]\n"
+      "sdot z22.s, z6.b, z3.b[2]\n"
+      "sdot z11.s, z7.b, z0.b[2]\n"
+      "sdot z15.s, z7.b, z1.b[2]\n"
+      "sdot z19.s, z7.b, z2.b[2]\n"
+      "sdot z23.s, z7.b, z3.b[2]\n"
+      "ble 47f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "sdot z12.s, z6.b, z1.b[3]\n"
+      "sdot z16.s, z6.b, z2.b[3]\n"
+      "sdot z20.s, z6.b, z3.b[3]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[3]\n"
+      "sdot z13.s, z7.b, z1.b[3]\n"
+      "sdot z17.s, z7.b, z2.b[3]\n"
+      "sdot z21.s, z7.b, z3.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[3]\n"
+      "sdot z14.s, z6.b, z1.b[3]\n"
+      "sdot z18.s, z6.b, z2.b[3]\n"
+      "sdot z22.s, z6.b, z3.b[3]\n"
+      "sdot z11.s, z7.b, z0.b[3]\n"
+      "sdot z15.s, z7.b, z1.b[3]\n"
+      "sdot z19.s, z7.b, z2.b[3]\n"
+      "sdot z23.s, z7.b, z3.b[3]\n"
+      "47:"  // Height 4: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 42b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "st1w { z8.s }, p4, [x13]\n"
+      "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+      "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+      "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "st1w { z12.s }, p4, [x9]\n"
+      "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+      "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+      "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+      "addvl x9, x9, #4\n"
+      "st1w { z16.s }, p4, [x27]\n"
+      "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+      "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+      "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+      "addvl x27, x27, #4\n"
+      "st1w { z20.s }, p4, [x25]\n"
+      "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+      "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+      "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+      "addvl x25, x25, #4\n"
+      "48:"  // Height 4: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x15, x15, x19\n"
+      "bgt 39b\n"
+      "b 74f\n"
+      "49:"  // Height 5
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 50f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "ldr x23, [%x[output_ptr], #0x20]\n"
+      "add x27, x27, x19, LSL #2\n"
+      "add x25, x25, x19, LSL #2\n"
+      "add x23, x23, x19, LSL #2\n"
+      "b 51f\n"
+      "50:"  // Height 5: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "add x23, x25, x19, LSL #2\n"
+      "51:"  // Height 5: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p3.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p2.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p1.s, x19, x15\n"
+      "tbz %x[flags], #0, 52f\n"
+      "ld1w { z8.s }, p4/Z, [x13]\n"
+      "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x9]\n"
+      "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x27]\n"
+      "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x25]\n"
+      "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+      "ld1w { z24.s }, p4/Z, [x23]\n"
+      "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "b 53f\n"
+      "52:"  // Height 5: no accumulate
+      "mov z8.s, #0x0\n"
+      "mov z9.s, #0x0\n"
+      "mov z10.s, #0x0\n"
+      "mov z11.s, #0x0\n"
+      "mov z12.s, #0x0\n"
+      "mov z13.s, #0x0\n"
+      "mov z14.s, #0x0\n"
+      "mov z15.s, #0x0\n"
+      "mov z16.s, #0x0\n"
+      "mov z17.s, #0x0\n"
+      "mov z18.s, #0x0\n"
+      "mov z19.s, #0x0\n"
+      "mov z20.s, #0x0\n"
+      "mov z21.s, #0x0\n"
+      "mov z22.s, #0x0\n"
+      "mov z23.s, #0x0\n"
+      "mov z24.s, #0x0\n"
+      "mov z25.s, #0x0\n"
+      "mov z26.s, #0x0\n"
+      "mov z27.s, #0x0\n"
+      "53:"  // Height 5: setup done
+      "mov x12, #0x0\n"
+      "54:"  // Height 5: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 55f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "cbnz x12, 56f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "add x22, x22, x19\n"
+      "b 56f\n"
+      "55:"  // Height 5: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "add x26, x28, x19\n"
+      "add x24, x26, x19\n"
+      "add x22, x24, x19\n"
+      "56:"  // Height 5: input setup done
+      "cmp x11, #0x10\n"
+      "ble 58f\n"
+      "57:"  // Height 5: Multiply loop: Main loop head
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "sub x11, x11, #0x10\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "sdot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "sdot z9.s, z7.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "sdot z12.s, z6.b, z1.b[0]\n"
+      "ld1rqb { z3.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "sdot z16.s, z6.b, z2.b[0]\n"
+      "ld1rqb { z4.b }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "sdot z13.s, z7.b, z1.b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x22, x22, #0x10\n"
+      "sdot z20.s, z6.b, z3.b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "cmp x11, #0x10\n"
+      "sdot z24.s, z6.b, z4.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "sdot z17.s, z7.b, z2.b[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "sdot z21.s, z7.b, z3.b[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "sdot z25.s, z7.b, z4.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[0]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "sdot z14.s, z6.b, z1.b[0]\n"
+      "sdot z18.s, z6.b, z2.b[0]\n"
+      "sdot z22.s, z6.b, z3.b[0]\n"
+      "sdot z26.s, z6.b, z4.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[0]\n"
+      "sdot z15.s, z7.b, z1.b[0]\n"
+      "sdot z19.s, z7.b, z2.b[0]\n"
+      "sdot z23.s, z7.b, z3.b[0]\n"
+      "sdot z27.s, z7.b, z4.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[1]\n"
+      "sdot z12.s, z6.b, z1.b[1]\n"
+      "sdot z16.s, z6.b, z2.b[1]\n"
+      "sdot z20.s, z6.b, z3.b[1]\n"
+      "sdot z24.s, z6.b, z4.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[1]\n"
+      "sdot z13.s, z7.b, z1.b[1]\n"
+      "sdot z17.s, z7.b, z2.b[1]\n"
+      "sdot z21.s, z7.b, z3.b[1]\n"
+      "sdot z25.s, z7.b, z4.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+      "addvl x14, x14, #16\n"
+      "sdot z10.s, z6.b, z0.b[1]\n"
+      "sdot z14.s, z6.b, z1.b[1]\n"
+      "sdot z18.s, z6.b, z2.b[1]\n"
+      "sdot z22.s, z6.b, z3.b[1]\n"
+      "sdot z26.s, z6.b, z4.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[1]\n"
+      "sdot z15.s, z7.b, z1.b[1]\n"
+      "sdot z19.s, z7.b, z2.b[1]\n"
+      "sdot z23.s, z7.b, z3.b[1]\n"
+      "sdot z27.s, z7.b, z4.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[2]\n"
+      "sdot z12.s, z6.b, z1.b[2]\n"
+      "sdot z16.s, z6.b, z2.b[2]\n"
+      "sdot z20.s, z6.b, z3.b[2]\n"
+      "sdot z24.s, z6.b, z4.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[2]\n"
+      "sdot z13.s, z7.b, z1.b[2]\n"
+      "sdot z17.s, z7.b, z2.b[2]\n"
+      "sdot z21.s, z7.b, z3.b[2]\n"
+      "sdot z25.s, z7.b, z4.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[2]\n"
+      "sdot z14.s, z6.b, z1.b[2]\n"
+      "sdot z18.s, z6.b, z2.b[2]\n"
+      "sdot z22.s, z6.b, z3.b[2]\n"
+      "sdot z26.s, z6.b, z4.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[2]\n"
+      "sdot z15.s, z7.b, z1.b[2]\n"
+      "sdot z19.s, z7.b, z2.b[2]\n"
+      "sdot z23.s, z7.b, z3.b[2]\n"
+      "sdot z27.s, z7.b, z4.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[3]\n"
+      "sdot z12.s, z6.b, z1.b[3]\n"
+      "sdot z16.s, z6.b, z2.b[3]\n"
+      "sdot z20.s, z6.b, z3.b[3]\n"
+      "sdot z24.s, z6.b, z4.b[3]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[3]\n"
+      "sdot z13.s, z7.b, z1.b[3]\n"
+      "sdot z17.s, z7.b, z2.b[3]\n"
+      "sdot z21.s, z7.b, z3.b[3]\n"
+      "sdot z25.s, z7.b, z4.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[3]\n"
+      "sdot z14.s, z6.b, z1.b[3]\n"
+      "sdot z18.s, z6.b, z2.b[3]\n"
+      "sdot z22.s, z6.b, z3.b[3]\n"
+      "sdot z26.s, z6.b, z4.b[3]\n"
+      "sdot z11.s, z7.b, z0.b[3]\n"
+      "sdot z15.s, z7.b, z1.b[3]\n"
+      "sdot z19.s, z7.b, z2.b[3]\n"
+      "sdot z23.s, z7.b, z3.b[3]\n"
+      "sdot z27.s, z7.b, z4.b[3]\n"
+      "bgt 57b\n"
+      "58:"  // Height 5: Multiply loop: Single iteration only
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "sdot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "sdot z9.s, z7.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "sdot z12.s, z6.b, z1.b[0]\n"
+      "ld1rqb { z3.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "sdot z16.s, z6.b, z2.b[0]\n"
+      "ld1rqb { z4.b }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "sdot z13.s, z7.b, z1.b[0]\n"
+      "add x22, x22, #0x10\n"
+      "sdot z17.s, z7.b, z2.b[0]\n"
+      "sdot z20.s, z6.b, z3.b[0]\n"
+      "sdot z24.s, z6.b, z4.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "sdot z21.s, z7.b, z3.b[0]\n"
+      "sdot z25.s, z7.b, z4.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[0]\n"
+      "sdot z14.s, z6.b, z1.b[0]\n"
+      "sdot z18.s, z6.b, z2.b[0]\n"
+      "sdot z22.s, z6.b, z3.b[0]\n"
+      "sdot z26.s, z6.b, z4.b[0]\n"
+      "sdot z11.s, z7.b, z0.b[0]\n"
+      "sdot z15.s, z7.b, z1.b[0]\n"
+      "sdot z19.s, z7.b, z2.b[0]\n"
+      "sdot z23.s, z7.b, z3.b[0]\n"
+      "sdot z27.s, z7.b, z4.b[0]\n"
+      "ble 59f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "sdot z12.s, z6.b, z1.b[1]\n"
+      "sdot z16.s, z6.b, z2.b[1]\n"
+      "sdot z20.s, z6.b, z3.b[1]\n"
+      "sdot z24.s, z6.b, z4.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[1]\n"
+      "sdot z13.s, z7.b, z1.b[1]\n"
+      "sdot z17.s, z7.b, z2.b[1]\n"
+      "sdot z21.s, z7.b, z3.b[1]\n"
+      "sdot z25.s, z7.b, z4.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[1]\n"
+      "sdot z14.s, z6.b, z1.b[1]\n"
+      "sdot z18.s, z6.b, z2.b[1]\n"
+      "sdot z22.s, z6.b, z3.b[1]\n"
+      "sdot z26.s, z6.b, z4.b[1]\n"
+      "sdot z11.s, z7.b, z0.b[1]\n"
+      "sdot z15.s, z7.b, z1.b[1]\n"
+      "sdot z19.s, z7.b, z2.b[1]\n"
+      "sdot z23.s, z7.b, z3.b[1]\n"
+      "sdot z27.s, z7.b, z4.b[1]\n"
+      "ble 59f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "sdot z12.s, z6.b, z1.b[2]\n"
+      "sdot z16.s, z6.b, z2.b[2]\n"
+      "sdot z20.s, z6.b, z3.b[2]\n"
+      "sdot z24.s, z6.b, z4.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[2]\n"
+      "sdot z13.s, z7.b, z1.b[2]\n"
+      "sdot z17.s, z7.b, z2.b[2]\n"
+      "sdot z21.s, z7.b, z3.b[2]\n"
+      "sdot z25.s, z7.b, z4.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[2]\n"
+      "sdot z14.s, z6.b, z1.b[2]\n"
+      "sdot z18.s, z6.b, z2.b[2]\n"
+      "sdot z22.s, z6.b, z3.b[2]\n"
+      "sdot z26.s, z6.b, z4.b[2]\n"
+      "sdot z11.s, z7.b, z0.b[2]\n"
+      "sdot z15.s, z7.b, z1.b[2]\n"
+      "sdot z19.s, z7.b, z2.b[2]\n"
+      "sdot z23.s, z7.b, z3.b[2]\n"
+      "sdot z27.s, z7.b, z4.b[2]\n"
+      "ble 59f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "sdot z12.s, z6.b, z1.b[3]\n"
+      "sdot z16.s, z6.b, z2.b[3]\n"
+      "sdot z20.s, z6.b, z3.b[3]\n"
+      "sdot z24.s, z6.b, z4.b[3]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[3]\n"
+      "sdot z13.s, z7.b, z1.b[3]\n"
+      "sdot z17.s, z7.b, z2.b[3]\n"
+      "sdot z21.s, z7.b, z3.b[3]\n"
+      "sdot z25.s, z7.b, z4.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[3]\n"
+      "sdot z14.s, z6.b, z1.b[3]\n"
+      "sdot z18.s, z6.b, z2.b[3]\n"
+      "sdot z22.s, z6.b, z3.b[3]\n"
+      "sdot z26.s, z6.b, z4.b[3]\n"
+      "sdot z11.s, z7.b, z0.b[3]\n"
+      "sdot z15.s, z7.b, z1.b[3]\n"
+      "sdot z19.s, z7.b, z2.b[3]\n"
+      "sdot z23.s, z7.b, z3.b[3]\n"
+      "sdot z27.s, z7.b, z4.b[3]\n"
+      "59:"  // Height 5: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 54b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "st1w { z8.s }, p4, [x13]\n"
+      "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+      "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+      "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "st1w { z12.s }, p4, [x9]\n"
+      "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+      "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+      "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+      "addvl x9, x9, #4\n"
+      "st1w { z16.s }, p4, [x27]\n"
+      "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+      "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+      "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+      "addvl x27, x27, #4\n"
+      "st1w { z20.s }, p4, [x25]\n"
+      "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+      "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+      "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+      "addvl x25, x25, #4\n"
+      "st1w { z24.s }, p4, [x23]\n"
+      "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
+      "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
+      "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+      "addvl x23, x23, #4\n"
+      "60:"  // Height 5: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x15, x15, x19\n"
+      "bgt 51b\n"
+      "b 74f\n"
+      "61:"  // Height 6
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 62f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "ldr x23, [%x[output_ptr], #0x20]\n"
+      "add x27, x27, x19, LSL #2\n"
+      "ldr x21, [%x[output_ptr], #0x28]\n"
+      "add %x[output_ptr], %x[output_ptr], #0x30\n"
+      "add x25, x25, x19, LSL #2\n"
+      "add x23, x23, x19, LSL #2\n"
+      "add x21, x21, x19, LSL #2\n"
+      "b 63f\n"
+      "62:"  // Height 6: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "add x23, x25, x19, LSL #2\n"
+      "add x21, x23, x19, LSL #2\n"
+      "add %x[output_ptr], x21, x19, LSL #2\n"
+      "63:"  // Height 6: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p3.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p2.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p1.s, x19, x15\n"
+      "tbz %x[flags], #0, 64f\n"
+      "ld1w { z8.s }, p4/Z, [x13]\n"
+      "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x9]\n"
+      "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x27]\n"
+      "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x25]\n"
+      "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+      "ld1w { z24.s }, p4/Z, [x23]\n"
+      "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z28.s }, p4/Z, [x21]\n"
+      "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "b 65f\n"
+      "64:"  // Height 6: no accumulate
+      "mov z8.s, #0x0\n"
+      "mov z9.s, #0x0\n"
+      "mov z10.s, #0x0\n"
+      "mov z11.s, #0x0\n"
+      "mov z12.s, #0x0\n"
+      "mov z13.s, #0x0\n"
+      "mov z14.s, #0x0\n"
+      "mov z15.s, #0x0\n"
+      "mov z16.s, #0x0\n"
+      "mov z17.s, #0x0\n"
+      "mov z18.s, #0x0\n"
+      "mov z19.s, #0x0\n"
+      "mov z20.s, #0x0\n"
+      "mov z21.s, #0x0\n"
+      "mov z22.s, #0x0\n"
+      "mov z23.s, #0x0\n"
+      "mov z24.s, #0x0\n"
+      "mov z25.s, #0x0\n"
+      "mov z26.s, #0x0\n"
+      "mov z27.s, #0x0\n"
+      "mov z28.s, #0x0\n"
+      "mov z29.s, #0x0\n"
+      "mov z30.s, #0x0\n"
+      "mov z31.s, #0x0\n"
+      "65:"  // Height 6: setup done
+      "mov x12, #0x0\n"
+      "66:"  // Height 6: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 67f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x20, [x20, #0x28]\n"
+      "cbnz x12, 68f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "add x22, x22, x19\n"
+      "add x20, x20, x19\n"
+      "b 68f\n"
+      "67:"  // Height 6: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "add x26, x28, x19\n"
+      "add x24, x26, x19\n"
+      "add x22, x24, x19\n"
+      "add x20, x22, x19\n"
+      "68:"  // Height 6: input setup done
+      "cmp x11, #0x10\n"
+      "ble 70f\n"
+      "69:"  // Height 6: Multiply loop: Main loop head
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "sub x11, x11, #0x10\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "sdot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "sdot z9.s, z7.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "sdot z12.s, z6.b, z1.b[0]\n"
+      "ld1rqb { z3.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "sdot z16.s, z6.b, z2.b[0]\n"
+      "ld1rqb { z4.b }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "sdot z13.s, z7.b, z1.b[0]\n"
+      "ld1rqb { z5.b }, p0/Z, [x20]\n"
+      "add x22, x22, #0x10\n"
+      "sdot z20.s, z6.b, z3.b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x20, x20, #0x10\n"
+      "sdot z24.s, z6.b, z4.b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "cmp x11, #0x10\n"
+      "sdot z28.s, z6.b, z5.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "sdot z17.s, z7.b, z2.b[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "sdot z21.s, z7.b, z3.b[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "sdot z25.s, z7.b, z4.b[0]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "sdot z29.s, z7.b, z5.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[0]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "sdot z14.s, z6.b, z1.b[0]\n"
+      "sdot z18.s, z6.b, z2.b[0]\n"
+      "sdot z22.s, z6.b, z3.b[0]\n"
+      "sdot z26.s, z6.b, z4.b[0]\n"
+      "sdot z30.s, z6.b, z5.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[0]\n"
+      "sdot z15.s, z7.b, z1.b[0]\n"
+      "sdot z19.s, z7.b, z2.b[0]\n"
+      "sdot z23.s, z7.b, z3.b[0]\n"
+      "sdot z27.s, z7.b, z4.b[0]\n"
+      "sdot z31.s, z7.b, z5.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[1]\n"
+      "sdot z12.s, z6.b, z1.b[1]\n"
+      "sdot z16.s, z6.b, z2.b[1]\n"
+      "sdot z20.s, z6.b, z3.b[1]\n"
+      "sdot z24.s, z6.b, z4.b[1]\n"
+      "sdot z28.s, z6.b, z5.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[1]\n"
+      "sdot z13.s, z7.b, z1.b[1]\n"
+      "sdot z17.s, z7.b, z2.b[1]\n"
+      "sdot z21.s, z7.b, z3.b[1]\n"
+      "sdot z25.s, z7.b, z4.b[1]\n"
+      "sdot z29.s, z7.b, z5.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+      "addvl x14, x14, #16\n"
+      "sdot z10.s, z6.b, z0.b[1]\n"
+      "sdot z14.s, z6.b, z1.b[1]\n"
+      "sdot z18.s, z6.b, z2.b[1]\n"
+      "sdot z22.s, z6.b, z3.b[1]\n"
+      "sdot z26.s, z6.b, z4.b[1]\n"
+      "sdot z30.s, z6.b, z5.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[1]\n"
+      "sdot z15.s, z7.b, z1.b[1]\n"
+      "sdot z19.s, z7.b, z2.b[1]\n"
+      "sdot z23.s, z7.b, z3.b[1]\n"
+      "sdot z27.s, z7.b, z4.b[1]\n"
+      "sdot z31.s, z7.b, z5.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[2]\n"
+      "sdot z12.s, z6.b, z1.b[2]\n"
+      "sdot z16.s, z6.b, z2.b[2]\n"
+      "sdot z20.s, z6.b, z3.b[2]\n"
+      "sdot z24.s, z6.b, z4.b[2]\n"
+      "sdot z28.s, z6.b, z5.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[2]\n"
+      "sdot z13.s, z7.b, z1.b[2]\n"
+      "sdot z17.s, z7.b, z2.b[2]\n"
+      "sdot z21.s, z7.b, z3.b[2]\n"
+      "sdot z25.s, z7.b, z4.b[2]\n"
+      "sdot z29.s, z7.b, z5.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[2]\n"
+      "sdot z14.s, z6.b, z1.b[2]\n"
+      "sdot z18.s, z6.b, z2.b[2]\n"
+      "sdot z22.s, z6.b, z3.b[2]\n"
+      "sdot z26.s, z6.b, z4.b[2]\n"
+      "sdot z30.s, z6.b, z5.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+      "sdot z11.s, z7.b, z0.b[2]\n"
+      "sdot z15.s, z7.b, z1.b[2]\n"
+      "sdot z19.s, z7.b, z2.b[2]\n"
+      "sdot z23.s, z7.b, z3.b[2]\n"
+      "sdot z27.s, z7.b, z4.b[2]\n"
+      "sdot z31.s, z7.b, z5.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+      "sdot z8.s, z6.b, z0.b[3]\n"
+      "sdot z12.s, z6.b, z1.b[3]\n"
+      "sdot z16.s, z6.b, z2.b[3]\n"
+      "sdot z20.s, z6.b, z3.b[3]\n"
+      "sdot z24.s, z6.b, z4.b[3]\n"
+      "sdot z28.s, z6.b, z5.b[3]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[3]\n"
+      "sdot z13.s, z7.b, z1.b[3]\n"
+      "sdot z17.s, z7.b, z2.b[3]\n"
+      "sdot z21.s, z7.b, z3.b[3]\n"
+      "sdot z25.s, z7.b, z4.b[3]\n"
+      "sdot z29.s, z7.b, z5.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+      "sdot z10.s, z6.b, z0.b[3]\n"
+      "sdot z14.s, z6.b, z1.b[3]\n"
+      "sdot z18.s, z6.b, z2.b[3]\n"
+      "sdot z22.s, z6.b, z3.b[3]\n"
+      "sdot z26.s, z6.b, z4.b[3]\n"
+      "sdot z30.s, z6.b, z5.b[3]\n"
+      "sdot z11.s, z7.b, z0.b[3]\n"
+      "sdot z15.s, z7.b, z1.b[3]\n"
+      "sdot z19.s, z7.b, z2.b[3]\n"
+      "sdot z23.s, z7.b, z3.b[3]\n"
+      "sdot z27.s, z7.b, z4.b[3]\n"
+      "sdot z31.s, z7.b, z5.b[3]\n"
+      "bgt 69b\n"
+      "70:"  // Height 6: Multiply loop: Single iteration only
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "sdot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "sdot z9.s, z7.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "sdot z12.s, z6.b, z1.b[0]\n"
+      "ld1rqb { z3.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "sdot z16.s, z6.b, z2.b[0]\n"
+      "ld1rqb { z4.b }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "sdot z13.s, z7.b, z1.b[0]\n"
+      "ld1rqb { z5.b }, p0/Z, [x20]\n"
+      "add x22, x22, #0x10\n"
+      "sdot z20.s, z6.b, z3.b[0]\n"
+      "add x20, x20, #0x10\n"
+      "sdot z17.s, z7.b, z2.b[0]\n"
+      "sdot z24.s, z6.b, z4.b[0]\n"
+      "sdot z28.s, z6.b, z5.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "sdot z21.s, z7.b, z3.b[0]\n"
+      "sdot z25.s, z7.b, z4.b[0]\n"
+      "sdot z29.s, z7.b, z5.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[0]\n"
+      "sdot z14.s, z6.b, z1.b[0]\n"
+      "sdot z18.s, z6.b, z2.b[0]\n"
+      "sdot z22.s, z6.b, z3.b[0]\n"
+      "sdot z26.s, z6.b, z4.b[0]\n"
+      "sdot z30.s, z6.b, z5.b[0]\n"
+      "sdot z11.s, z7.b, z0.b[0]\n"
+      "sdot z15.s, z7.b, z1.b[0]\n"
+      "sdot z19.s, z7.b, z2.b[0]\n"
+      "sdot z23.s, z7.b, z3.b[0]\n"
+      "sdot z27.s, z7.b, z4.b[0]\n"
+      "sdot z31.s, z7.b, z5.b[0]\n"
+      "ble 71f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "sdot z12.s, z6.b, z1.b[1]\n"
+      "sdot z16.s, z6.b, z2.b[1]\n"
+      "sdot z20.s, z6.b, z3.b[1]\n"
+      "sdot z24.s, z6.b, z4.b[1]\n"
+      "sdot z28.s, z6.b, z5.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[1]\n"
+      "sdot z13.s, z7.b, z1.b[1]\n"
+      "sdot z17.s, z7.b, z2.b[1]\n"
+      "sdot z21.s, z7.b, z3.b[1]\n"
+      "sdot z25.s, z7.b, z4.b[1]\n"
+      "sdot z29.s, z7.b, z5.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[1]\n"
+      "sdot z14.s, z6.b, z1.b[1]\n"
+      "sdot z18.s, z6.b, z2.b[1]\n"
+      "sdot z22.s, z6.b, z3.b[1]\n"
+      "sdot z26.s, z6.b, z4.b[1]\n"
+      "sdot z30.s, z6.b, z5.b[1]\n"
+      "sdot z11.s, z7.b, z0.b[1]\n"
+      "sdot z15.s, z7.b, z1.b[1]\n"
+      "sdot z19.s, z7.b, z2.b[1]\n"
+      "sdot z23.s, z7.b, z3.b[1]\n"
+      "sdot z27.s, z7.b, z4.b[1]\n"
+      "sdot z31.s, z7.b, z5.b[1]\n"
+      "ble 71f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "sdot z12.s, z6.b, z1.b[2]\n"
+      "sdot z16.s, z6.b, z2.b[2]\n"
+      "sdot z20.s, z6.b, z3.b[2]\n"
+      "sdot z24.s, z6.b, z4.b[2]\n"
+      "sdot z28.s, z6.b, z5.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[2]\n"
+      "sdot z13.s, z7.b, z1.b[2]\n"
+      "sdot z17.s, z7.b, z2.b[2]\n"
+      "sdot z21.s, z7.b, z3.b[2]\n"
+      "sdot z25.s, z7.b, z4.b[2]\n"
+      "sdot z29.s, z7.b, z5.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[2]\n"
+      "sdot z14.s, z6.b, z1.b[2]\n"
+      "sdot z18.s, z6.b, z2.b[2]\n"
+      "sdot z22.s, z6.b, z3.b[2]\n"
+      "sdot z26.s, z6.b, z4.b[2]\n"
+      "sdot z30.s, z6.b, z5.b[2]\n"
+      "sdot z11.s, z7.b, z0.b[2]\n"
+      "sdot z15.s, z7.b, z1.b[2]\n"
+      "sdot z19.s, z7.b, z2.b[2]\n"
+      "sdot z23.s, z7.b, z3.b[2]\n"
+      "sdot z27.s, z7.b, z4.b[2]\n"
+      "sdot z31.s, z7.b, z5.b[2]\n"
+      "ble 71f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "sdot z8.s, z6.b, z0.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "sdot z12.s, z6.b, z1.b[3]\n"
+      "sdot z16.s, z6.b, z2.b[3]\n"
+      "sdot z20.s, z6.b, z3.b[3]\n"
+      "sdot z24.s, z6.b, z4.b[3]\n"
+      "sdot z28.s, z6.b, z5.b[3]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "sdot z9.s, z7.b, z0.b[3]\n"
+      "sdot z13.s, z7.b, z1.b[3]\n"
+      "sdot z17.s, z7.b, z2.b[3]\n"
+      "sdot z21.s, z7.b, z3.b[3]\n"
+      "sdot z25.s, z7.b, z4.b[3]\n"
+      "sdot z29.s, z7.b, z5.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "sdot z10.s, z6.b, z0.b[3]\n"
+      "sdot z14.s, z6.b, z1.b[3]\n"
+      "sdot z18.s, z6.b, z2.b[3]\n"
+      "sdot z22.s, z6.b, z3.b[3]\n"
+      "sdot z26.s, z6.b, z4.b[3]\n"
+      "sdot z30.s, z6.b, z5.b[3]\n"
+      "sdot z11.s, z7.b, z0.b[3]\n"
+      "sdot z15.s, z7.b, z1.b[3]\n"
+      "sdot z19.s, z7.b, z2.b[3]\n"
+      "sdot z23.s, z7.b, z3.b[3]\n"
+      "sdot z27.s, z7.b, z4.b[3]\n"
+      "sdot z31.s, z7.b, z5.b[3]\n"
+      "71:"  // Height 6: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 66b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "prfm pstl1keep, [x21, #0x0]\n"
+      "st1w { z8.s }, p4, [x13]\n"
+      "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+      "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+      "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "st1w { z12.s }, p4, [x9]\n"
+      "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+      "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+      "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+      "addvl x9, x9, #4\n"
+      "st1w { z16.s }, p4, [x27]\n"
+      "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+      "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+      "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+      "addvl x27, x27, #4\n"
+      "st1w { z20.s }, p4, [x25]\n"
+      "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+      "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+      "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+      "addvl x25, x25, #4\n"
+      "st1w { z24.s }, p4, [x23]\n"
+      "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
+      "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
+      "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+      "addvl x23, x23, #4\n"
+      "st1w { z28.s }, p4, [x21]\n"
+      "st1w { z29.s }, p3, [x21, #1, MUL VL]\n"
+      "st1w { z30.s }, p2, [x21, #2, MUL VL]\n"
+      "st1w { z31.s }, p1, [x21, #3, MUL VL]\n"
+      "addvl x21, x21, #4\n"
+      "72:"  // Height 6: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x15, x15, x19\n"
+      "bgt 63b\n"
+      "subs %x[M], %x[M], #0x6\n"
+      "beq 74f\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "tbz %x[flags], #3, 73f\n"
+      "add x20, x20, #0x6\n"
+      "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "b 1b\n"
+      "73:"  // Update direct input
+      "mov x19, #0x6\n"
+      "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+      "b 1b\n"
+      "74:"  // Exit
+
+      : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+      : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+      : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+    );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp
index c325e522d7..964f7cc2c1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,37 +10,43 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
  */
 #pragma once
-
 #ifdef __ARM_FEATURE_SVE
 
-#include <cstdint>
 #include "../std_transforms_sve.hpp"
 
+#define ARGLIST  \
+   unsigned int, const unsigned int *, \
+   IndirectInputArg<uint8_t>, \
+   size_t, size_t, \
+   const uint8_t *, \
+   IndirectOutputArg<uint8_t>, \
+   const Requantize32 *, const int32_t *, unsigned int
+
 namespace arm_gemm
 {
 
 // Actual kernel implementations
-void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+void sve_hybrid_u8qa_dot_4x4VL( ARGLIST );
 
-class hybrid_u8u32_dot_4VLx4
+class cls_sve_hybrid_u8qa_dot_4x4VL
 {
 public:
     typedef uint8_t operand_type;
-    typedef uint32_t result_type;
+    typedef uint8_t result_type;
 
-    typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+    typedef void (*kern_type)( ARGLIST );
 
     /* Kernel blocking parameters */
     static constexpr unsigned int out_height()
@@ -60,30 +66,20 @@ public:
 
     static constexpr bool supports_accumulate()
     {
-        return true;
-    }
-
-    static constexpr bool supports_bias()
-    {
-        return false;
-    }
-
-    static constexpr bool supports_activation()
-    {
         return false;
     }
 
     StdTransformsSVE<operand_type, result_type, 4, 4, 4> transforms = {};
 
     // Default to the generic kernel
-    kern_type kernel=sve_hybrid_u8u32_dot_4VLx4;
+    kern_type kernel=sve_hybrid_u8qa_dot_4x4VL;
 
-    hybrid_u8u32_dot_4VLx4(const CPUInfo *)
+    cls_sve_hybrid_u8qa_dot_4x4VL(const CPUInfo *)
     {
-
     }
 };
 
 } // namespace arm_gemm
 
+#undef ARGLIST
 #endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
new file mode 100644
index 0000000000..0a6546b78a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
@@ -0,0 +1,1602 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void sve_hybrid_u8qa_dot_4x4VL (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+    size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint8_t> output_arg,
+    const Requantize32 *qp, const int32_t *col_bias, unsigned int
+)
+{
+    struct KernelArgs {  // Argument block the asm reads as [args_ptr + offsetof_<field>]; presumably args_ptr is &ka (operand list not visible here)
+        unsigned int num_strings = {};  // copied from the num_strings parameter
+        const unsigned int *string_lengths = {};  // array of 32-bit lengths; asm loads the entry for the current string index as its multiply-loop count
+        size_t N = {};  // copied from parameter N; asm uses it as the upper bound of the column predicate (whilelt)
+        const uint8_t *B_ptr = {};  // copied from parameter B_ptr; base address for the asm's ld1b loads of B
+        size_t output_offset = {};  // indirect output: output_arg.indirect.offset; direct output: output_arg.direct.stride
+        size_t input_initial_col = {};  // indirect input only: A_arg.indirect.start_col; asm adds it to the row pointer for string 0 only
+        size_t input_offset = {};  // indirect input: A_arg.indirect.start_row; direct input: A_arg.direct.stride
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    if (qp->c_offset > qp->minval) {
+        flags |= 0x20;
+    }
+    __asm__ __volatile__(
+      "ptrue p2.b\n"
+      "1:"  // Row loop
+      "cmp %x[M], #0x4\n"
+      "bge 46f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 31f\n"
+      "beq 16f\n"
+      "mov z11.s, #0x0\n"
+      "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov z12.s, #0x0\n"
+      "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x10, %x[col_bias]\n"
+      "mov z13.s, #0x0\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "bic %x[flags], %x[flags], #0x80000000\n"
+      "mov z14.s, #0x0\n"
+      "mov z15.b, #0x1\n"
+      "tbz %x[flags], #2, 2f\n"
+      "ldr x9, [%x[output_ptr], #0x0]\n"
+      "add x9, x9, x19\n"
+      "b 3f\n"
+      "2:"  // Height 1: setup direct output
+      "mov x9, %x[output_ptr]\n"
+      "3:"  // Height 1: Column loop
+      "mov z16.s, #0x0\n"
+      "mov x19, #0x0\n"
+      "mov z17.s, #0x0\n"
+      "whilelt p1.b, x19, x12\n"
+      "mov z18.s, #0x0\n"
+      "mov z19.s, #0x0\n"
+      "4:"  // Height 1: setup done
+      "mov x28, #0x0\n"
+      "5:"  // Height 1: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w27, [x20, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 6f\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "cbnz x28, 7f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "b 7f\n"
+      "6:"  // Height 1: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "7:"  // Height 1: input setup done
+      "cmp x27, #0x10\n"
+      "ble 10f\n"
+      "8:"  // Height 1: Multiply loop: Main loop head
+      "ld1b { z4.b }, p2/Z, [x11]\n"
+      "whilelt p0.b, XZR, x27\n"
+      "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "ld1rqb { z0.b }, p0/Z, [x26]\n"
+      "udot z16.s, z4.b, z0.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "add x26, x26, #0x10\n"
+      "udot z17.s, z5.b, z0.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+      "udot z18.s, z6.b, z0.b[0]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+      "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+      "udot z19.s, z7.b, z0.b[0]\n"
+      "udot z16.s, z8.b, z0.b[1]\n"
+      "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
+      "addvl x11, x11, #16\n"
+      "udot z17.s, z9.b, z0.b[1]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+      "udot z18.s, z10.b, z0.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+      "udot z19.s, z4.b, z0.b[1]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+      "udot z16.s, z5.b, z0.b[2]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+      "udot z17.s, z6.b, z0.b[2]\n"
+      "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+      "udot z18.s, z7.b, z0.b[2]\n"
+      "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+      "udot z19.s, z8.b, z0.b[2]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+      "udot z16.s, z9.b, z0.b[3]\n"
+      "udot z17.s, z10.b, z0.b[3]\n"
+      "udot z18.s, z4.b, z0.b[3]\n"
+      "udot z19.s, z5.b, z0.b[3]\n"
+      "tbnz %x[flags], #31, 9f\n"
+      "udot z11.s, z0.b, z15.b\n"
+      "9:"  // Height 1: Multiply loop: unique 1: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "sub x27, x27, #0x10\n"
+      "cmp x27, #0x10\n"
+      "bgt 8b\n"
+      "10:"  // Height 1: Multiply loop: Single iteration only
+      "ld1b { z6.b }, p2/Z, [x11]\n"
+      "whilelt p0.b, XZR, x27\n"
+      "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "subs x27, x27, #0x4\n"
+      "ld1rqb { z0.b }, p0/Z, [x26]\n"
+      "udot z16.s, z6.b, z0.b[0]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "add x26, x26, #0x10\n"
+      "udot z17.s, z7.b, z0.b[0]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "addvl x11, x11, #4\n"
+      "udot z18.s, z8.b, z0.b[0]\n"
+      "udot z19.s, z9.b, z0.b[0]\n"
+      "ble 11f\n"
+      "ld1b { z10.b }, p2/Z, [x11]\n"
+      "udot z16.s, z10.b, z0.b[1]\n"
+      "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "subs x27, x27, #0x4\n"
+      "udot z17.s, z4.b, z0.b[1]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "udot z18.s, z5.b, z0.b[1]\n"
+      "addvl x11, x11, #4\n"
+      "udot z19.s, z6.b, z0.b[1]\n"
+      "ble 11f\n"
+      "ld1b { z7.b }, p2/Z, [x11]\n"
+      "udot z16.s, z7.b, z0.b[2]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "subs x27, x27, #0x4\n"
+      "udot z17.s, z8.b, z0.b[2]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "udot z18.s, z9.b, z0.b[2]\n"
+      "addvl x11, x11, #4\n"
+      "udot z19.s, z10.b, z0.b[2]\n"
+      "ble 11f\n"
+      "ld1b { z4.b }, p2/Z, [x11]\n"
+      "udot z16.s, z4.b, z0.b[3]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "udot z17.s, z5.b, z0.b[3]\n"
+      "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "addvl x11, x11, #4\n"
+      "udot z18.s, z6.b, z0.b[3]\n"
+      "udot z19.s, z7.b, z0.b[3]\n"
+      "11:"  // Height 1: Multiply loop: multiply skip
+      "tbnz %x[flags], #31, 12f\n"
+      "udot z11.s, z0.b, z15.b\n"
+      "12:"  // Height 1: Multiply loop: unique 2: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "add x28, x28, #0x1\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x28, x19\n"
+      "bne 5b\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "tbnz %x[flags], #31, 13f\n"
+      "add x19, %x[qp], %[b_offset]\n"
+      "ld1rw { z1.s }, p2/Z, [x19]\n"
+      "neg z1.s, p2/M, z1.s\n"
+      "mov x19, #0x4\n"
+      "whilelt p0.s, XZR, x19\n"
+      "uaddv d11, p0, z11.s\n"
+      "mov z11.s, z11.s[0]\n"
+      "mul z11.s, p2/M, z11.s, z1.s\n"
+      "13:"  // Height 1: skip row sum fixup
+      "add z16.s, z16.s, z11.s\n"
+      "ld1w { z0.s }, p2/Z, [x10]\n"
+      "orr %x[flags], %x[flags], #0x80000000\n"
+      "add z17.s, z17.s, z11.s\n"
+      "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "add z18.s, z18.s, z11.s\n"
+      "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+      "add x19, %x[qp], %[per_layer_mul]\n"
+      "add z19.s, z19.s, z11.s\n"
+      "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+      "addvl x10, x10, #4\n"
+      "add z16.s, z16.s, z0.s\n"
+      "ld1rw { z0.s }, p2/Z, [x20]\n"
+      "add z17.s, z17.s, z1.s\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      "add z18.s, z18.s, z2.s\n"
+      "add z19.s, z19.s, z3.s\n"
+      ".inst 0x04a47610  // sqrdmulh z16.s, z16.s, z4.s\n"
+      ".inst 0x04a47631  // sqrdmulh z17.s, z17.s, z4.s\n"
+      ".inst 0x04a47652  // sqrdmulh z18.s, z18.s, z4.s\n"
+      ".inst 0x04a47673  // sqrdmulh z19.s, z19.s, z4.s\n"
+      "tbz %x[flags], #5, 14f\n"
+      "and z4.d, z16.d, z0.d\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "and z5.d, z17.d, z0.d\n"
+      "and z6.d, z18.d, z0.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "and z7.d, z19.d, z0.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z16.s, z16.s, z4.s\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "sqadd z17.s, z17.s, z5.s\n"
+      "sqadd z18.s, z18.s, z6.s\n"
+      "sqadd z19.s, z19.s, z7.s\n"
+      "14:"  // Height 1: no shift correction
+      ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
+      "add x19, %x[qp], %[c_offset]\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      ".inst 0x44828811  // srshl z17.s, p2/M, z17.s, z0.s\n"
+      "add x19, %x[qp], %[minval]\n"
+      ".inst 0x44828812  // srshl z18.s, p2/M, z18.s, z0.s\n"
+      "ld1rw { z5.s }, p2/Z, [x19]\n"
+      "add x19, %x[qp], %[maxval]\n"
+      ".inst 0x44828813  // srshl z19.s, p2/M, z19.s, z0.s\n"
+      "ld1rw { z6.s }, p2/Z, [x19]\n"
+      "add z16.s, z16.s, z4.s\n"
+      "add z17.s, z17.s, z4.s\n"
+      "add z18.s, z18.s, z4.s\n"
+      "add z19.s, z19.s, z4.s\n"
+      "smin z16.s, p2/M, z16.s, z6.s\n"
+      "smin z17.s, p2/M, z17.s, z6.s\n"
+      "smin z18.s, p2/M, z18.s, z6.s\n"
+      "smin z19.s, p2/M, z19.s, z6.s\n"
+      "smax z16.s, p2/M, z16.s, z5.s\n"
+      "smax z17.s, p2/M, z17.s, z5.s\n"
+      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "smax z19.s, p2/M, z19.s, z5.s\n"
+      "uzp1 z16.h, z16.h, z17.h\n"
+      "uzp1 z17.h, z18.h, z19.h\n"
+      "uzp1 z16.b, z16.b, z17.b\n"
+      "st1b { z16.b }, p1, [x9]\n"
+      "addvl x9, x9, #1\n"
+      "15:"  // Height 1: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x12, x12, x19\n"
+      "bgt 3b\n"
+      "b 62f\n"
+      "16:"  // Height 2
+      "mov z11.s, #0x0\n"
+      "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x10, %x[col_bias]\n"
+      "mov z12.s, #0x0\n"
+      "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "bic %x[flags], %x[flags], #0x80000000\n"
+      "mov z13.s, #0x0\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "mov z14.s, #0x0\n"
+      "mov z15.b, #0x1\n"
+      "tbz %x[flags], #2, 17f\n"
+      "ldr x9, [%x[output_ptr], #0x0]\n"
+      "ldr x25, [%x[output_ptr], #0x8]\n"
+      "add x9, x9, x19\n"
+      "add x25, x25, x19\n"
+      "b 18f\n"
+      "17:"  // Height 2: setup direct output
+      "mov x9, %x[output_ptr]\n"
+      "add x25, x9, x19\n"
+      "18:"  // Height 2: Column loop
+      "mov z16.s, #0x0\n"
+      "mov x19, #0x0\n"
+      "mov z17.s, #0x0\n"
+      "whilelt p1.b, x19, x12\n"
+      "mov z18.s, #0x0\n"
+      "mov z19.s, #0x0\n"
+      "mov z20.s, #0x0\n"
+      "mov z21.s, #0x0\n"
+      "mov z22.s, #0x0\n"
+      "mov z23.s, #0x0\n"
+      "19:"  // Height 2: setup done
+      "mov x28, #0x0\n"
+      "20:"  // Height 2: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w27, [x20, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 21f\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x24, [x20, #0x8]\n"
+      "cbnz x28, 22f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "b 22f\n"
+      "21:"  // Height 2: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "add x24, x26, x19\n"
+      "22:"  // Height 2: input setup done
+      "cmp x27, #0x10\n"
+      "ble 25f\n"
+      "23:"  // Height 2: Multiply loop: Main loop head
+      "ld1b { z4.b }, p2/Z, [x11]\n"
+      "whilelt p0.b, XZR, x27\n"
+      "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "ld1rqb { z0.b }, p0/Z, [x26]\n"
+      "udot z16.s, z4.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "udot z17.s, z5.b, z0.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "add x24, x24, #0x10\n"
+      "udot z20.s, z4.b, z1.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "udot z21.s, z5.b, z1.b[0]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+      "udot z18.s, z6.b, z0.b[0]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+      "udot z22.s, z6.b, z1.b[0]\n"
+      "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+      "udot z19.s, z7.b, z0.b[0]\n"
+      "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
+      "addvl x11, x11, #16\n"
+      "udot z23.s, z7.b, z1.b[0]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+      "udot z16.s, z8.b, z0.b[1]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+      "udot z20.s, z8.b, z1.b[1]\n"
+      "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+      "udot z17.s, z9.b, z0.b[1]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+      "udot z21.s, z9.b, z1.b[1]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+      "udot z18.s, z10.b, z0.b[1]\n"
+      "udot z22.s, z10.b, z1.b[1]\n"
+      "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+      "udot z19.s, z4.b, z0.b[1]\n"
+      "udot z23.s, z4.b, z1.b[1]\n"
+      "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+      "udot z16.s, z5.b, z0.b[2]\n"
+      "udot z20.s, z5.b, z1.b[2]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+      "udot z17.s, z6.b, z0.b[2]\n"
+      "udot z21.s, z6.b, z1.b[2]\n"
+      "udot z18.s, z7.b, z0.b[2]\n"
+      "udot z22.s, z7.b, z1.b[2]\n"
+      "udot z19.s, z8.b, z0.b[2]\n"
+      "udot z23.s, z8.b, z1.b[2]\n"
+      "udot z16.s, z9.b, z0.b[3]\n"
+      "udot z20.s, z9.b, z1.b[3]\n"
+      "udot z17.s, z10.b, z0.b[3]\n"
+      "udot z21.s, z10.b, z1.b[3]\n"
+      "udot z18.s, z4.b, z0.b[3]\n"
+      "udot z22.s, z4.b, z1.b[3]\n"
+      "udot z19.s, z5.b, z0.b[3]\n"
+      "udot z23.s, z5.b, z1.b[3]\n"
+      "tbnz %x[flags], #31, 24f\n"
+      "udot z11.s, z0.b, z15.b\n"
+      "udot z12.s, z1.b, z15.b\n"
+      "24:"  // Height 2: Multiply loop: unique 3: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "sub x27, x27, #0x10\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "cmp x27, #0x10\n"
+      "bgt 23b\n"
+      "25:"  // Height 2: Multiply loop: Single iteration only
+      "ld1b { z6.b }, p2/Z, [x11]\n"
+      "whilelt p0.b, XZR, x27\n"
+      "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "subs x27, x27, #0x4\n"
+      "ld1rqb { z0.b }, p0/Z, [x26]\n"
+      "udot z16.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "udot z17.s, z7.b, z0.b[0]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "add x24, x24, #0x10\n"
+      "udot z20.s, z6.b, z1.b[0]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "addvl x11, x11, #4\n"
+      "udot z21.s, z7.b, z1.b[0]\n"
+      "udot z18.s, z8.b, z0.b[0]\n"
+      "udot z22.s, z8.b, z1.b[0]\n"
+      "udot z19.s, z9.b, z0.b[0]\n"
+      "udot z23.s, z9.b, z1.b[0]\n"
+      "ble 26f\n"
+      "ld1b { z10.b }, p2/Z, [x11]\n"
+      "udot z16.s, z10.b, z0.b[1]\n"
+      "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "subs x27, x27, #0x4\n"
+      "udot z20.s, z10.b, z1.b[1]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "udot z17.s, z4.b, z0.b[1]\n"
+      "addvl x11, x11, #4\n"
+      "udot z21.s, z4.b, z1.b[1]\n"
+      "udot z18.s, z5.b, z0.b[1]\n"
+      "udot z22.s, z5.b, z1.b[1]\n"
+      "udot z19.s, z6.b, z0.b[1]\n"
+      "udot z23.s, z6.b, z1.b[1]\n"
+      "ble 26f\n"
+      "ld1b { z7.b }, p2/Z, [x11]\n"
+      "udot z16.s, z7.b, z0.b[2]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "subs x27, x27, #0x4\n"
+      "udot z20.s, z7.b, z1.b[2]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "udot z17.s, z8.b, z0.b[2]\n"
+      "addvl x11, x11, #4\n"
+      "udot z21.s, z8.b, z1.b[2]\n"
+      "udot z18.s, z9.b, z0.b[2]\n"
+      "udot z22.s, z9.b, z1.b[2]\n"
+      "udot z19.s, z10.b, z0.b[2]\n"
+      "udot z23.s, z10.b, z1.b[2]\n"
+      "ble 26f\n"
+      "ld1b { z4.b }, p2/Z, [x11]\n"
+      "udot z16.s, z4.b, z0.b[3]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "udot z20.s, z4.b, z1.b[3]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "udot z17.s, z5.b, z0.b[3]\n"
+      "addvl x11, x11, #4\n"
+      "udot z21.s, z5.b, z1.b[3]\n"
+      "udot z18.s, z6.b, z0.b[3]\n"
+      "udot z22.s, z6.b, z1.b[3]\n"
+      "udot z19.s, z7.b, z0.b[3]\n"
+      "udot z23.s, z7.b, z1.b[3]\n"
+      "26:"  // Height 2: Multiply loop: multiply skip
+      "tbnz %x[flags], #31, 27f\n"
+      "udot z11.s, z0.b, z15.b\n"
+      "udot z12.s, z1.b, z15.b\n"
+      "27:"  // Height 2: Multiply loop: unique 4: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "add x28, x28, #0x1\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x28, x19\n"
+      "bne 20b\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "tbnz %x[flags], #31, 28f\n"
+      "add x19, %x[qp], %[b_offset]\n"
+      "ld1rw { z2.s }, p2/Z, [x19]\n"
+      "neg z2.s, p2/M, z2.s\n"
+      "mov x20, #0x4\n"
+      "mov x19, #0x4\n"
+      "whilelt p0.s, XZR, x20\n"
+      "uaddv d11, p0, z11.s\n"
+      "whilelt p0.s, XZR, x19\n"
+      "uaddv d12, p0, z12.s\n"
+      "mov z11.s, z11.s[0]\n"
+      "mov z12.s, z12.s[0]\n"
+      "mul z11.s, p2/M, z11.s, z2.s\n"
+      "mul z12.s, p2/M, z12.s, z2.s\n"
+      "28:"  // Height 2: skip row sum fixup
+      "add z16.s, z16.s, z11.s\n"
+      "ld1w { z0.s }, p2/Z, [x10]\n"
+      "orr %x[flags], %x[flags], #0x80000000\n"
+      "add z17.s, z17.s, z11.s\n"
+      "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "add z18.s, z18.s, z11.s\n"
+      "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+      "add x19, %x[qp], %[per_layer_mul]\n"
+      "add z19.s, z19.s, z11.s\n"
+      "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+      "addvl x10, x10, #4\n"
+      "add z20.s, z20.s, z12.s\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      "add z21.s, z21.s, z12.s\n"
+      "add z22.s, z22.s, z12.s\n"
+      "add z23.s, z23.s, z12.s\n"
+      "add z16.s, z16.s, z0.s\n"
+      "add z17.s, z17.s, z1.s\n"
+      "add z18.s, z18.s, z2.s\n"
+      "add z19.s, z19.s, z3.s\n"
+      "add z20.s, z20.s, z0.s\n"
+      "ld1rw { z0.s }, p2/Z, [x20]\n"
+      "add z21.s, z21.s, z1.s\n"
+      "add z22.s, z22.s, z2.s\n"
+      "add z23.s, z23.s, z3.s\n"
+      ".inst 0x04a47610  // sqrdmulh z16.s, z16.s, z4.s\n"
+      ".inst 0x04a47631  // sqrdmulh z17.s, z17.s, z4.s\n"
+      ".inst 0x04a47652  // sqrdmulh z18.s, z18.s, z4.s\n"
+      ".inst 0x04a47673  // sqrdmulh z19.s, z19.s, z4.s\n"
+      ".inst 0x04a47694  // sqrdmulh z20.s, z20.s, z4.s\n"
+      ".inst 0x04a476b5  // sqrdmulh z21.s, z21.s, z4.s\n"
+      ".inst 0x04a476d6  // sqrdmulh z22.s, z22.s, z4.s\n"
+      ".inst 0x04a476f7  // sqrdmulh z23.s, z23.s, z4.s\n"
+      "tbz %x[flags], #5, 29f\n"
+      "and z4.d, z16.d, z0.d\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "and z5.d, z17.d, z0.d\n"
+      "and z6.d, z18.d, z0.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "and z7.d, z19.d, z0.d\n"
+      "and z8.d, z20.d, z0.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "and z9.d, z21.d, z0.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "sqadd z16.s, z16.s, z4.s\n"
+      "and z10.d, z22.d, z0.d\n"
+      "asr z8.s, z8.s, #0x1f\n"
+      "and z4.d, z23.d, z0.d\n"
+      "asr z9.s, z9.s, #0x1f\n"
+      "sqadd z17.s, z17.s, z5.s\n"
+      "asr z10.s, z10.s, #0x1f\n"
+      "sqadd z18.s, z18.s, z6.s\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "sqadd z19.s, z19.s, z7.s\n"
+      "sqadd z20.s, z20.s, z8.s\n"
+      "sqadd z21.s, z21.s, z9.s\n"
+      "sqadd z22.s, z22.s, z10.s\n"
+      "sqadd z23.s, z23.s, z4.s\n"
+      "29:"  // Height 2: no shift correction
+      ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
+      "add x19, %x[qp], %[c_offset]\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      ".inst 0x44828811  // srshl z17.s, p2/M, z17.s, z0.s\n"
+      "add x19, %x[qp], %[minval]\n"
+      ".inst 0x44828812  // srshl z18.s, p2/M, z18.s, z0.s\n"
+      "ld1rw { z5.s }, p2/Z, [x19]\n"
+      "add x19, %x[qp], %[maxval]\n"
+      ".inst 0x44828813  // srshl z19.s, p2/M, z19.s, z0.s\n"
+      "ld1rw { z6.s }, p2/Z, [x19]\n"
+      ".inst 0x44828814  // srshl z20.s, p2/M, z20.s, z0.s\n"
+      "add z16.s, z16.s, z4.s\n"
+      "add z17.s, z17.s, z4.s\n"
+      "add z18.s, z18.s, z4.s\n"
+      "add z19.s, z19.s, z4.s\n"
+      "add z20.s, z20.s, z4.s\n"
+      "smin z16.s, p2/M, z16.s, z6.s\n"
+      "smin z17.s, p2/M, z17.s, z6.s\n"
+      "smin z18.s, p2/M, z18.s, z6.s\n"
+      "smin z19.s, p2/M, z19.s, z6.s\n"
+      "smax z16.s, p2/M, z16.s, z5.s\n"
+      "smax z17.s, p2/M, z17.s, z5.s\n"
+      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "smax z19.s, p2/M, z19.s, z5.s\n"
+      "smin z20.s, p2/M, z20.s, z6.s\n"
+      "uzp1 z16.h, z16.h, z17.h\n"
+      ".inst 0x44828815  // srshl z21.s, p2/M, z21.s, z0.s\n"
+      "uzp1 z17.h, z18.h, z19.h\n"
+      "smax z20.s, p2/M, z20.s, z5.s\n"
+      "uzp1 z16.b, z16.b, z17.b\n"
+      "st1b { z16.b }, p1, [x9]\n"
+      "add z21.s, z21.s, z4.s\n"
+      "addvl x9, x9, #1\n"
+      ".inst 0x44828816  // srshl z22.s, p2/M, z22.s, z0.s\n"
+      ".inst 0x44828817  // srshl z23.s, p2/M, z23.s, z0.s\n"
+      "smin z21.s, p2/M, z21.s, z6.s\n"
+      "add z22.s, z22.s, z4.s\n"
+      "add z23.s, z23.s, z4.s\n"
+      "smax z21.s, p2/M, z21.s, z5.s\n"
+      "smin z22.s, p2/M, z22.s, z6.s\n"
+      "smin z23.s, p2/M, z23.s, z6.s\n"
+      "uzp1 z20.h, z20.h, z21.h\n"
+      "smax z22.s, p2/M, z22.s, z5.s\n"
+      "smax z23.s, p2/M, z23.s, z5.s\n"
+      "uzp1 z21.h, z22.h, z23.h\n"
+      "uzp1 z20.b, z20.b, z21.b\n"
+      "st1b { z20.b }, p1, [x25]\n"
+      "addvl x25, x25, #1\n"
+      "30:"  // Height 2: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x12, x12, x19\n"
+      "bgt 18b\n"
+      "b 62f\n"
+      "31:"  // Height 3
+      "mov z11.s, #0x0\n"
+      "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x10, %x[col_bias]\n"
+      "mov z12.s, #0x0\n"
+      "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "bic %x[flags], %x[flags], #0x80000000\n"
+      "mov z13.s, #0x0\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "mov z14.s, #0x0\n"
+      "mov z15.b, #0x1\n"
+      "tbz %x[flags], #2, 32f\n"
+      "ldr x9, [%x[output_ptr], #0x0]\n"
+      "ldr x25, [%x[output_ptr], #0x8]\n"
+      "add x9, x9, x19\n"
+      "ldr x23, [%x[output_ptr], #0x10]\n"
+      "add x25, x25, x19\n"
+      "add x23, x23, x19\n"
+      "b 33f\n"
+      "32:"  // Height 3: setup direct output
+      "mov x9, %x[output_ptr]\n"
+      "add x25, x9, x19\n"
+      "add x23, x25, x19\n"
+      "33:"  // Height 3: Column loop
+      "mov z16.s, #0x0\n"
+      "mov x19, #0x0\n"
+      "mov z17.s, #0x0\n"
+      "whilelt p1.b, x19, x12\n"
+      "mov z18.s, #0x0\n"
+      "mov z19.s, #0x0\n"
+      "mov z20.s, #0x0\n"
+      "mov z21.s, #0x0\n"
+      "mov z22.s, #0x0\n"
+      "mov z23.s, #0x0\n"
+      "mov z24.s, #0x0\n"
+      "mov z25.s, #0x0\n"
+      "mov z26.s, #0x0\n"
+      "mov z27.s, #0x0\n"
+      "34:"  // Height 3: setup done
+      "mov x28, #0x0\n"
+      "35:"  // Height 3: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w27, [x20, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 36f\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x24, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
+      "cbnz x28, 37f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "add x22, x22, x19\n"
+      "b 37f\n"
+      "36:"  // Height 3: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "add x24, x26, x19\n"
+      "add x22, x24, x19\n"
+      "37:"  // Height 3: input setup done
+      "cmp x27, #0x10\n"
+      "ble 40f\n"
+      "38:"  // Height 3: Multiply loop: Main loop head
+      "ld1b { z4.b }, p2/Z, [x11]\n"
+      "whilelt p0.b, XZR, x27\n"
+      "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "ld1rqb { z0.b }, p0/Z, [x26]\n"
+      "udot z16.s, z4.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "udot z17.s, z5.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "udot z20.s, z4.b, z1.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "add x22, x22, #0x10\n"
+      "udot z24.s, z4.b, z2.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "udot z21.s, z5.b, z1.b[0]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+      "udot z25.s, z5.b, z2.b[0]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+      "udot z18.s, z6.b, z0.b[0]\n"
+      "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+      "udot z22.s, z6.b, z1.b[0]\n"
+      "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
+      "addvl x11, x11, #16\n"
+      "udot z26.s, z6.b, z2.b[0]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+      "udot z19.s, z7.b, z0.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+      "udot z23.s, z7.b, z1.b[0]\n"
+      "udot z27.s, z7.b, z2.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+      "udot z16.s, z8.b, z0.b[1]\n"
+      "udot z20.s, z8.b, z1.b[1]\n"
+      "udot z24.s, z8.b, z2.b[1]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+      "udot z17.s, z9.b, z0.b[1]\n"
+      "udot z21.s, z9.b, z1.b[1]\n"
+      "udot z25.s, z9.b, z2.b[1]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+      "udot z18.s, z10.b, z0.b[1]\n"
+      "udot z22.s, z10.b, z1.b[1]\n"
+      "udot z26.s, z10.b, z2.b[1]\n"
+      "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+      "udot z19.s, z4.b, z0.b[1]\n"
+      "udot z23.s, z4.b, z1.b[1]\n"
+      "udot z27.s, z4.b, z2.b[1]\n"
+      "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+      "udot z16.s, z5.b, z0.b[2]\n"
+      "udot z20.s, z5.b, z1.b[2]\n"
+      "udot z24.s, z5.b, z2.b[2]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+      "udot z17.s, z6.b, z0.b[2]\n"
+      "udot z21.s, z6.b, z1.b[2]\n"
+      "udot z25.s, z6.b, z2.b[2]\n"
+      "udot z18.s, z7.b, z0.b[2]\n"
+      "udot z22.s, z7.b, z1.b[2]\n"
+      "udot z26.s, z7.b, z2.b[2]\n"
+      "udot z19.s, z8.b, z0.b[2]\n"
+      "udot z23.s, z8.b, z1.b[2]\n"
+      "udot z27.s, z8.b, z2.b[2]\n"
+      "udot z16.s, z9.b, z0.b[3]\n"
+      "udot z20.s, z9.b, z1.b[3]\n"
+      "udot z24.s, z9.b, z2.b[3]\n"
+      "udot z17.s, z10.b, z0.b[3]\n"
+      "udot z21.s, z10.b, z1.b[3]\n"
+      "udot z25.s, z10.b, z2.b[3]\n"
+      "udot z18.s, z4.b, z0.b[3]\n"
+      "udot z22.s, z4.b, z1.b[3]\n"
+      "udot z26.s, z4.b, z2.b[3]\n"
+      "udot z19.s, z5.b, z0.b[3]\n"
+      "udot z23.s, z5.b, z1.b[3]\n"
+      "udot z27.s, z5.b, z2.b[3]\n"
+      "tbnz %x[flags], #31, 39f\n"
+      "udot z11.s, z0.b, z15.b\n"
+      "udot z12.s, z1.b, z15.b\n"
+      "udot z13.s, z2.b, z15.b\n"
+      "39:"  // Height 3: Multiply loop: unique 5: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "sub x27, x27, #0x10\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "cmp x27, #0x10\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "bgt 38b\n"
+      "40:"  // Height 3: Multiply loop: Single iteration only
+      "ld1b { z6.b }, p2/Z, [x11]\n"
+      "whilelt p0.b, XZR, x27\n"
+      "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "subs x27, x27, #0x4\n"
+      "ld1rqb { z0.b }, p0/Z, [x26]\n"
+      "udot z16.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "udot z17.s, z7.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "udot z20.s, z6.b, z1.b[0]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "add x22, x22, #0x10\n"
+      "udot z24.s, z6.b, z2.b[0]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "addvl x11, x11, #4\n"
+      "udot z21.s, z7.b, z1.b[0]\n"
+      "udot z25.s, z7.b, z2.b[0]\n"
+      "udot z18.s, z8.b, z0.b[0]\n"
+      "udot z22.s, z8.b, z1.b[0]\n"
+      "udot z26.s, z8.b, z2.b[0]\n"
+      "udot z19.s, z9.b, z0.b[0]\n"
+      "udot z23.s, z9.b, z1.b[0]\n"
+      "udot z27.s, z9.b, z2.b[0]\n"
+      "ble 41f\n"
+      "ld1b { z10.b }, p2/Z, [x11]\n"
+      "udot z16.s, z10.b, z0.b[1]\n"
+      "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "subs x27, x27, #0x4\n"
+      "udot z20.s, z10.b, z1.b[1]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "udot z24.s, z10.b, z2.b[1]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "addvl x11, x11, #4\n"
+      "udot z17.s, z4.b, z0.b[1]\n"
+      "udot z21.s, z4.b, z1.b[1]\n"
+      "udot z25.s, z4.b, z2.b[1]\n"
+      "udot z18.s, z5.b, z0.b[1]\n"
+      "udot z22.s, z5.b, z1.b[1]\n"
+      "udot z26.s, z5.b, z2.b[1]\n"
+      "udot z19.s, z6.b, z0.b[1]\n"
+      "udot z23.s, z6.b, z1.b[1]\n"
+      "udot z27.s, z6.b, z2.b[1]\n"
+      "ble 41f\n"
+      "ld1b { z7.b }, p2/Z, [x11]\n"
+      "udot z16.s, z7.b, z0.b[2]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "subs x27, x27, #0x4\n"
+      "udot z20.s, z7.b, z1.b[2]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "udot z24.s, z7.b, z2.b[2]\n"
+      "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "addvl x11, x11, #4\n"
+      "udot z17.s, z8.b, z0.b[2]\n"
+      "udot z21.s, z8.b, z1.b[2]\n"
+      "udot z25.s, z8.b, z2.b[2]\n"
+      "udot z18.s, z9.b, z0.b[2]\n"
+      "udot z22.s, z9.b, z1.b[2]\n"
+      "udot z26.s, z9.b, z2.b[2]\n"
+      "udot z19.s, z10.b, z0.b[2]\n"
+      "udot z23.s, z10.b, z1.b[2]\n"
+      "udot z27.s, z10.b, z2.b[2]\n"
+      "ble 41f\n"
+      "ld1b { z4.b }, p2/Z, [x11]\n"
+      "udot z16.s, z4.b, z0.b[3]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "udot z20.s, z4.b, z1.b[3]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "udot z24.s, z4.b, z2.b[3]\n"
+      "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "addvl x11, x11, #4\n"
+      "udot z17.s, z5.b, z0.b[3]\n"
+      "udot z21.s, z5.b, z1.b[3]\n"
+      "udot z25.s, z5.b, z2.b[3]\n"
+      "udot z18.s, z6.b, z0.b[3]\n"
+      "udot z22.s, z6.b, z1.b[3]\n"
+      "udot z26.s, z6.b, z2.b[3]\n"
+      "udot z19.s, z7.b, z0.b[3]\n"
+      "udot z23.s, z7.b, z1.b[3]\n"
+      "udot z27.s, z7.b, z2.b[3]\n"
+      "41:"  // Height 3: Multiply loop: multiply skip
+      "tbnz %x[flags], #31, 42f\n"
+      "udot z11.s, z0.b, z15.b\n"
+      "udot z12.s, z1.b, z15.b\n"
+      "udot z13.s, z2.b, z15.b\n"
+      "42:"  // Height 3: Multiply loop: unique 6: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "add x28, x28, #0x1\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x28, x19\n"
+      "bne 35b\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "tbnz %x[flags], #31, 43f\n"
+      "add x19, %x[qp], %[b_offset]\n"
+      "ld1rw { z3.s }, p2/Z, [x19]\n"
+      "neg z3.s, p2/M, z3.s\n"
+      "mov x20, #0x4\n"
+      "mov x19, #0x4\n"
+      "whilelt p0.s, XZR, x20\n"
+      "uaddv d11, p0, z11.s\n"
+      "whilelt p0.s, XZR, x19\n"
+      "uaddv d12, p0, z12.s\n"
+      "mov x19, #0x4\n"
+      "mov z11.s, z11.s[0]\n"
+      "whilelt p0.s, XZR, x19\n"
+      "mov z12.s, z12.s[0]\n"
+      "uaddv d13, p0, z13.s\n"
+      "mul z11.s, p2/M, z11.s, z3.s\n"
+      "mul z12.s, p2/M, z12.s, z3.s\n"
+      "mov z13.s, z13.s[0]\n"
+      "mul z13.s, p2/M, z13.s, z3.s\n"
+      "43:"  // Height 3: skip row sum fixup
+      "add z16.s, z16.s, z11.s\n"
+      "ld1w { z0.s }, p2/Z, [x10]\n"
+      "orr %x[flags], %x[flags], #0x80000000\n"
+      "add z17.s, z17.s, z11.s\n"
+      "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "add z18.s, z18.s, z11.s\n"
+      "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+      "add x19, %x[qp], %[per_layer_mul]\n"
+      "add z19.s, z19.s, z11.s\n"
+      "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+      "addvl x10, x10, #4\n"
+      "add z20.s, z20.s, z12.s\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      "add z21.s, z21.s, z12.s\n"
+      "add z22.s, z22.s, z12.s\n"
+      "add z23.s, z23.s, z12.s\n"
+      "add z24.s, z24.s, z13.s\n"
+      "add z25.s, z25.s, z13.s\n"
+      "add z26.s, z26.s, z13.s\n"
+      "add z27.s, z27.s, z13.s\n"
+      "add z16.s, z16.s, z0.s\n"
+      "add z17.s, z17.s, z1.s\n"
+      "add z18.s, z18.s, z2.s\n"
+      "add z19.s, z19.s, z3.s\n"
+      "add z20.s, z20.s, z0.s\n"
+      "add z21.s, z21.s, z1.s\n"
+      "add z22.s, z22.s, z2.s\n"
+      "add z23.s, z23.s, z3.s\n"
+      "add z24.s, z24.s, z0.s\n"
+      "ld1rw { z0.s }, p2/Z, [x20]\n"
+      "add z25.s, z25.s, z1.s\n"
+      "add z26.s, z26.s, z2.s\n"
+      "add z27.s, z27.s, z3.s\n"
+      ".inst 0x04a47610  // sqrdmulh z16.s, z16.s, z4.s\n"
+      ".inst 0x04a47631  // sqrdmulh z17.s, z17.s, z4.s\n"
+      ".inst 0x04a47652  // sqrdmulh z18.s, z18.s, z4.s\n"
+      ".inst 0x04a47673  // sqrdmulh z19.s, z19.s, z4.s\n"
+      ".inst 0x04a47694  // sqrdmulh z20.s, z20.s, z4.s\n"
+      ".inst 0x04a476b5  // sqrdmulh z21.s, z21.s, z4.s\n"
+      ".inst 0x04a476d6  // sqrdmulh z22.s, z22.s, z4.s\n"
+      ".inst 0x04a476f7  // sqrdmulh z23.s, z23.s, z4.s\n"
+      ".inst 0x04a47718  // sqrdmulh z24.s, z24.s, z4.s\n"
+      ".inst 0x04a47739  // sqrdmulh z25.s, z25.s, z4.s\n"
+      ".inst 0x04a4775a  // sqrdmulh z26.s, z26.s, z4.s\n"
+      ".inst 0x04a4777b  // sqrdmulh z27.s, z27.s, z4.s\n"
+      "tbz %x[flags], #5, 44f\n"
+      "and z4.d, z16.d, z0.d\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "and z5.d, z17.d, z0.d\n"
+      "and z6.d, z18.d, z0.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "and z7.d, z19.d, z0.d\n"
+      "and z8.d, z20.d, z0.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "and z9.d, z21.d, z0.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "sqadd z16.s, z16.s, z4.s\n"
+      "and z10.d, z22.d, z0.d\n"
+      "asr z8.s, z8.s, #0x1f\n"
+      "and z4.d, z23.d, z0.d\n"
+      "asr z9.s, z9.s, #0x1f\n"
+      "sqadd z17.s, z17.s, z5.s\n"
+      "asr z10.s, z10.s, #0x1f\n"
+      "sqadd z18.s, z18.s, z6.s\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "and z5.d, z24.d, z0.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "sqadd z19.s, z19.s, z7.s\n"
+      "sqadd z20.s, z20.s, z8.s\n"
+      "sqadd z21.s, z21.s, z9.s\n"
+      "sqadd z22.s, z22.s, z10.s\n"
+      "sqadd z23.s, z23.s, z4.s\n"
+      "and z6.d, z25.d, z0.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z24.s, z24.s, z5.s\n"
+      "and z7.d, z26.d, z0.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "and z8.d, z27.d, z0.d\n"
+      "sqadd z25.s, z25.s, z6.s\n"
+      "asr z8.s, z8.s, #0x1f\n"
+      "sqadd z26.s, z26.s, z7.s\n"
+      "sqadd z27.s, z27.s, z8.s\n"
+      "44:"  // Height 3: no shift correction
+      ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
+      "add x19, %x[qp], %[c_offset]\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      ".inst 0x44828811  // srshl z17.s, p2/M, z17.s, z0.s\n"
+      "add x19, %x[qp], %[minval]\n"
+      ".inst 0x44828812  // srshl z18.s, p2/M, z18.s, z0.s\n"
+      "ld1rw { z5.s }, p2/Z, [x19]\n"
+      "add x19, %x[qp], %[maxval]\n"
+      ".inst 0x44828813  // srshl z19.s, p2/M, z19.s, z0.s\n"
+      "ld1rw { z6.s }, p2/Z, [x19]\n"
+      ".inst 0x44828814  // srshl z20.s, p2/M, z20.s, z0.s\n"
+      "add z16.s, z16.s, z4.s\n"
+      "add z17.s, z17.s, z4.s\n"
+      "add z18.s, z18.s, z4.s\n"
+      "add z19.s, z19.s, z4.s\n"
+      "add z20.s, z20.s, z4.s\n"
+      "smin z16.s, p2/M, z16.s, z6.s\n"
+      "smin z17.s, p2/M, z17.s, z6.s\n"
+      "smin z18.s, p2/M, z18.s, z6.s\n"
+      "smin z19.s, p2/M, z19.s, z6.s\n"
+      "smax z16.s, p2/M, z16.s, z5.s\n"
+      "smax z17.s, p2/M, z17.s, z5.s\n"
+      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "smax z19.s, p2/M, z19.s, z5.s\n"
+      "smin z20.s, p2/M, z20.s, z6.s\n"
+      "uzp1 z16.h, z16.h, z17.h\n"
+      ".inst 0x44828815  // srshl z21.s, p2/M, z21.s, z0.s\n"
+      "uzp1 z17.h, z18.h, z19.h\n"
+      "smax z20.s, p2/M, z20.s, z5.s\n"
+      "uzp1 z16.b, z16.b, z17.b\n"
+      "st1b { z16.b }, p1, [x9]\n"
+      "add z21.s, z21.s, z4.s\n"
+      "addvl x9, x9, #1\n"
+      ".inst 0x44828816  // srshl z22.s, p2/M, z22.s, z0.s\n"
+      ".inst 0x44828817  // srshl z23.s, p2/M, z23.s, z0.s\n"
+      ".inst 0x44828818  // srshl z24.s, p2/M, z24.s, z0.s\n"
+      "smin z21.s, p2/M, z21.s, z6.s\n"
+      ".inst 0x44828819  // srshl z25.s, p2/M, z25.s, z0.s\n"
+      "add z22.s, z22.s, z4.s\n"
+      "add z23.s, z23.s, z4.s\n"
+      "add z24.s, z24.s, z4.s\n"
+      "add z25.s, z25.s, z4.s\n"
+      "smax z21.s, p2/M, z21.s, z5.s\n"
+      "smin z22.s, p2/M, z22.s, z6.s\n"
+      "smin z23.s, p2/M, z23.s, z6.s\n"
+      "smin z24.s, p2/M, z24.s, z6.s\n"
+      "uzp1 z20.h, z20.h, z21.h\n"
+      "smax z22.s, p2/M, z22.s, z5.s\n"
+      "smax z23.s, p2/M, z23.s, z5.s\n"
+      "smax z24.s, p2/M, z24.s, z5.s\n"
+      "smin z25.s, p2/M, z25.s, z6.s\n"
+      ".inst 0x4482881a  // srshl z26.s, p2/M, z26.s, z0.s\n"
+      "uzp1 z21.h, z22.h, z23.h\n"
+      ".inst 0x4482881b  // srshl z27.s, p2/M, z27.s, z0.s\n"
+      "uzp1 z20.b, z20.b, z21.b\n"
+      "st1b { z20.b }, p1, [x25]\n"
+      "add z26.s, z26.s, z4.s\n"
+      "addvl x25, x25, #1\n"
+      "add z27.s, z27.s, z4.s\n"
+      "smax z25.s, p2/M, z25.s, z5.s\n"
+      "smin z26.s, p2/M, z26.s, z6.s\n"
+      "smin z27.s, p2/M, z27.s, z6.s\n"
+      "uzp1 z24.h, z24.h, z25.h\n"
+      "smax z26.s, p2/M, z26.s, z5.s\n"
+      "smax z27.s, p2/M, z27.s, z5.s\n"
+      "uzp1 z25.h, z26.h, z27.h\n"
+      "uzp1 z24.b, z24.b, z25.b\n"
+      "st1b { z24.b }, p1, [x23]\n"
+      "addvl x23, x23, #1\n"
+      "45:"  // Height 3: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x12, x12, x19\n"
+      "bgt 33b\n"
+      "b 62f\n"
+      "46:"  // Height 4
+      "mov z11.s, #0x0\n"
+      "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov x10, %x[col_bias]\n"
+      "mov z12.s, #0x0\n"
+      "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "bic %x[flags], %x[flags], #0x80000000\n"
+      "mov z13.s, #0x0\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "mov z14.s, #0x0\n"
+      "mov z15.b, #0x1\n"
+      "tbz %x[flags], #2, 47f\n"
+      "ldr x9, [%x[output_ptr], #0x0]\n"
+      "ldr x25, [%x[output_ptr], #0x8]\n"
+      "add x9, x9, x19\n"
+      "ldr x23, [%x[output_ptr], #0x10]\n"
+      "ldr x21, [%x[output_ptr], #0x18]\n"
+      "add x25, x25, x19\n"
+      "add %x[output_ptr], %x[output_ptr], #0x20\n"
+      "add x23, x23, x19\n"
+      "add x21, x21, x19\n"
+      "b 48f\n"
+      "47:"  // Height 4: setup direct output
+      "mov x9, %x[output_ptr]\n"
+      "add x25, x9, x19\n"
+      "add x23, x25, x19\n"
+      "add x21, x23, x19\n"
+      "add %x[output_ptr], x21, x19\n"
+      "48:"  // Height 4: Column loop
+      "mov z16.s, #0x0\n"
+      "mov x19, #0x0\n"
+      "mov z17.s, #0x0\n"
+      "whilelt p1.b, x19, x12\n"
+      "mov z18.s, #0x0\n"
+      "mov z19.s, #0x0\n"
+      "mov z20.s, #0x0\n"
+      "mov z21.s, #0x0\n"
+      "mov z22.s, #0x0\n"
+      "mov z23.s, #0x0\n"
+      "mov z24.s, #0x0\n"
+      "mov z25.s, #0x0\n"
+      "mov z26.s, #0x0\n"
+      "mov z27.s, #0x0\n"
+      "mov z28.s, #0x0\n"
+      "mov z29.s, #0x0\n"
+      "mov z30.s, #0x0\n"
+      "mov z31.s, #0x0\n"
+      "49:"  // Height 4: setup done
+      "mov x28, #0x0\n"
+      "50:"  // Height 4: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w27, [x20, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 51f\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "ldr x24, [x20, #0x8]\n"
+      "ldr x22, [x20, #0x10]\n"
+      "ldr x20, [x20, #0x18]\n"
+      "cbnz x28, 52f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "add x22, x22, x19\n"
+      "add x20, x20, x19\n"
+      "b 52f\n"
+      "51:"  // Height 4: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "add x24, x26, x19\n"
+      "add x22, x24, x19\n"
+      "add x20, x22, x19\n"
+      "52:"  // Height 4: input setup done
+      "cmp x27, #0x10\n"
+      "ble 55f\n"
+      "53:"  // Height 4: Multiply loop: Main loop head
+      "ld1b { z4.b }, p2/Z, [x11]\n"
+      "whilelt p0.b, XZR, x27\n"
+      "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "ld1rqb { z0.b }, p0/Z, [x26]\n"
+      "udot z16.s, z4.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "udot z17.s, z5.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "udot z20.s, z4.b, z1.b[0]\n"
+      "ld1rqb { z3.b }, p0/Z, [x20]\n"
+      "add x22, x22, #0x10\n"
+      "udot z24.s, z4.b, z2.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "add x20, x20, #0x10\n"
+      "udot z21.s, z5.b, z1.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "udot z25.s, z5.b, z2.b[0]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+      "udot z28.s, z4.b, z3.b[0]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+      "udot z29.s, z5.b, z3.b[0]\n"
+      "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+      "udot z18.s, z6.b, z0.b[0]\n"
+      "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
+      "addvl x11, x11, #16\n"
+      "udot z22.s, z6.b, z1.b[0]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+      "udot z26.s, z6.b, z2.b[0]\n"
+      "udot z30.s, z6.b, z3.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+      "udot z19.s, z7.b, z0.b[0]\n"
+      "udot z23.s, z7.b, z1.b[0]\n"
+      "udot z27.s, z7.b, z2.b[0]\n"
+      "udot z31.s, z7.b, z3.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+      "udot z16.s, z8.b, z0.b[1]\n"
+      "udot z20.s, z8.b, z1.b[1]\n"
+      "udot z24.s, z8.b, z2.b[1]\n"
+      "udot z28.s, z8.b, z3.b[1]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+      "udot z17.s, z9.b, z0.b[1]\n"
+      "udot z21.s, z9.b, z1.b[1]\n"
+      "udot z25.s, z9.b, z2.b[1]\n"
+      "udot z29.s, z9.b, z3.b[1]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+      "udot z18.s, z10.b, z0.b[1]\n"
+      "udot z22.s, z10.b, z1.b[1]\n"
+      "udot z26.s, z10.b, z2.b[1]\n"
+      "udot z30.s, z10.b, z3.b[1]\n"
+      "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+      "udot z19.s, z4.b, z0.b[1]\n"
+      "udot z23.s, z4.b, z1.b[1]\n"
+      "udot z27.s, z4.b, z2.b[1]\n"
+      "udot z31.s, z4.b, z3.b[1]\n"
+      "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+      "udot z16.s, z5.b, z0.b[2]\n"
+      "udot z20.s, z5.b, z1.b[2]\n"
+      "udot z24.s, z5.b, z2.b[2]\n"
+      "udot z28.s, z5.b, z3.b[2]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+      "udot z17.s, z6.b, z0.b[2]\n"
+      "udot z21.s, z6.b, z1.b[2]\n"
+      "udot z25.s, z6.b, z2.b[2]\n"
+      "udot z29.s, z6.b, z3.b[2]\n"
+      "udot z18.s, z7.b, z0.b[2]\n"
+      "udot z22.s, z7.b, z1.b[2]\n"
+      "udot z26.s, z7.b, z2.b[2]\n"
+      "udot z30.s, z7.b, z3.b[2]\n"
+      "udot z19.s, z8.b, z0.b[2]\n"
+      "udot z23.s, z8.b, z1.b[2]\n"
+      "udot z27.s, z8.b, z2.b[2]\n"
+      "udot z31.s, z8.b, z3.b[2]\n"
+      "udot z16.s, z9.b, z0.b[3]\n"
+      "udot z20.s, z9.b, z1.b[3]\n"
+      "udot z24.s, z9.b, z2.b[3]\n"
+      "udot z28.s, z9.b, z3.b[3]\n"
+      "udot z17.s, z10.b, z0.b[3]\n"
+      "udot z21.s, z10.b, z1.b[3]\n"
+      "udot z25.s, z10.b, z2.b[3]\n"
+      "udot z29.s, z10.b, z3.b[3]\n"
+      "udot z18.s, z4.b, z0.b[3]\n"
+      "udot z22.s, z4.b, z1.b[3]\n"
+      "udot z26.s, z4.b, z2.b[3]\n"
+      "udot z30.s, z4.b, z3.b[3]\n"
+      "udot z19.s, z5.b, z0.b[3]\n"
+      "udot z23.s, z5.b, z1.b[3]\n"
+      "udot z27.s, z5.b, z2.b[3]\n"
+      "udot z31.s, z5.b, z3.b[3]\n"
+      "tbnz %x[flags], #31, 54f\n"
+      "udot z11.s, z0.b, z15.b\n"
+      "udot z12.s, z1.b, z15.b\n"
+      "udot z13.s, z2.b, z15.b\n"
+      "udot z14.s, z3.b, z15.b\n"
+      "54:"  // Height 4: Multiply loop: unique 7: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "sub x27, x27, #0x10\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "cmp x27, #0x10\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "bgt 53b\n"
+      "55:"  // Height 4: Multiply loop: Single iteration only
+      "ld1b { z6.b }, p2/Z, [x11]\n"
+      "whilelt p0.b, XZR, x27\n"
+      "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "subs x27, x27, #0x4\n"
+      "ld1rqb { z0.b }, p0/Z, [x26]\n"
+      "udot z16.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "udot z17.s, z7.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "udot z20.s, z6.b, z1.b[0]\n"
+      "ld1rqb { z3.b }, p0/Z, [x20]\n"
+      "add x22, x22, #0x10\n"
+      "udot z24.s, z6.b, z2.b[0]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "add x20, x20, #0x10\n"
+      "udot z21.s, z7.b, z1.b[0]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "addvl x11, x11, #4\n"
+      "udot z28.s, z6.b, z3.b[0]\n"
+      "udot z25.s, z7.b, z2.b[0]\n"
+      "udot z29.s, z7.b, z3.b[0]\n"
+      "udot z18.s, z8.b, z0.b[0]\n"
+      "udot z22.s, z8.b, z1.b[0]\n"
+      "udot z26.s, z8.b, z2.b[0]\n"
+      "udot z30.s, z8.b, z3.b[0]\n"
+      "udot z19.s, z9.b, z0.b[0]\n"
+      "udot z23.s, z9.b, z1.b[0]\n"
+      "udot z27.s, z9.b, z2.b[0]\n"
+      "udot z31.s, z9.b, z3.b[0]\n"
+      "ble 56f\n"
+      "ld1b { z10.b }, p2/Z, [x11]\n"
+      "udot z16.s, z10.b, z0.b[1]\n"
+      "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "subs x27, x27, #0x4\n"
+      "udot z20.s, z10.b, z1.b[1]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "udot z24.s, z10.b, z2.b[1]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "addvl x11, x11, #4\n"
+      "udot z28.s, z10.b, z3.b[1]\n"
+      "udot z17.s, z4.b, z0.b[1]\n"
+      "udot z21.s, z4.b, z1.b[1]\n"
+      "udot z25.s, z4.b, z2.b[1]\n"
+      "udot z29.s, z4.b, z3.b[1]\n"
+      "udot z18.s, z5.b, z0.b[1]\n"
+      "udot z22.s, z5.b, z1.b[1]\n"
+      "udot z26.s, z5.b, z2.b[1]\n"
+      "udot z30.s, z5.b, z3.b[1]\n"
+      "udot z19.s, z6.b, z0.b[1]\n"
+      "udot z23.s, z6.b, z1.b[1]\n"
+      "udot z27.s, z6.b, z2.b[1]\n"
+      "udot z31.s, z6.b, z3.b[1]\n"
+      "ble 56f\n"
+      "ld1b { z7.b }, p2/Z, [x11]\n"
+      "udot z16.s, z7.b, z0.b[2]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "subs x27, x27, #0x4\n"
+      "udot z20.s, z7.b, z1.b[2]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "udot z24.s, z7.b, z2.b[2]\n"
+      "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "addvl x11, x11, #4\n"
+      "udot z28.s, z7.b, z3.b[2]\n"
+      "udot z17.s, z8.b, z0.b[2]\n"
+      "udot z21.s, z8.b, z1.b[2]\n"
+      "udot z25.s, z8.b, z2.b[2]\n"
+      "udot z29.s, z8.b, z3.b[2]\n"
+      "udot z18.s, z9.b, z0.b[2]\n"
+      "udot z22.s, z9.b, z1.b[2]\n"
+      "udot z26.s, z9.b, z2.b[2]\n"
+      "udot z30.s, z9.b, z3.b[2]\n"
+      "udot z19.s, z10.b, z0.b[2]\n"
+      "udot z23.s, z10.b, z1.b[2]\n"
+      "udot z27.s, z10.b, z2.b[2]\n"
+      "udot z31.s, z10.b, z3.b[2]\n"
+      "ble 56f\n"
+      "ld1b { z4.b }, p2/Z, [x11]\n"
+      "udot z16.s, z4.b, z0.b[3]\n"
+      "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "udot z20.s, z4.b, z1.b[3]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "udot z24.s, z4.b, z2.b[3]\n"
+      "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "addvl x11, x11, #4\n"
+      "udot z28.s, z4.b, z3.b[3]\n"
+      "udot z17.s, z5.b, z0.b[3]\n"
+      "udot z21.s, z5.b, z1.b[3]\n"
+      "udot z25.s, z5.b, z2.b[3]\n"
+      "udot z29.s, z5.b, z3.b[3]\n"
+      "udot z18.s, z6.b, z0.b[3]\n"
+      "udot z22.s, z6.b, z1.b[3]\n"
+      "udot z26.s, z6.b, z2.b[3]\n"
+      "udot z30.s, z6.b, z3.b[3]\n"
+      "udot z19.s, z7.b, z0.b[3]\n"
+      "udot z23.s, z7.b, z1.b[3]\n"
+      "udot z27.s, z7.b, z2.b[3]\n"
+      "udot z31.s, z7.b, z3.b[3]\n"
+      "56:"  // Height 4: Multiply loop: multiply skip
+      "tbnz %x[flags], #31, 57f\n"
+      "udot z11.s, z0.b, z15.b\n"
+      "udot z12.s, z1.b, z15.b\n"
+      "udot z13.s, z2.b, z15.b\n"
+      "udot z14.s, z3.b, z15.b\n"
+      "57:"  // Height 4: Multiply loop: unique 8: skip row sum
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "add x28, x28, #0x1\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x28, x19\n"
+      "bne 50b\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "prfm pstl1keep, [x21, #0x0]\n"
+      "tbnz %x[flags], #31, 58f\n"
+      "add x19, %x[qp], %[b_offset]\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      "neg z4.s, p2/M, z4.s\n"
+      "mov x20, #0x4\n"
+      "mov x19, #0x4\n"
+      "whilelt p0.s, XZR, x20\n"
+      "uaddv d11, p0, z11.s\n"
+      "whilelt p0.s, XZR, x19\n"
+      "uaddv d12, p0, z12.s\n"
+      "mov x19, #0x4\n"
+      "mov z11.s, z11.s[0]\n"
+      "whilelt p0.s, XZR, x19\n"
+      "mov x19, #0x4\n"
+      "mov z12.s, z12.s[0]\n"
+      "uaddv d13, p0, z13.s\n"
+      "whilelt p0.s, XZR, x19\n"
+      "mul z11.s, p2/M, z11.s, z4.s\n"
+      "uaddv d14, p0, z14.s\n"
+      "mul z12.s, p2/M, z12.s, z4.s\n"
+      "mov z13.s, z13.s[0]\n"
+      "mul z13.s, p2/M, z13.s, z4.s\n"
+      "mov z14.s, z14.s[0]\n"
+      "mul z14.s, p2/M, z14.s, z4.s\n"
+      "58:"  // Height 4: skip row sum fixup
+      "add z16.s, z16.s, z11.s\n"
+      "ld1w { z0.s }, p2/Z, [x10]\n"
+      "orr %x[flags], %x[flags], #0x80000000\n"
+      "add z17.s, z17.s, z11.s\n"
+      "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+      "add x20, %x[qp], %[per_layer_right_shift]\n"
+      "add z18.s, z18.s, z11.s\n"
+      "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+      "add x19, %x[qp], %[per_layer_mul]\n"
+      "add z19.s, z19.s, z11.s\n"
+      "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+      "addvl x10, x10, #4\n"
+      "add z20.s, z20.s, z12.s\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      "add z21.s, z21.s, z12.s\n"
+      "add z22.s, z22.s, z12.s\n"
+      "add z23.s, z23.s, z12.s\n"
+      "add z24.s, z24.s, z13.s\n"
+      "add z25.s, z25.s, z13.s\n"
+      "add z26.s, z26.s, z13.s\n"
+      "add z27.s, z27.s, z13.s\n"
+      "add z28.s, z28.s, z14.s\n"
+      "add z29.s, z29.s, z14.s\n"
+      "add z30.s, z30.s, z14.s\n"
+      "add z31.s, z31.s, z14.s\n"
+      "add z16.s, z16.s, z0.s\n"
+      "add z17.s, z17.s, z1.s\n"
+      "add z18.s, z18.s, z2.s\n"
+      "add z19.s, z19.s, z3.s\n"
+      "add z20.s, z20.s, z0.s\n"
+      "add z21.s, z21.s, z1.s\n"
+      "add z22.s, z22.s, z2.s\n"
+      "add z23.s, z23.s, z3.s\n"
+      "add z24.s, z24.s, z0.s\n"
+      "add z25.s, z25.s, z1.s\n"
+      "add z26.s, z26.s, z2.s\n"
+      "add z27.s, z27.s, z3.s\n"
+      "add z28.s, z28.s, z0.s\n"
+      "ld1rw { z0.s }, p2/Z, [x20]\n"
+      "add z29.s, z29.s, z1.s\n"
+      "add z30.s, z30.s, z2.s\n"
+      "add z31.s, z31.s, z3.s\n"
+      ".inst 0x04a47610  // sqrdmulh z16.s, z16.s, z4.s\n"
+      ".inst 0x04a47631  // sqrdmulh z17.s, z17.s, z4.s\n"
+      ".inst 0x04a47652  // sqrdmulh z18.s, z18.s, z4.s\n"
+      ".inst 0x04a47673  // sqrdmulh z19.s, z19.s, z4.s\n"
+      ".inst 0x04a47694  // sqrdmulh z20.s, z20.s, z4.s\n"
+      ".inst 0x04a476b5  // sqrdmulh z21.s, z21.s, z4.s\n"
+      ".inst 0x04a476d6  // sqrdmulh z22.s, z22.s, z4.s\n"
+      ".inst 0x04a476f7  // sqrdmulh z23.s, z23.s, z4.s\n"
+      ".inst 0x04a47718  // sqrdmulh z24.s, z24.s, z4.s\n"
+      ".inst 0x04a47739  // sqrdmulh z25.s, z25.s, z4.s\n"
+      ".inst 0x04a4775a  // sqrdmulh z26.s, z26.s, z4.s\n"
+      ".inst 0x04a4777b  // sqrdmulh z27.s, z27.s, z4.s\n"
+      ".inst 0x04a4779c  // sqrdmulh z28.s, z28.s, z4.s\n"
+      ".inst 0x04a477bd  // sqrdmulh z29.s, z29.s, z4.s\n"
+      ".inst 0x04a477de  // sqrdmulh z30.s, z30.s, z4.s\n"
+      ".inst 0x04a477ff  // sqrdmulh z31.s, z31.s, z4.s\n"
+      "tbz %x[flags], #5, 59f\n"
+      "and z4.d, z16.d, z0.d\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "and z5.d, z17.d, z0.d\n"
+      "and z6.d, z18.d, z0.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "and z7.d, z19.d, z0.d\n"
+      "and z8.d, z20.d, z0.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "and z9.d, z21.d, z0.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "sqadd z16.s, z16.s, z4.s\n"
+      "and z10.d, z22.d, z0.d\n"
+      "asr z8.s, z8.s, #0x1f\n"
+      "and z4.d, z23.d, z0.d\n"
+      "asr z9.s, z9.s, #0x1f\n"
+      "sqadd z17.s, z17.s, z5.s\n"
+      "asr z10.s, z10.s, #0x1f\n"
+      "sqadd z18.s, z18.s, z6.s\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "and z5.d, z24.d, z0.d\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "sqadd z19.s, z19.s, z7.s\n"
+      "sqadd z20.s, z20.s, z8.s\n"
+      "sqadd z21.s, z21.s, z9.s\n"
+      "sqadd z22.s, z22.s, z10.s\n"
+      "sqadd z23.s, z23.s, z4.s\n"
+      "and z6.d, z25.d, z0.d\n"
+      "asr z6.s, z6.s, #0x1f\n"
+      "sqadd z24.s, z24.s, z5.s\n"
+      "and z7.d, z26.d, z0.d\n"
+      "asr z7.s, z7.s, #0x1f\n"
+      "and z8.d, z27.d, z0.d\n"
+      "and z9.d, z28.d, z0.d\n"
+      "asr z8.s, z8.s, #0x1f\n"
+      "sqadd z25.s, z25.s, z6.s\n"
+      "and z10.d, z29.d, z0.d\n"
+      "asr z9.s, z9.s, #0x1f\n"
+      "and z4.d, z30.d, z0.d\n"
+      "asr z10.s, z10.s, #0x1f\n"
+      "sqadd z26.s, z26.s, z7.s\n"
+      "and z5.d, z31.d, z0.d\n"
+      "asr z4.s, z4.s, #0x1f\n"
+      "sqadd z27.s, z27.s, z8.s\n"
+      "asr z5.s, z5.s, #0x1f\n"
+      "sqadd z28.s, z28.s, z9.s\n"
+      "sqadd z29.s, z29.s, z10.s\n"
+      "sqadd z30.s, z30.s, z4.s\n"
+      "sqadd z31.s, z31.s, z5.s\n"
+      "59:"  // Height 4: no shift correction
+      ".inst 0x44828810  // srshl z16.s, p2/M, z16.s, z0.s\n"
+      "add x19, %x[qp], %[c_offset]\n"
+      "ld1rw { z4.s }, p2/Z, [x19]\n"
+      ".inst 0x44828811  // srshl z17.s, p2/M, z17.s, z0.s\n"
+      "add x19, %x[qp], %[minval]\n"
+      ".inst 0x44828812  // srshl z18.s, p2/M, z18.s, z0.s\n"
+      "ld1rw { z5.s }, p2/Z, [x19]\n"
+      "add x19, %x[qp], %[maxval]\n"
+      ".inst 0x44828813  // srshl z19.s, p2/M, z19.s, z0.s\n"
+      "ld1rw { z6.s }, p2/Z, [x19]\n"
+      ".inst 0x44828814  // srshl z20.s, p2/M, z20.s, z0.s\n"
+      "add z16.s, z16.s, z4.s\n"
+      "add z17.s, z17.s, z4.s\n"
+      "add z18.s, z18.s, z4.s\n"
+      "add z19.s, z19.s, z4.s\n"
+      "add z20.s, z20.s, z4.s\n"
+      "smin z16.s, p2/M, z16.s, z6.s\n"
+      "smin z17.s, p2/M, z17.s, z6.s\n"
+      "smin z18.s, p2/M, z18.s, z6.s\n"
+      "smin z19.s, p2/M, z19.s, z6.s\n"
+      "smax z16.s, p2/M, z16.s, z5.s\n"
+      "smax z17.s, p2/M, z17.s, z5.s\n"
+      "smax z18.s, p2/M, z18.s, z5.s\n"
+      "smax z19.s, p2/M, z19.s, z5.s\n"
+      "smin z20.s, p2/M, z20.s, z6.s\n"
+      "uzp1 z16.h, z16.h, z17.h\n"
+      ".inst 0x44828815  // srshl z21.s, p2/M, z21.s, z0.s\n"
+      "uzp1 z17.h, z18.h, z19.h\n"
+      "smax z20.s, p2/M, z20.s, z5.s\n"
+      "uzp1 z16.b, z16.b, z17.b\n"
+      "st1b { z16.b }, p1, [x9]\n"
+      "add z21.s, z21.s, z4.s\n"
+      "addvl x9, x9, #1\n"
+      ".inst 0x44828816  // srshl z22.s, p2/M, z22.s, z0.s\n"
+      ".inst 0x44828817  // srshl z23.s, p2/M, z23.s, z0.s\n"
+      ".inst 0x44828818  // srshl z24.s, p2/M, z24.s, z0.s\n"
+      "smin z21.s, p2/M, z21.s, z6.s\n"
+      ".inst 0x44828819  // srshl z25.s, p2/M, z25.s, z0.s\n"
+      "add z22.s, z22.s, z4.s\n"
+      "add z23.s, z23.s, z4.s\n"
+      "add z24.s, z24.s, z4.s\n"
+      "add z25.s, z25.s, z4.s\n"
+      "smax z21.s, p2/M, z21.s, z5.s\n"
+      "smin z22.s, p2/M, z22.s, z6.s\n"
+      "smin z23.s, p2/M, z23.s, z6.s\n"
+      "smin z24.s, p2/M, z24.s, z6.s\n"
+      "uzp1 z20.h, z20.h, z21.h\n"
+      "smax z22.s, p2/M, z22.s, z5.s\n"
+      "smax z23.s, p2/M, z23.s, z5.s\n"
+      "smax z24.s, p2/M, z24.s, z5.s\n"
+      "smin z25.s, p2/M, z25.s, z6.s\n"
+      ".inst 0x4482881a  // srshl z26.s, p2/M, z26.s, z0.s\n"
+      "uzp1 z21.h, z22.h, z23.h\n"
+      ".inst 0x4482881b  // srshl z27.s, p2/M, z27.s, z0.s\n"
+      "uzp1 z20.b, z20.b, z21.b\n"
+      "st1b { z20.b }, p1, [x25]\n"
+      "add z26.s, z26.s, z4.s\n"
+      "addvl x25, x25, #1\n"
+      "add z27.s, z27.s, z4.s\n"
+      "smax z25.s, p2/M, z25.s, z5.s\n"
+      ".inst 0x4482881c  // srshl z28.s, p2/M, z28.s, z0.s\n"
+      "smin z26.s, p2/M, z26.s, z6.s\n"
+      "smin z27.s, p2/M, z27.s, z6.s\n"
+      "uzp1 z24.h, z24.h, z25.h\n"
+      "add z28.s, z28.s, z4.s\n"
+      "smax z26.s, p2/M, z26.s, z5.s\n"
+      "smax z27.s, p2/M, z27.s, z5.s\n"
+      "smin z28.s, p2/M, z28.s, z6.s\n"
+      ".inst 0x4482881d  // srshl z29.s, p2/M, z29.s, z0.s\n"
+      ".inst 0x4482881e  // srshl z30.s, p2/M, z30.s, z0.s\n"
+      "uzp1 z25.h, z26.h, z27.h\n"
+      "smax z28.s, p2/M, z28.s, z5.s\n"
+      "add z29.s, z29.s, z4.s\n"
+      "add z30.s, z30.s, z4.s\n"
+      "uzp1 z24.b, z24.b, z25.b\n"
+      "st1b { z24.b }, p1, [x23]\n"
+      "smin z29.s, p2/M, z29.s, z6.s\n"
+      "addvl x23, x23, #1\n"
+      "smin z30.s, p2/M, z30.s, z6.s\n"
+      ".inst 0x4482881f  // srshl z31.s, p2/M, z31.s, z0.s\n"
+      "smax z29.s, p2/M, z29.s, z5.s\n"
+      "add z31.s, z31.s, z4.s\n"
+      "smax z30.s, p2/M, z30.s, z5.s\n"
+      "uzp1 z28.h, z28.h, z29.h\n"
+      "smin z31.s, p2/M, z31.s, z6.s\n"
+      "smax z31.s, p2/M, z31.s, z5.s\n"
+      "uzp1 z29.h, z30.h, z31.h\n"
+      "uzp1 z28.b, z28.b, z29.b\n"
+      "st1b { z28.b }, p1, [x21]\n"
+      "addvl x21, x21, #1\n"
+      "60:"  // Height 4: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x12, x12, x19\n"
+      "bgt 48b\n"
+      "subs %x[M], %x[M], #0x4\n"
+      "beq 62f\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "tbz %x[flags], #3, 61f\n"
+      "add x20, x20, #0x4\n"
+      "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "b 1b\n"
+      "61:"  // Update direct input
+      "mov x19, #0x4\n"
+      "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+      "b 1b\n"
+      "62:"  // Exit
+
+      : [M] "+r" (M), [flags] "+r" (flags), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+      : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+      : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+    );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp
deleted file mode 100644
index 565832e8de..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp
+++ /dev/null
@@ -1,2137 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include <cstdint>
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool accumulate) {
-    const int K_stride = ((K + 3) / 4) * 4;
-    const long loops_count = ((K + 16) / 32) - 1;
-    K -= loops_count * 32;
-    const long regs_count = (K / 16) - 1;
-    K -= (regs_count + 1) * 16;
-    const long leftovers = K;
-    const long blocks_count = (K + 3) / 4;
-
-    int rows_to_compute;
-
-    for (int y=0; y<M; y+=rows_to_compute) {
-        const uint8_t * const a_ptr0_base = A + (y * lda);
-        const unsigned long ldab = lda * sizeof(uint8_t);
-
-        uint32_t *c_ptr0 = C + (y * ldc);
-
-        rows_to_compute = M-y;
-        if (rows_to_compute > 4) {
-            if (rows_to_compute % 4) {
-                rows_to_compute = 4 - 1;
-            } else {
-                rows_to_compute = 4;
-            }
-        }
-
-        for (int x0=0; x0<N; x0+=(4 * get_vector_length<uint32_t>())) {
-            const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<uint32_t>()));
-            long loops = loops_count;
-            long regs = regs_count;
-            long temp = 0;
-            long blocks = blocks_count;
-            const uint8_t *a_ptr0 = a_ptr0_base;
-            const uint8_t *b_ptr0 = B + (K_stride * x0);
-            const unsigned long ldcb = ldc * sizeof(uint32_t);
-
-            switch(rows_to_compute) {
-                case 1:
-                    __asm __volatile (
-                        "whilelt p6.b, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.b\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "mov z16.s, #0\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                        "mov z17.s, #0\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "mov z18.s, #0\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "mov z19.s, #0\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "udot z16.s, z12.b, z4.b[3]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "udot z17.s, z13.b, z4.b[3]\n"
-                        "udot z18.s, z14.b, z4.b[3]\n"
-                        "udot z19.s, z15.b, z4.b[3]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "udot z16.s, z12.b, z4.b[3]\n"
-                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
-                        "udot z17.s, z13.b, z4.b[3]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "udot z18.s, z14.b, z4.b[3]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "udot z19.s, z15.b, z4.b[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "udot z16.s, z12.b, z4.b[3]\n"
-                        "udot z17.s, z13.b, z4.b[3]\n"
-                        "udot z18.s, z14.b, z4.b[3]\n"
-                        "udot z19.s, z15.b, z4.b[3]\n"
-                        "5:\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
-                    );
-                    break;
-                case 2:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "whilelt p6.b, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.b\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "mov z16.s, #0\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                        "mov z17.s, #0\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                        "mov z18.s, #0\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "mov z19.s, #0\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "mov z20.s, #0\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "mov z21.s, #0\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z22.s, #0\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "mov z23.s, #0\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1w z20.s, p0/z, [c_ptr1]\n"
-                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
-                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "udot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "udot z23.s, z11.b, z1.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "udot z21.s, z13.b, z1.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "udot z21.s, z9.b, z1.b[2]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "udot z20.s, z12.b, z1.b[3]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "udot z21.s, z13.b, z1.b[3]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "udot z22.s, z14.b, z1.b[3]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "udot z23.s, z15.b, z1.b[3]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
-                        "udot z20.s, z8.b, z5.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
-                        "udot z21.s, z9.b, z5.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "udot z23.s, z11.b, z5.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "udot z20.s, z12.b, z5.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
-                        "udot z21.s, z13.b, z5.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "udot z22.s, z14.b, z5.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "udot z23.s, z15.b, z5.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "udot z20.s, z8.b, z5.b[2]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
-                        "udot z21.s, z9.b, z5.b[2]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "udot z22.s, z10.b, z5.b[2]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "udot z23.s, z11.b, z5.b[2]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "udot z16.s, z12.b, z4.b[3]\n"
-                        "udot z20.s, z12.b, z5.b[3]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "udot z17.s, z13.b, z4.b[3]\n"
-                        "udot z21.s, z13.b, z5.b[3]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "udot z18.s, z14.b, z4.b[3]\n"
-                        "udot z22.s, z14.b, z5.b[3]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "udot z19.s, z15.b, z4.b[3]\n"
-                        "udot z23.s, z15.b, z5.b[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "udot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "udot z23.s, z11.b, z1.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z21.s, z13.b, z1.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "udot z21.s, z9.b, z1.b[2]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "udot z20.s, z12.b, z1.b[3]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "udot z21.s, z13.b, z1.b[3]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "udot z22.s, z14.b, z1.b[3]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
-                        "udot z23.s, z15.b, z1.b[3]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
-                        "udot z20.s, z8.b, z5.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "udot z21.s, z9.b, z5.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "udot z23.s, z11.b, z5.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "udot z20.s, z12.b, z5.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
-                        "udot z21.s, z13.b, z5.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "udot z22.s, z14.b, z5.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "udot z23.s, z15.b, z5.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "udot z20.s, z8.b, z5.b[2]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
-                        "udot z21.s, z9.b, z5.b[2]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "udot z22.s, z10.b, z5.b[2]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "udot z23.s, z11.b, z5.b[2]\n"
-                        "udot z16.s, z12.b, z4.b[3]\n"
-                        "udot z20.s, z12.b, z5.b[3]\n"
-                        "udot z17.s, z13.b, z4.b[3]\n"
-                        "udot z21.s, z13.b, z5.b[3]\n"
-                        "udot z18.s, z14.b, z4.b[3]\n"
-                        "udot z22.s, z14.b, z5.b[3]\n"
-                        "udot z19.s, z15.b, z4.b[3]\n"
-                        "udot z23.s, z15.b, z5.b[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "udot z20.s, z8.b, z1.b[0]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "udot z23.s, z11.b, z1.b[0]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z21.s, z13.b, z1.b[1]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "udot z21.s, z9.b, z1.b[2]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "udot z20.s, z12.b, z1.b[3]\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "udot z21.s, z13.b, z1.b[3]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "udot z22.s, z14.b, z1.b[3]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "udot z23.s, z15.b, z1.b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "udot z20.s, z8.b, z1.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "udot z23.s, z11.b, z1.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z21.s, z13.b, z1.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "udot z21.s, z9.b, z1.b[2]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "udot z20.s, z12.b, z1.b[3]\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "udot z21.s, z13.b, z1.b[3]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "udot z22.s, z14.b, z1.b[3]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "udot z23.s, z15.b, z1.b[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "udot z20.s, z8.b, z5.b[0]\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
-                        "udot z21.s, z9.b, z5.b[0]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "udot z23.s, z11.b, z5.b[0]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "udot z20.s, z12.b, z5.b[1]\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
-                        "udot z21.s, z13.b, z5.b[1]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "udot z22.s, z14.b, z5.b[1]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "udot z23.s, z15.b, z5.b[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "udot z20.s, z8.b, z5.b[2]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
-                        "udot z21.s, z9.b, z5.b[2]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "udot z22.s, z10.b, z5.b[2]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "udot z23.s, z11.b, z5.b[2]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "udot z16.s, z12.b, z4.b[3]\n"
-                        "udot z20.s, z12.b, z5.b[3]\n"
-                        "udot z17.s, z13.b, z4.b[3]\n"
-                        "udot z21.s, z13.b, z5.b[3]\n"
-                        "udot z18.s, z14.b, z4.b[3]\n"
-                        "udot z22.s, z14.b, z5.b[3]\n"
-                        "udot z19.s, z15.b, z4.b[3]\n"
-                        "udot z23.s, z15.b, z5.b[3]\n"
-                        "5:\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "st1w z20.s, p0, [c_ptr1]\n"
-                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
-                    );
-                    break;
-                case 3:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "whilelt p6.b, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.b\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "mov z16.s, #0\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                        "mov z17.s, #0\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                        "mov z18.s, #0\n"
-                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                        "mov z19.s, #0\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "mov z20.s, #0\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "mov z21.s, #0\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "mov z22.s, #0\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z23.s, #0\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "mov z24.s, #0\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "mov z25.s, #0\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "mov z26.s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "mov z27.s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1w z20.s, p0/z, [c_ptr1]\n"
-                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
-                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
-                        "ld1w z24.s, p0/z, [c_ptr2]\n"
-                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
-                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
-                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "udot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "udot z24.s, z8.b, z2.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "udot z26.s, z10.b, z2.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "udot z23.s, z11.b, z1.b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "udot z27.s, z11.b, z2.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
-                        "udot z24.s, z12.b, z2.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z21.s, z13.b, z1.b[1]\n"
-                        "udot z25.s, z13.b, z2.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "udot z26.s, z14.b, z2.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
-                        "udot z27.s, z15.b, z2.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z24.s, z8.b, z2.b[2]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "udot z21.s, z9.b, z1.b[2]\n"
-                        "udot z25.s, z9.b, z2.b[2]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "udot z26.s, z10.b, z2.b[2]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
-                        "udot z27.s, z11.b, z2.b[2]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "udot z20.s, z12.b, z1.b[3]\n"
-                        "udot z24.s, z12.b, z2.b[3]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "udot z21.s, z13.b, z1.b[3]\n"
-                        "udot z25.s, z13.b, z2.b[3]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "udot z22.s, z14.b, z1.b[3]\n"
-                        "udot z26.s, z14.b, z2.b[3]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "udot z23.s, z15.b, z1.b[3]\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
-                        "udot z27.s, z15.b, z2.b[3]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
-                        "udot z20.s, z8.b, z5.b[0]\n"
-                        "udot z24.s, z8.b, z6.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
-                        "udot z21.s, z9.b, z5.b[0]\n"
-                        "udot z25.s, z9.b, z6.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
-                        "udot z26.s, z10.b, z6.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "udot z23.s, z11.b, z5.b[0]\n"
-                        "udot z27.s, z11.b, z6.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "udot z20.s, z12.b, z5.b[1]\n"
-                        "udot z24.s, z12.b, z6.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
-                        "udot z21.s, z13.b, z5.b[1]\n"
-                        "udot z25.s, z13.b, z6.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "udot z22.s, z14.b, z5.b[1]\n"
-                        "udot z26.s, z14.b, z6.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "udot z23.s, z15.b, z5.b[1]\n"
-                        "udot z27.s, z15.b, z6.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "udot z20.s, z8.b, z5.b[2]\n"
-                        "udot z24.s, z8.b, z6.b[2]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "udot z21.s, z9.b, z5.b[2]\n"
-                        "udot z25.s, z9.b, z6.b[2]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "udot z22.s, z10.b, z5.b[2]\n"
-                        "udot z26.s, z10.b, z6.b[2]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "udot z23.s, z11.b, z5.b[2]\n"
-                        "udot z27.s, z11.b, z6.b[2]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "udot z16.s, z12.b, z4.b[3]\n"
-                        "udot z20.s, z12.b, z5.b[3]\n"
-                        "udot z24.s, z12.b, z6.b[3]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "udot z17.s, z13.b, z4.b[3]\n"
-                        "udot z21.s, z13.b, z5.b[3]\n"
-                        "udot z25.s, z13.b, z6.b[3]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "udot z18.s, z14.b, z4.b[3]\n"
-                        "udot z22.s, z14.b, z5.b[3]\n"
-                        "udot z26.s, z14.b, z6.b[3]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "udot z19.s, z15.b, z4.b[3]\n"
-                        "udot z23.s, z15.b, z5.b[3]\n"
-                        "udot z27.s, z15.b, z6.b[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "udot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "udot z24.s, z8.b, z2.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "udot z26.s, z10.b, z2.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "udot z23.s, z11.b, z1.b[0]\n"
-                        "udot z27.s, z11.b, z2.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
-                        "udot z24.s, z12.b, z2.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z21.s, z13.b, z1.b[1]\n"
-                        "udot z25.s, z13.b, z2.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "udot z26.s, z14.b, z2.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
-                        "udot z27.s, z15.b, z2.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z24.s, z8.b, z2.b[2]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "udot z21.s, z9.b, z1.b[2]\n"
-                        "udot z25.s, z9.b, z2.b[2]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "udot z26.s, z10.b, z2.b[2]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
-                        "udot z27.s, z11.b, z2.b[2]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "udot z20.s, z12.b, z1.b[3]\n"
-                        "udot z24.s, z12.b, z2.b[3]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "udot z21.s, z13.b, z1.b[3]\n"
-                        "udot z25.s, z13.b, z2.b[3]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "udot z22.s, z14.b, z1.b[3]\n"
-                        "udot z26.s, z14.b, z2.b[3]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
-                        "udot z23.s, z15.b, z1.b[3]\n"
-                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
-                        "udot z27.s, z15.b, z2.b[3]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
-                        "udot z20.s, z8.b, z5.b[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "udot z24.s, z8.b, z6.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        "udot z21.s, z9.b, z5.b[0]\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        "udot z25.s, z9.b, z6.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
-                        "udot z26.s, z10.b, z6.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "udot z23.s, z11.b, z5.b[0]\n"
-                        "udot z27.s, z11.b, z6.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "udot z20.s, z12.b, z5.b[1]\n"
-                        "udot z24.s, z12.b, z6.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
-                        "udot z21.s, z13.b, z5.b[1]\n"
-                        "udot z25.s, z13.b, z6.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "udot z22.s, z14.b, z5.b[1]\n"
-                        "udot z26.s, z14.b, z6.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "udot z23.s, z15.b, z5.b[1]\n"
-                        "udot z27.s, z15.b, z6.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "udot z20.s, z8.b, z5.b[2]\n"
-                        "udot z24.s, z8.b, z6.b[2]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
-                        "udot z21.s, z9.b, z5.b[2]\n"
-                        "udot z25.s, z9.b, z6.b[2]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "udot z22.s, z10.b, z5.b[2]\n"
-                        "udot z26.s, z10.b, z6.b[2]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "udot z23.s, z11.b, z5.b[2]\n"
-                        "udot z27.s, z11.b, z6.b[2]\n"
-                        "udot z16.s, z12.b, z4.b[3]\n"
-                        "udot z20.s, z12.b, z5.b[3]\n"
-                        "udot z24.s, z12.b, z6.b[3]\n"
-                        "udot z17.s, z13.b, z4.b[3]\n"
-                        "udot z21.s, z13.b, z5.b[3]\n"
-                        "udot z25.s, z13.b, z6.b[3]\n"
-                        "udot z18.s, z14.b, z4.b[3]\n"
-                        "udot z22.s, z14.b, z5.b[3]\n"
-                        "udot z26.s, z14.b, z6.b[3]\n"
-                        "udot z19.s, z15.b, z4.b[3]\n"
-                        "udot z23.s, z15.b, z5.b[3]\n"
-                        "udot z27.s, z15.b, z6.b[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "udot z20.s, z8.b, z1.b[0]\n"
-                        "udot z24.s, z8.b, z2.b[0]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "udot z26.s, z10.b, z2.b[0]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "udot z23.s, z11.b, z1.b[0]\n"
-                        "udot z27.s, z11.b, z2.b[0]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
-                        "udot z24.s, z12.b, z2.b[1]\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z21.s, z13.b, z1.b[1]\n"
-                        "udot z25.s, z13.b, z2.b[1]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "udot z26.s, z14.b, z2.b[1]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
-                        "udot z27.s, z15.b, z2.b[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z24.s, z8.b, z2.b[2]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "udot z21.s, z9.b, z1.b[2]\n"
-                        "udot z25.s, z9.b, z2.b[2]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "udot z26.s, z10.b, z2.b[2]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
-                        "udot z27.s, z11.b, z2.b[2]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "udot z20.s, z12.b, z1.b[3]\n"
-                        "udot z24.s, z12.b, z2.b[3]\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "udot z21.s, z13.b, z1.b[3]\n"
-                        "udot z25.s, z13.b, z2.b[3]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "udot z22.s, z14.b, z1.b[3]\n"
-                        "udot z26.s, z14.b, z2.b[3]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "udot z23.s, z15.b, z1.b[3]\n"
-                        "udot z27.s, z15.b, z2.b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "udot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
-                        "udot z24.s, z8.b, z2.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "ld1rqb z6.b, p6/z, [a_ptr2]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        "udot z26.s, z10.b, z2.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        "udot z23.s, z11.b, z1.b[0]\n"
-                        "udot z27.s, z11.b, z2.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
-                        "udot z24.s, z12.b, z2.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z21.s, z13.b, z1.b[1]\n"
-                        "udot z25.s, z13.b, z2.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "udot z26.s, z14.b, z2.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
-                        "udot z27.s, z15.b, z2.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z24.s, z8.b, z2.b[2]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "udot z21.s, z9.b, z1.b[2]\n"
-                        "udot z25.s, z9.b, z2.b[2]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "udot z26.s, z10.b, z2.b[2]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
-                        "udot z27.s, z11.b, z2.b[2]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "udot z20.s, z12.b, z1.b[3]\n"
-                        "udot z24.s, z12.b, z2.b[3]\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "udot z21.s, z13.b, z1.b[3]\n"
-                        "udot z25.s, z13.b, z2.b[3]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "udot z22.s, z14.b, z1.b[3]\n"
-                        "udot z26.s, z14.b, z2.b[3]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "udot z23.s, z15.b, z1.b[3]\n"
-                        "udot z27.s, z15.b, z2.b[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "udot z20.s, z8.b, z5.b[0]\n"
-                        "udot z24.s, z8.b, z6.b[0]\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
-                        "udot z21.s, z9.b, z5.b[0]\n"
-                        "udot z25.s, z9.b, z6.b[0]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
-                        "udot z26.s, z10.b, z6.b[0]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "udot z23.s, z11.b, z5.b[0]\n"
-                        "udot z27.s, z11.b, z6.b[0]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "udot z20.s, z12.b, z5.b[1]\n"
-                        "udot z24.s, z12.b, z6.b[1]\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
-                        "udot z21.s, z13.b, z5.b[1]\n"
-                        "udot z25.s, z13.b, z6.b[1]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "udot z22.s, z14.b, z5.b[1]\n"
-                        "udot z26.s, z14.b, z6.b[1]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "udot z23.s, z15.b, z5.b[1]\n"
-                        "udot z27.s, z15.b, z6.b[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "udot z20.s, z8.b, z5.b[2]\n"
-                        "udot z24.s, z8.b, z6.b[2]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
-                        "udot z21.s, z9.b, z5.b[2]\n"
-                        "udot z25.s, z9.b, z6.b[2]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "udot z22.s, z10.b, z5.b[2]\n"
-                        "udot z26.s, z10.b, z6.b[2]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "udot z23.s, z11.b, z5.b[2]\n"
-                        "udot z27.s, z11.b, z6.b[2]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "udot z16.s, z12.b, z4.b[3]\n"
-                        "udot z20.s, z12.b, z5.b[3]\n"
-                        "udot z24.s, z12.b, z6.b[3]\n"
-                        "udot z17.s, z13.b, z4.b[3]\n"
-                        "udot z21.s, z13.b, z5.b[3]\n"
-                        "udot z25.s, z13.b, z6.b[3]\n"
-                        "udot z18.s, z14.b, z4.b[3]\n"
-                        "udot z22.s, z14.b, z5.b[3]\n"
-                        "udot z26.s, z14.b, z6.b[3]\n"
-                        "udot z19.s, z15.b, z4.b[3]\n"
-                        "udot z23.s, z15.b, z5.b[3]\n"
-                        "udot z27.s, z15.b, z6.b[3]\n"
-                        "5:\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "st1w z20.s, p0, [c_ptr1]\n"
-                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
-                        "st1w z24.s, p0, [c_ptr2]\n"
-                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
-                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
-                    );
-                    break;
-                default:
-                case 4:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "whilelt p6.b, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.b\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "mov z16.s, #0\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                        "mov z17.s, #0\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                        "mov z18.s, #0\n"
-                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                        "mov z19.s, #0\n"
-                        "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                        "mov z20.s, #0\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "mov z21.s, #0\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "mov z22.s, #0\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "mov z23.s, #0\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "mov z24.s, #0\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "mov z25.s, #0\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "mov z26.s, #0\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "mov z27.s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "mov z28.s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "mov z29.s, #0\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "mov z30.s, #0\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "mov z31.s, #0\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
-                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1w z20.s, p0/z, [c_ptr1]\n"
-                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
-                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
-                        "ld1w z24.s, p0/z, [c_ptr2]\n"
-                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
-                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
-                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
-                        "ld1w z28.s, p0/z, [c_ptr3]\n"
-                        "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
-                        "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
-                        "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "udot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "udot z24.s, z8.b, z2.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "udot z28.s, z8.b, z3.b[0]\n"
-                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "udot z29.s, z9.b, z3.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "udot z26.s, z10.b, z2.b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "udot z30.s, z10.b, z3.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        "udot z23.s, z11.b, z1.b[0]\n"
-                        "udot z27.s, z11.b, z2.b[0]\n"
-                        "udot z31.s, z11.b, z3.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
-                        "udot z24.s, z12.b, z2.b[1]\n"
-                        "udot z28.s, z12.b, z3.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z21.s, z13.b, z1.b[1]\n"
-                        "udot z25.s, z13.b, z2.b[1]\n"
-                        "udot z29.s, z13.b, z3.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "udot z26.s, z14.b, z2.b[1]\n"
-                        "udot z30.s, z14.b, z3.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
-                        "udot z27.s, z15.b, z2.b[1]\n"
-                        "udot z31.s, z15.b, z3.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z24.s, z8.b, z2.b[2]\n"
-                        "udot z28.s, z8.b, z3.b[2]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "udot z21.s, z9.b, z1.b[2]\n"
-                        "udot z25.s, z9.b, z2.b[2]\n"
-                        "udot z29.s, z9.b, z3.b[2]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "udot z26.s, z10.b, z2.b[2]\n"
-                        "udot z30.s, z10.b, z3.b[2]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
-                        "udot z27.s, z11.b, z2.b[2]\n"
-                        "udot z31.s, z11.b, z3.b[2]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "udot z20.s, z12.b, z1.b[3]\n"
-                        "udot z24.s, z12.b, z2.b[3]\n"
-                        "udot z28.s, z12.b, z3.b[3]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "udot z21.s, z13.b, z1.b[3]\n"
-                        "udot z25.s, z13.b, z2.b[3]\n"
-                        "udot z29.s, z13.b, z3.b[3]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "udot z22.s, z14.b, z1.b[3]\n"
-                        "udot z26.s, z14.b, z2.b[3]\n"
-                        "udot z30.s, z14.b, z3.b[3]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "udot z23.s, z15.b, z1.b[3]\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
-                        "udot z27.s, z15.b, z2.b[3]\n"
-                        "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
-                        "udot z31.s, z15.b, z3.b[3]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n"
-                        "udot z20.s, z8.b, z5.b[0]\n"
-                        "udot z24.s, z8.b, z6.b[0]\n"
-                        "udot z28.s, z8.b, z7.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
-                        "udot z21.s, z9.b, z5.b[0]\n"
-                        "udot z25.s, z9.b, z6.b[0]\n"
-                        "udot z29.s, z9.b, z7.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
-                        "udot z26.s, z10.b, z6.b[0]\n"
-                        "udot z30.s, z10.b, z7.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "udot z23.s, z11.b, z5.b[0]\n"
-                        "udot z27.s, z11.b, z6.b[0]\n"
-                        "udot z31.s, z11.b, z7.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "udot z20.s, z12.b, z5.b[1]\n"
-                        "udot z24.s, z12.b, z6.b[1]\n"
-                        "udot z28.s, z12.b, z7.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
-                        "udot z21.s, z13.b, z5.b[1]\n"
-                        "udot z25.s, z13.b, z6.b[1]\n"
-                        "udot z29.s, z13.b, z7.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "udot z22.s, z14.b, z5.b[1]\n"
-                        "udot z26.s, z14.b, z6.b[1]\n"
-                        "udot z30.s, z14.b, z7.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "udot z23.s, z15.b, z5.b[1]\n"
-                        "udot z27.s, z15.b, z6.b[1]\n"
-                        "udot z31.s, z15.b, z7.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "udot z20.s, z8.b, z5.b[2]\n"
-                        "udot z24.s, z8.b, z6.b[2]\n"
-                        "udot z28.s, z8.b, z7.b[2]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
-                        "udot z21.s, z9.b, z5.b[2]\n"
-                        "udot z25.s, z9.b, z6.b[2]\n"
-                        "udot z29.s, z9.b, z7.b[2]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "udot z22.s, z10.b, z5.b[2]\n"
-                        "udot z26.s, z10.b, z6.b[2]\n"
-                        "udot z30.s, z10.b, z7.b[2]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "udot z23.s, z11.b, z5.b[2]\n"
-                        "udot z27.s, z11.b, z6.b[2]\n"
-                        "udot z31.s, z11.b, z7.b[2]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "udot z16.s, z12.b, z4.b[3]\n"
-                        "udot z20.s, z12.b, z5.b[3]\n"
-                        "udot z24.s, z12.b, z6.b[3]\n"
-                        "udot z28.s, z12.b, z7.b[3]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "udot z17.s, z13.b, z4.b[3]\n"
-                        "udot z21.s, z13.b, z5.b[3]\n"
-                        "udot z25.s, z13.b, z6.b[3]\n"
-                        "udot z29.s, z13.b, z7.b[3]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "udot z18.s, z14.b, z4.b[3]\n"
-                        "udot z22.s, z14.b, z5.b[3]\n"
-                        "udot z26.s, z14.b, z6.b[3]\n"
-                        "udot z30.s, z14.b, z7.b[3]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "udot z19.s, z15.b, z4.b[3]\n"
-                        "udot z23.s, z15.b, z5.b[3]\n"
-                        "udot z27.s, z15.b, z6.b[3]\n"
-                        "udot z31.s, z15.b, z7.b[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "cbz %[regs], 4f\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "udot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "udot z24.s, z8.b, z2.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "udot z28.s, z8.b, z3.b[0]\n"
-                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "udot z29.s, z9.b, z3.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "udot z26.s, z10.b, z2.b[0]\n"
-                        "udot z30.s, z10.b, z3.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "udot z23.s, z11.b, z1.b[0]\n"
-                        "udot z27.s, z11.b, z2.b[0]\n"
-                        "udot z31.s, z11.b, z3.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
-                        "udot z24.s, z12.b, z2.b[1]\n"
-                        "udot z28.s, z12.b, z3.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z21.s, z13.b, z1.b[1]\n"
-                        "udot z25.s, z13.b, z2.b[1]\n"
-                        "udot z29.s, z13.b, z3.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "udot z26.s, z14.b, z2.b[1]\n"
-                        "udot z30.s, z14.b, z3.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
-                        "udot z27.s, z15.b, z2.b[1]\n"
-                        "udot z31.s, z15.b, z3.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z24.s, z8.b, z2.b[2]\n"
-                        "udot z28.s, z8.b, z3.b[2]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "udot z21.s, z9.b, z1.b[2]\n"
-                        "udot z25.s, z9.b, z2.b[2]\n"
-                        "udot z29.s, z9.b, z3.b[2]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "udot z26.s, z10.b, z2.b[2]\n"
-                        "udot z30.s, z10.b, z3.b[2]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
-                        "udot z27.s, z11.b, z2.b[2]\n"
-                        "udot z31.s, z11.b, z3.b[2]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "udot z20.s, z12.b, z1.b[3]\n"
-                        "udot z24.s, z12.b, z2.b[3]\n"
-                        "udot z28.s, z12.b, z3.b[3]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "udot z21.s, z13.b, z1.b[3]\n"
-                        "udot z25.s, z13.b, z2.b[3]\n"
-                        "udot z29.s, z13.b, z3.b[3]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "udot z22.s, z14.b, z1.b[3]\n"
-                        "udot z26.s, z14.b, z2.b[3]\n"
-                        "udot z30.s, z14.b, z3.b[3]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
-                        "udot z23.s, z15.b, z1.b[3]\n"
-                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
-                        "udot z27.s, z15.b, z2.b[3]\n"
-                        "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
-                        "udot z31.s, z15.b, z3.b[3]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
-                        "udot z20.s, z8.b, z5.b[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "udot z24.s, z8.b, z6.b[0]\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        "udot z28.s, z8.b, z7.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        "udot z21.s, z9.b, z5.b[0]\n"
-                        "addvl a_ptr3, a_ptr3, #2\n"
-                        "udot z25.s, z9.b, z6.b[0]\n"
-                        "udot z29.s, z9.b, z7.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
-                        "udot z26.s, z10.b, z6.b[0]\n"
-                        "udot z30.s, z10.b, z7.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "udot z23.s, z11.b, z5.b[0]\n"
-                        "udot z27.s, z11.b, z6.b[0]\n"
-                        "udot z31.s, z11.b, z7.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "udot z20.s, z12.b, z5.b[1]\n"
-                        "udot z24.s, z12.b, z6.b[1]\n"
-                        "udot z28.s, z12.b, z7.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
-                        "udot z21.s, z13.b, z5.b[1]\n"
-                        "udot z25.s, z13.b, z6.b[1]\n"
-                        "udot z29.s, z13.b, z7.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "udot z22.s, z14.b, z5.b[1]\n"
-                        "udot z26.s, z14.b, z6.b[1]\n"
-                        "udot z30.s, z14.b, z7.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "udot z23.s, z15.b, z5.b[1]\n"
-                        "udot z27.s, z15.b, z6.b[1]\n"
-                        "udot z31.s, z15.b, z7.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "udot z20.s, z8.b, z5.b[2]\n"
-                        "udot z24.s, z8.b, z6.b[2]\n"
-                        "udot z28.s, z8.b, z7.b[2]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
-                        "udot z21.s, z9.b, z5.b[2]\n"
-                        "udot z25.s, z9.b, z6.b[2]\n"
-                        "udot z29.s, z9.b, z7.b[2]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "udot z22.s, z10.b, z5.b[2]\n"
-                        "udot z26.s, z10.b, z6.b[2]\n"
-                        "udot z30.s, z10.b, z7.b[2]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "udot z23.s, z11.b, z5.b[2]\n"
-                        "udot z27.s, z11.b, z6.b[2]\n"
-                        "udot z31.s, z11.b, z7.b[2]\n"
-                        "udot z16.s, z12.b, z4.b[3]\n"
-                        "udot z20.s, z12.b, z5.b[3]\n"
-                        "udot z24.s, z12.b, z6.b[3]\n"
-                        "udot z28.s, z12.b, z7.b[3]\n"
-                        "udot z17.s, z13.b, z4.b[3]\n"
-                        "udot z21.s, z13.b, z5.b[3]\n"
-                        "udot z25.s, z13.b, z6.b[3]\n"
-                        "udot z29.s, z13.b, z7.b[3]\n"
-                        "udot z18.s, z14.b, z4.b[3]\n"
-                        "udot z22.s, z14.b, z5.b[3]\n"
-                        "udot z26.s, z14.b, z6.b[3]\n"
-                        "udot z30.s, z14.b, z7.b[3]\n"
-                        "udot z19.s, z15.b, z4.b[3]\n"
-                        "udot z23.s, z15.b, z5.b[3]\n"
-                        "udot z27.s, z15.b, z6.b[3]\n"
-                        "udot z31.s, z15.b, z7.b[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "udot z20.s, z8.b, z1.b[0]\n"
-                        "udot z24.s, z8.b, z2.b[0]\n"
-                        "udot z28.s, z8.b, z3.b[0]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "udot z29.s, z9.b, z3.b[0]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "udot z26.s, z10.b, z2.b[0]\n"
-                        "udot z30.s, z10.b, z3.b[0]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "udot z23.s, z11.b, z1.b[0]\n"
-                        "udot z27.s, z11.b, z2.b[0]\n"
-                        "udot z31.s, z11.b, z3.b[0]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
-                        "udot z24.s, z12.b, z2.b[1]\n"
-                        "udot z28.s, z12.b, z3.b[1]\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z21.s, z13.b, z1.b[1]\n"
-                        "udot z25.s, z13.b, z2.b[1]\n"
-                        "udot z29.s, z13.b, z3.b[1]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "udot z26.s, z14.b, z2.b[1]\n"
-                        "udot z30.s, z14.b, z3.b[1]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
-                        "udot z27.s, z15.b, z2.b[1]\n"
-                        "udot z31.s, z15.b, z3.b[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z24.s, z8.b, z2.b[2]\n"
-                        "udot z28.s, z8.b, z3.b[2]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "udot z21.s, z9.b, z1.b[2]\n"
-                        "udot z25.s, z9.b, z2.b[2]\n"
-                        "udot z29.s, z9.b, z3.b[2]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "udot z26.s, z10.b, z2.b[2]\n"
-                        "udot z30.s, z10.b, z3.b[2]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
-                        "udot z27.s, z11.b, z2.b[2]\n"
-                        "udot z31.s, z11.b, z3.b[2]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "udot z20.s, z12.b, z1.b[3]\n"
-                        "udot z24.s, z12.b, z2.b[3]\n"
-                        "udot z28.s, z12.b, z3.b[3]\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "udot z21.s, z13.b, z1.b[3]\n"
-                        "udot z25.s, z13.b, z2.b[3]\n"
-                        "udot z29.s, z13.b, z3.b[3]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "udot z22.s, z14.b, z1.b[3]\n"
-                        "udot z26.s, z14.b, z2.b[3]\n"
-                        "udot z30.s, z14.b, z3.b[3]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "udot z23.s, z15.b, z1.b[3]\n"
-                        "udot z27.s, z15.b, z2.b[3]\n"
-                        "udot z31.s, z15.b, z3.b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "udot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
-                        "udot z24.s, z8.b, z2.b[0]\n"
-                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
-                        "udot z28.s, z8.b, z3.b[0]\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z6.b, p6/z, [a_ptr2]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "ld1rqb z7.b, p6/z, [a_ptr3]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "udot z29.s, z9.b, z3.b[0]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        "udot z26.s, z10.b, z2.b[0]\n"
-                        "addvl a_ptr3, a_ptr3, #1\n"
-                        "udot z30.s, z10.b, z3.b[0]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "udot z23.s, z11.b, z1.b[0]\n"
-                        "udot z27.s, z11.b, z2.b[0]\n"
-                        "udot z31.s, z11.b, z3.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
-                        "udot z24.s, z12.b, z2.b[1]\n"
-                        "udot z28.s, z12.b, z3.b[1]\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z21.s, z13.b, z1.b[1]\n"
-                        "udot z25.s, z13.b, z2.b[1]\n"
-                        "udot z29.s, z13.b, z3.b[1]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "udot z26.s, z14.b, z2.b[1]\n"
-                        "udot z30.s, z14.b, z3.b[1]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
-                        "udot z27.s, z15.b, z2.b[1]\n"
-                        "udot z31.s, z15.b, z3.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z24.s, z8.b, z2.b[2]\n"
-                        "udot z28.s, z8.b, z3.b[2]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "udot z21.s, z9.b, z1.b[2]\n"
-                        "udot z25.s, z9.b, z2.b[2]\n"
-                        "udot z29.s, z9.b, z3.b[2]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "udot z26.s, z10.b, z2.b[2]\n"
-                        "udot z30.s, z10.b, z3.b[2]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
-                        "udot z27.s, z11.b, z2.b[2]\n"
-                        "udot z31.s, z11.b, z3.b[2]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "udot z20.s, z12.b, z1.b[3]\n"
-                        "udot z24.s, z12.b, z2.b[3]\n"
-                        "udot z28.s, z12.b, z3.b[3]\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "udot z21.s, z13.b, z1.b[3]\n"
-                        "udot z25.s, z13.b, z2.b[3]\n"
-                        "udot z29.s, z13.b, z3.b[3]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "udot z22.s, z14.b, z1.b[3]\n"
-                        "udot z26.s, z14.b, z2.b[3]\n"
-                        "udot z30.s, z14.b, z3.b[3]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "udot z23.s, z15.b, z1.b[3]\n"
-                        "udot z27.s, z15.b, z2.b[3]\n"
-                        "udot z31.s, z15.b, z3.b[3]\n"
-                        "cbz %[blocks], 5f\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "udot z20.s, z8.b, z5.b[0]\n"
-                        "udot z24.s, z8.b, z6.b[0]\n"
-                        "udot z28.s, z8.b, z7.b[0]\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
-                        "udot z21.s, z9.b, z5.b[0]\n"
-                        "udot z25.s, z9.b, z6.b[0]\n"
-                        "udot z29.s, z9.b, z7.b[0]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
-                        "udot z26.s, z10.b, z6.b[0]\n"
-                        "udot z30.s, z10.b, z7.b[0]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "udot z23.s, z11.b, z5.b[0]\n"
-                        "udot z27.s, z11.b, z6.b[0]\n"
-                        "udot z31.s, z11.b, z7.b[0]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "udot z20.s, z12.b, z5.b[1]\n"
-                        "udot z24.s, z12.b, z6.b[1]\n"
-                        "udot z28.s, z12.b, z7.b[1]\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
-                        "udot z21.s, z13.b, z5.b[1]\n"
-                        "udot z25.s, z13.b, z6.b[1]\n"
-                        "udot z29.s, z13.b, z7.b[1]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "udot z22.s, z14.b, z5.b[1]\n"
-                        "udot z26.s, z14.b, z6.b[1]\n"
-                        "udot z30.s, z14.b, z7.b[1]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "udot z23.s, z15.b, z5.b[1]\n"
-                        "udot z27.s, z15.b, z6.b[1]\n"
-                        "udot z31.s, z15.b, z7.b[1]\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "udot z20.s, z8.b, z5.b[2]\n"
-                        "udot z24.s, z8.b, z6.b[2]\n"
-                        "udot z28.s, z8.b, z7.b[2]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
-                        "udot z21.s, z9.b, z5.b[2]\n"
-                        "udot z25.s, z9.b, z6.b[2]\n"
-                        "udot z29.s, z9.b, z7.b[2]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "udot z22.s, z10.b, z5.b[2]\n"
-                        "udot z26.s, z10.b, z6.b[2]\n"
-                        "udot z30.s, z10.b, z7.b[2]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "udot z23.s, z11.b, z5.b[2]\n"
-                        "udot z27.s, z11.b, z6.b[2]\n"
-                        "udot z31.s, z11.b, z7.b[2]\n"
-                        "b.eq 5f\n"
-                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        "udot z16.s, z12.b, z4.b[3]\n"
-                        "udot z20.s, z12.b, z5.b[3]\n"
-                        "udot z24.s, z12.b, z6.b[3]\n"
-                        "udot z28.s, z12.b, z7.b[3]\n"
-                        "udot z17.s, z13.b, z4.b[3]\n"
-                        "udot z21.s, z13.b, z5.b[3]\n"
-                        "udot z25.s, z13.b, z6.b[3]\n"
-                        "udot z29.s, z13.b, z7.b[3]\n"
-                        "udot z18.s, z14.b, z4.b[3]\n"
-                        "udot z22.s, z14.b, z5.b[3]\n"
-                        "udot z26.s, z14.b, z6.b[3]\n"
-                        "udot z30.s, z14.b, z7.b[3]\n"
-                        "udot z19.s, z15.b, z4.b[3]\n"
-                        "udot z23.s, z15.b, z5.b[3]\n"
-                        "udot z27.s, z15.b, z6.b[3]\n"
-                        "udot z31.s, z15.b, z7.b[3]\n"
-                        "5:\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "st1w z20.s, p0, [c_ptr1]\n"
-                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
-                        "st1w z24.s, p0, [c_ptr2]\n"
-                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
-                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
-                        "st1w z28.s, p0, [c_ptr3]\n"
-                        "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
-                        "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
-                        "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
-                    );
-                    break;
-            }
-
-        }
-    }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
new file mode 100644
index 0000000000..af9de4a6eb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __ARM_FEATURE_SVE
+
+#include "../std_transforms_sve.hpp"
+
+#define ARGLIST  /* Kernel parameter types; names in the comments follow the definition in generic.cpp. */ \
+   unsigned int, const unsigned int *, /* num_strings, string_lengths */ \
+   IndirectInputArg<uint8_t>, /* A_arg */ \
+   size_t, size_t, /* M, N */ \
+   const uint8_t *, /* B_ptr */ \
+   IndirectOutputArg<uint32_t>, /* output_arg */ \
+   const uint32_t *, Activation, bool /* two arguments left unnamed by generic.cpp, then accumulate */
+
+namespace arm_gemm
+{
+
+// Kernel entry point; the definition lives in sve_hybrid_u8u32_dot_6x4VL/generic.cpp.
+void sve_hybrid_u8u32_dot_6x4VL( ARGLIST );
+
+class cls_sve_hybrid_u8u32_dot_6x4VL // Describes the u8 -> u32 hybrid dot-product kernel: element types, blocking queries, entry point.
+{
+public:
+    typedef uint8_t operand_type; // Input element type (ARGLIST passes IndirectInputArg<uint8_t> and const uint8_t *).
+    typedef uint32_t result_type; // Output element type (ARGLIST passes IndirectOutputArg<uint32_t>).
+
+    typedef void (*kern_type)( ARGLIST ); // Function-pointer type for any kernel with the ARGLIST signature.
+
+    /* Kernel blocking parameters, exposed as static queries. */
+    static constexpr unsigned int out_height() // NOTE(review): presumably output rows per block (the "6" in 6x4VL) - confirm.
+    {
+        return 6;
+    }
+
+    static unsigned int out_width() // 4 x get_vector_length<uint32_t>(); unlike its siblings this query is not constexpr.
+    {
+        return get_vector_length<uint32_t>() * 4;
+    }
+
+    static constexpr unsigned int k_unroll() // NOTE(review): presumably the K granularity (4 bytes per 32-bit udot lane) - confirm.
+    {
+        return 4;
+    }
+
+    static constexpr bool supports_accumulate() // Always true; the trailing bool of ARGLIST is named accumulate in generic.cpp.
+    {
+        return true;
+    }
+
+    StdTransformsSVE<operand_type, result_type, 6, 4, 4> transforms = {}; // NOTE(review): 6, 4, 4 presumably mirror height / width in vectors / k_unroll - confirm.
+
+    // Kernel to call; defaults to the generic implementation declared above.
+    kern_type kernel=sve_hybrid_u8u32_dot_6x4VL;
+
+    cls_sve_hybrid_u8u32_dot_6x4VL(const CPUInfo *) // The CPUInfo pointer is accepted but unused; kernel keeps its default.
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
new file mode 100644
index 0000000000..fc8ce636dd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
@@ -0,0 +1,1904 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_hybrid_u8u32_dot_6x4VL (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+    size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint32_t> output_arg,
+    const uint32_t *, Activation, bool accumulate
+)
+{
+    struct KernelArgs {
+        unsigned int num_strings = {};
+        const unsigned int *string_lengths = {};
+        size_t N = {};
+        const uint8_t *B_ptr = {};
+        size_t output_offset = {};
+        size_t input_initial_col = {};
+        size_t input_offset = {};
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    if (accumulate) {
+        flags |= 0x1;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    __asm__ __volatile__(
+      "ptrue p5.b\n"
+      "1:"  // Row loop
+      "cmp %x[M], #0x6\n"
+      "bge 61f\n"
+      "cmp %x[M], #0x4\n"
+      "bgt 49f\n"
+      "beq 37f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 25f\n"
+      "beq 13f\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 2f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "b 3f\n"
+      "2:"  // Height 1: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "3:"  // Height 1: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p3.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p2.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p1.s, x19, x15\n"
+      "tbz %x[flags], #0, 4f\n"
+      "ld1w { z8.s }, p4/Z, [x13]\n"
+      "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+      "b 5f\n"
+      "4:"  // Height 1: no accumulate
+      "mov z8.s, #0x0\n"
+      "mov z9.s, #0x0\n"
+      "mov z10.s, #0x0\n"
+      "mov z11.s, #0x0\n"
+      "5:"  // Height 1: setup done
+      "mov x12, #0x0\n"
+      "6:"  // Height 1: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 7f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "cbnz x12, 8f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "b 8f\n"
+      "7:"  // Height 1: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "8:"  // Height 1: input setup done
+      "cmp x11, #0x10\n"
+      "ble 10f\n"
+      "9:"  // Height 1: Multiply loop: Main loop head
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "sub x11, x11, #0x10\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "udot z8.s, z6.b, z0.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "add x10, x10, #0x10\n"
+      "udot z9.s, z7.b, z0.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "cmp x11, #0x10\n"
+      "udot z10.s, z6.b, z0.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "udot z11.s, z7.b, z0.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+      "udot z8.s, z6.b, z0.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+      "addvl x14, x14, #16\n"
+      "udot z10.s, z6.b, z0.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+      "udot z11.s, z7.b, z0.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+      "udot z8.s, z6.b, z0.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+      "udot z10.s, z6.b, z0.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+      "udot z11.s, z7.b, z0.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+      "udot z8.s, z6.b, z0.b[3]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+      "udot z10.s, z6.b, z0.b[3]\n"
+      "udot z11.s, z7.b, z0.b[3]\n"
+      "bgt 9b\n"
+      "10:"  // Height 1: Multiply loop: Single iteration only
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "udot z8.s, z6.b, z0.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "add x10, x10, #0x10\n"
+      "udot z9.s, z7.b, z0.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "udot z10.s, z6.b, z0.b[0]\n"
+      "udot z11.s, z7.b, z0.b[0]\n"
+      "ble 11f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "udot z8.s, z6.b, z0.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "udot z9.s, z7.b, z0.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "udot z10.s, z6.b, z0.b[1]\n"
+      "addvl x14, x14, #4\n"
+      "udot z11.s, z7.b, z0.b[1]\n"
+      "ble 11f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "udot z8.s, z6.b, z0.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "udot z9.s, z7.b, z0.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "udot z10.s, z6.b, z0.b[2]\n"
+      "addvl x14, x14, #4\n"
+      "udot z11.s, z7.b, z0.b[2]\n"
+      "ble 11f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "udot z8.s, z6.b, z0.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "udot z10.s, z6.b, z0.b[3]\n"
+      "udot z11.s, z7.b, z0.b[3]\n"
+      "11:"  // Height 1: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 6b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "st1w { z8.s }, p4, [x13]\n"
+      "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+      "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+      "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "12:"  // Height 1: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x15, x15, x19\n"
+      "bgt 3b\n"
+      "b 74f\n"
+      "13:"  // Height 2
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 14f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "b 15f\n"
+      "14:"  // Height 2: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "15:"  // Height 2: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p3.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p2.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p1.s, x19, x15\n"
+      "tbz %x[flags], #0, 16f\n"
+      "ld1w { z8.s }, p4/Z, [x13]\n"
+      "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x9]\n"
+      "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+      "b 17f\n"
+      "16:"  // Height 2: no accumulate
+      "mov z8.s, #0x0\n"
+      "mov z9.s, #0x0\n"
+      "mov z10.s, #0x0\n"
+      "mov z11.s, #0x0\n"
+      "mov z12.s, #0x0\n"
+      "mov z13.s, #0x0\n"
+      "mov z14.s, #0x0\n"
+      "mov z15.s, #0x0\n"
+      "17:"  // Height 2: setup done
+      "mov x12, #0x0\n"
+      "18:"  // Height 2: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 19f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "cbnz x12, 20f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "b 20f\n"
+      "19:"  // Height 2: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "20:"  // Height 2: input setup done
+      "cmp x11, #0x10\n"
+      "ble 22f\n"
+      "21:"  // Height 2: Multiply loop: Main loop head
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "sub x11, x11, #0x10\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "udot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "udot z9.s, z7.b, z0.b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x28, x28, #0x10\n"
+      "udot z12.s, z6.b, z1.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "cmp x11, #0x10\n"
+      "udot z13.s, z7.b, z1.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "udot z10.s, z6.b, z0.b[0]\n"
+      "udot z14.s, z6.b, z1.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+      "udot z11.s, z7.b, z0.b[0]\n"
+      "udot z15.s, z7.b, z1.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+      "udot z8.s, z6.b, z0.b[1]\n"
+      "udot z12.s, z6.b, z1.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[1]\n"
+      "udot z13.s, z7.b, z1.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+      "addvl x14, x14, #16\n"
+      "udot z10.s, z6.b, z0.b[1]\n"
+      "udot z14.s, z6.b, z1.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+      "udot z11.s, z7.b, z0.b[1]\n"
+      "udot z15.s, z7.b, z1.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+      "udot z8.s, z6.b, z0.b[2]\n"
+      "udot z12.s, z6.b, z1.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[2]\n"
+      "udot z13.s, z7.b, z1.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+      "udot z10.s, z6.b, z0.b[2]\n"
+      "udot z14.s, z6.b, z1.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+      "udot z11.s, z7.b, z0.b[2]\n"
+      "udot z15.s, z7.b, z1.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+      "udot z8.s, z6.b, z0.b[3]\n"
+      "udot z12.s, z6.b, z1.b[3]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[3]\n"
+      "udot z13.s, z7.b, z1.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+      "udot z10.s, z6.b, z0.b[3]\n"
+      "udot z14.s, z6.b, z1.b[3]\n"
+      "udot z11.s, z7.b, z0.b[3]\n"
+      "udot z15.s, z7.b, z1.b[3]\n"
+      "bgt 21b\n"
+      "22:"  // Height 2: Multiply loop: Single iteration only
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "udot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "udot z9.s, z7.b, z0.b[0]\n"
+      "add x28, x28, #0x10\n"
+      "udot z12.s, z6.b, z1.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "udot z13.s, z7.b, z1.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "udot z10.s, z6.b, z0.b[0]\n"
+      "udot z14.s, z6.b, z1.b[0]\n"
+      "udot z11.s, z7.b, z0.b[0]\n"
+      "udot z15.s, z7.b, z1.b[0]\n"
+      "ble 23f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "udot z8.s, z6.b, z0.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "udot z12.s, z6.b, z1.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[1]\n"
+      "udot z13.s, z7.b, z1.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "udot z10.s, z6.b, z0.b[1]\n"
+      "udot z14.s, z6.b, z1.b[1]\n"
+      "udot z11.s, z7.b, z0.b[1]\n"
+      "udot z15.s, z7.b, z1.b[1]\n"
+      "ble 23f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "udot z8.s, z6.b, z0.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "udot z12.s, z6.b, z1.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[2]\n"
+      "udot z13.s, z7.b, z1.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "udot z10.s, z6.b, z0.b[2]\n"
+      "udot z14.s, z6.b, z1.b[2]\n"
+      "udot z11.s, z7.b, z0.b[2]\n"
+      "udot z15.s, z7.b, z1.b[2]\n"
+      "ble 23f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "udot z8.s, z6.b, z0.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "udot z12.s, z6.b, z1.b[3]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[3]\n"
+      "udot z13.s, z7.b, z1.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "udot z10.s, z6.b, z0.b[3]\n"
+      "udot z14.s, z6.b, z1.b[3]\n"
+      "udot z11.s, z7.b, z0.b[3]\n"
+      "udot z15.s, z7.b, z1.b[3]\n"
+      "23:"  // Height 2: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 18b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "st1w { z8.s }, p4, [x13]\n"
+      "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+      "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+      "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "st1w { z12.s }, p4, [x9]\n"
+      "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+      "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+      "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+      "addvl x9, x9, #4\n"
+      "24:"  // Height 2: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x15, x15, x19\n"
+      "bgt 15b\n"
+      "b 74f\n"
+      "25:"  // Height 3
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 26f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "add x27, x27, x19, LSL #2\n"
+      "b 27f\n"
+      "26:"  // Height 3: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "27:"  // Height 3: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p3.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p2.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p1.s, x19, x15\n"
+      "tbz %x[flags], #0, 28f\n"
+      "ld1w { z8.s }, p4/Z, [x13]\n"
+      "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x9]\n"
+      "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x27]\n"
+      "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+      "b 29f\n"
+      "28:"  // Height 3: no accumulate
+      "mov z8.s, #0x0\n"
+      "mov z9.s, #0x0\n"
+      "mov z10.s, #0x0\n"
+      "mov z11.s, #0x0\n"
+      "mov z12.s, #0x0\n"
+      "mov z13.s, #0x0\n"
+      "mov z14.s, #0x0\n"
+      "mov z15.s, #0x0\n"
+      "mov z16.s, #0x0\n"
+      "mov z17.s, #0x0\n"
+      "mov z18.s, #0x0\n"
+      "mov z19.s, #0x0\n"
+      "29:"  // Height 3: setup done
+      "mov x12, #0x0\n"
+      "30:"  // Height 3: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 31f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "cbnz x12, 32f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "add x26, x26, x19\n"
+      "b 32f\n"
+      "31:"  // Height 3: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "add x26, x28, x19\n"
+      "32:"  // Height 3: input setup done
+      "cmp x11, #0x10\n"
+      "ble 34f\n"
+      "33:"  // Height 3: Multiply loop: Main loop head
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "sub x11, x11, #0x10\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "udot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "udot z9.s, z7.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "udot z12.s, z6.b, z1.b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x26, x26, #0x10\n"
+      "udot z16.s, z6.b, z2.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "cmp x11, #0x10\n"
+      "udot z13.s, z7.b, z1.b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "udot z17.s, z7.b, z2.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "udot z10.s, z6.b, z0.b[0]\n"
+      "udot z14.s, z6.b, z1.b[0]\n"
+      "udot z18.s, z6.b, z2.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+      "udot z11.s, z7.b, z0.b[0]\n"
+      "udot z15.s, z7.b, z1.b[0]\n"
+      "udot z19.s, z7.b, z2.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+      "udot z8.s, z6.b, z0.b[1]\n"
+      "udot z12.s, z6.b, z1.b[1]\n"
+      "udot z16.s, z6.b, z2.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[1]\n"
+      "udot z13.s, z7.b, z1.b[1]\n"
+      "udot z17.s, z7.b, z2.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+      "addvl x14, x14, #16\n"
+      "udot z10.s, z6.b, z0.b[1]\n"
+      "udot z14.s, z6.b, z1.b[1]\n"
+      "udot z18.s, z6.b, z2.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+      "udot z11.s, z7.b, z0.b[1]\n"
+      "udot z15.s, z7.b, z1.b[1]\n"
+      "udot z19.s, z7.b, z2.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+      "udot z8.s, z6.b, z0.b[2]\n"
+      "udot z12.s, z6.b, z1.b[2]\n"
+      "udot z16.s, z6.b, z2.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[2]\n"
+      "udot z13.s, z7.b, z1.b[2]\n"
+      "udot z17.s, z7.b, z2.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+      "udot z10.s, z6.b, z0.b[2]\n"
+      "udot z14.s, z6.b, z1.b[2]\n"
+      "udot z18.s, z6.b, z2.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+      "udot z11.s, z7.b, z0.b[2]\n"
+      "udot z15.s, z7.b, z1.b[2]\n"
+      "udot z19.s, z7.b, z2.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+      "udot z8.s, z6.b, z0.b[3]\n"
+      "udot z12.s, z6.b, z1.b[3]\n"
+      "udot z16.s, z6.b, z2.b[3]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[3]\n"
+      "udot z13.s, z7.b, z1.b[3]\n"
+      "udot z17.s, z7.b, z2.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+      "udot z10.s, z6.b, z0.b[3]\n"
+      "udot z14.s, z6.b, z1.b[3]\n"
+      "udot z18.s, z6.b, z2.b[3]\n"
+      "udot z11.s, z7.b, z0.b[3]\n"
+      "udot z15.s, z7.b, z1.b[3]\n"
+      "udot z19.s, z7.b, z2.b[3]\n"
+      "bgt 33b\n"
+      "34:"  // Height 3: Multiply loop: Single iteration only
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "udot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "udot z9.s, z7.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "udot z12.s, z6.b, z1.b[0]\n"
+      "add x26, x26, #0x10\n"
+      "udot z13.s, z7.b, z1.b[0]\n"
+      "udot z16.s, z6.b, z2.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "udot z17.s, z7.b, z2.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "udot z10.s, z6.b, z0.b[0]\n"
+      "udot z14.s, z6.b, z1.b[0]\n"
+      "udot z18.s, z6.b, z2.b[0]\n"
+      "udot z11.s, z7.b, z0.b[0]\n"
+      "udot z15.s, z7.b, z1.b[0]\n"
+      "udot z19.s, z7.b, z2.b[0]\n"
+      "ble 35f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "udot z8.s, z6.b, z0.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "udot z12.s, z6.b, z1.b[1]\n"
+      "udot z16.s, z6.b, z2.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[1]\n"
+      "udot z13.s, z7.b, z1.b[1]\n"
+      "udot z17.s, z7.b, z2.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "udot z10.s, z6.b, z0.b[1]\n"
+      "udot z14.s, z6.b, z1.b[1]\n"
+      "udot z18.s, z6.b, z2.b[1]\n"
+      "udot z11.s, z7.b, z0.b[1]\n"
+      "udot z15.s, z7.b, z1.b[1]\n"
+      "udot z19.s, z7.b, z2.b[1]\n"
+      "ble 35f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "udot z8.s, z6.b, z0.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "udot z12.s, z6.b, z1.b[2]\n"
+      "udot z16.s, z6.b, z2.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[2]\n"
+      "udot z13.s, z7.b, z1.b[2]\n"
+      "udot z17.s, z7.b, z2.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "udot z10.s, z6.b, z0.b[2]\n"
+      "udot z14.s, z6.b, z1.b[2]\n"
+      "udot z18.s, z6.b, z2.b[2]\n"
+      "udot z11.s, z7.b, z0.b[2]\n"
+      "udot z15.s, z7.b, z1.b[2]\n"
+      "udot z19.s, z7.b, z2.b[2]\n"
+      "ble 35f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "udot z8.s, z6.b, z0.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "udot z12.s, z6.b, z1.b[3]\n"
+      "udot z16.s, z6.b, z2.b[3]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[3]\n"
+      "udot z13.s, z7.b, z1.b[3]\n"
+      "udot z17.s, z7.b, z2.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "udot z10.s, z6.b, z0.b[3]\n"
+      "udot z14.s, z6.b, z1.b[3]\n"
+      "udot z18.s, z6.b, z2.b[3]\n"
+      "udot z11.s, z7.b, z0.b[3]\n"
+      "udot z15.s, z7.b, z1.b[3]\n"
+      "udot z19.s, z7.b, z2.b[3]\n"
+      "35:"  // Height 3: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 30b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "st1w { z8.s }, p4, [x13]\n"
+      "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+      "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+      "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "st1w { z12.s }, p4, [x9]\n"
+      "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+      "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+      "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+      "addvl x9, x9, #4\n"
+      "st1w { z16.s }, p4, [x27]\n"
+      "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+      "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+      "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+      "addvl x27, x27, #4\n"
+      "36:"  // Height 3: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x15, x15, x19\n"
+      "bgt 27b\n"
+      "b 74f\n"
+      "37:"  // Height 4
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 38f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "add x27, x27, x19, LSL #2\n"
+      "add x25, x25, x19, LSL #2\n"
+      "b 39f\n"
+      "38:"  // Height 4: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "39:"  // Height 4: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p3.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p2.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p1.s, x19, x15\n"
+      "tbz %x[flags], #0, 40f\n"
+      "ld1w { z8.s }, p4/Z, [x13]\n"
+      "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x9]\n"
+      "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x27]\n"
+      "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x25]\n"
+      "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+      "b 41f\n"
+      "40:"  // Height 4: no accumulate
+      "mov z8.s, #0x0\n"
+      "mov z9.s, #0x0\n"
+      "mov z10.s, #0x0\n"
+      "mov z11.s, #0x0\n"
+      "mov z12.s, #0x0\n"
+      "mov z13.s, #0x0\n"
+      "mov z14.s, #0x0\n"
+      "mov z15.s, #0x0\n"
+      "mov z16.s, #0x0\n"
+      "mov z17.s, #0x0\n"
+      "mov z18.s, #0x0\n"
+      "mov z19.s, #0x0\n"
+      "mov z20.s, #0x0\n"
+      "mov z21.s, #0x0\n"
+      "mov z22.s, #0x0\n"
+      "mov z23.s, #0x0\n"
+      "41:"  // Height 4: setup done
+      "mov x12, #0x0\n"
+      "42:"  // Height 4: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 43f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "cbnz x12, 44f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "b 44f\n"
+      "43:"  // Height 4: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "add x26, x28, x19\n"
+      "add x24, x26, x19\n"
+      "44:"  // Height 4: input setup done
+      "cmp x11, #0x10\n"
+      "ble 46f\n"
+      "45:"  // Height 4: Multiply loop: Main loop head
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "sub x11, x11, #0x10\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "udot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "udot z9.s, z7.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "udot z12.s, z6.b, z1.b[0]\n"
+      "ld1rqb { z3.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "udot z16.s, z6.b, z2.b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x24, x24, #0x10\n"
+      "udot z13.s, z7.b, z1.b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "cmp x11, #0x10\n"
+      "udot z20.s, z6.b, z3.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "udot z17.s, z7.b, z2.b[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "udot z21.s, z7.b, z3.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "udot z10.s, z6.b, z0.b[0]\n"
+      "udot z14.s, z6.b, z1.b[0]\n"
+      "udot z18.s, z6.b, z2.b[0]\n"
+      "udot z22.s, z6.b, z3.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+      "udot z11.s, z7.b, z0.b[0]\n"
+      "udot z15.s, z7.b, z1.b[0]\n"
+      "udot z19.s, z7.b, z2.b[0]\n"
+      "udot z23.s, z7.b, z3.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+      "udot z8.s, z6.b, z0.b[1]\n"
+      "udot z12.s, z6.b, z1.b[1]\n"
+      "udot z16.s, z6.b, z2.b[1]\n"
+      "udot z20.s, z6.b, z3.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[1]\n"
+      "udot z13.s, z7.b, z1.b[1]\n"
+      "udot z17.s, z7.b, z2.b[1]\n"
+      "udot z21.s, z7.b, z3.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+      "addvl x14, x14, #16\n"
+      "udot z10.s, z6.b, z0.b[1]\n"
+      "udot z14.s, z6.b, z1.b[1]\n"
+      "udot z18.s, z6.b, z2.b[1]\n"
+      "udot z22.s, z6.b, z3.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+      "udot z11.s, z7.b, z0.b[1]\n"
+      "udot z15.s, z7.b, z1.b[1]\n"
+      "udot z19.s, z7.b, z2.b[1]\n"
+      "udot z23.s, z7.b, z3.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+      "udot z8.s, z6.b, z0.b[2]\n"
+      "udot z12.s, z6.b, z1.b[2]\n"
+      "udot z16.s, z6.b, z2.b[2]\n"
+      "udot z20.s, z6.b, z3.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[2]\n"
+      "udot z13.s, z7.b, z1.b[2]\n"
+      "udot z17.s, z7.b, z2.b[2]\n"
+      "udot z21.s, z7.b, z3.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+      "udot z10.s, z6.b, z0.b[2]\n"
+      "udot z14.s, z6.b, z1.b[2]\n"
+      "udot z18.s, z6.b, z2.b[2]\n"
+      "udot z22.s, z6.b, z3.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+      "udot z11.s, z7.b, z0.b[2]\n"
+      "udot z15.s, z7.b, z1.b[2]\n"
+      "udot z19.s, z7.b, z2.b[2]\n"
+      "udot z23.s, z7.b, z3.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+      "udot z8.s, z6.b, z0.b[3]\n"
+      "udot z12.s, z6.b, z1.b[3]\n"
+      "udot z16.s, z6.b, z2.b[3]\n"
+      "udot z20.s, z6.b, z3.b[3]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[3]\n"
+      "udot z13.s, z7.b, z1.b[3]\n"
+      "udot z17.s, z7.b, z2.b[3]\n"
+      "udot z21.s, z7.b, z3.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+      "udot z10.s, z6.b, z0.b[3]\n"
+      "udot z14.s, z6.b, z1.b[3]\n"
+      "udot z18.s, z6.b, z2.b[3]\n"
+      "udot z22.s, z6.b, z3.b[3]\n"
+      "udot z11.s, z7.b, z0.b[3]\n"
+      "udot z15.s, z7.b, z1.b[3]\n"
+      "udot z19.s, z7.b, z2.b[3]\n"
+      "udot z23.s, z7.b, z3.b[3]\n"
+      "bgt 45b\n"
+      "46:"  // Height 4: Multiply loop: Single iteration only
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "udot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "udot z9.s, z7.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "udot z12.s, z6.b, z1.b[0]\n"
+      "ld1rqb { z3.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "udot z16.s, z6.b, z2.b[0]\n"
+      "add x24, x24, #0x10\n"
+      "udot z13.s, z7.b, z1.b[0]\n"
+      "udot z17.s, z7.b, z2.b[0]\n"
+      "udot z20.s, z6.b, z3.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "udot z21.s, z7.b, z3.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "udot z10.s, z6.b, z0.b[0]\n"
+      "udot z14.s, z6.b, z1.b[0]\n"
+      "udot z18.s, z6.b, z2.b[0]\n"
+      "udot z22.s, z6.b, z3.b[0]\n"
+      "udot z11.s, z7.b, z0.b[0]\n"
+      "udot z15.s, z7.b, z1.b[0]\n"
+      "udot z19.s, z7.b, z2.b[0]\n"
+      "udot z23.s, z7.b, z3.b[0]\n"
+      "ble 47f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "udot z8.s, z6.b, z0.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "udot z12.s, z6.b, z1.b[1]\n"
+      "udot z16.s, z6.b, z2.b[1]\n"
+      "udot z20.s, z6.b, z3.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[1]\n"
+      "udot z13.s, z7.b, z1.b[1]\n"
+      "udot z17.s, z7.b, z2.b[1]\n"
+      "udot z21.s, z7.b, z3.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "udot z10.s, z6.b, z0.b[1]\n"
+      "udot z14.s, z6.b, z1.b[1]\n"
+      "udot z18.s, z6.b, z2.b[1]\n"
+      "udot z22.s, z6.b, z3.b[1]\n"
+      "udot z11.s, z7.b, z0.b[1]\n"
+      "udot z15.s, z7.b, z1.b[1]\n"
+      "udot z19.s, z7.b, z2.b[1]\n"
+      "udot z23.s, z7.b, z3.b[1]\n"
+      "ble 47f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "udot z8.s, z6.b, z0.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "udot z12.s, z6.b, z1.b[2]\n"
+      "udot z16.s, z6.b, z2.b[2]\n"
+      "udot z20.s, z6.b, z3.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[2]\n"
+      "udot z13.s, z7.b, z1.b[2]\n"
+      "udot z17.s, z7.b, z2.b[2]\n"
+      "udot z21.s, z7.b, z3.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "udot z10.s, z6.b, z0.b[2]\n"
+      "udot z14.s, z6.b, z1.b[2]\n"
+      "udot z18.s, z6.b, z2.b[2]\n"
+      "udot z22.s, z6.b, z3.b[2]\n"
+      "udot z11.s, z7.b, z0.b[2]\n"
+      "udot z15.s, z7.b, z1.b[2]\n"
+      "udot z19.s, z7.b, z2.b[2]\n"
+      "udot z23.s, z7.b, z3.b[2]\n"
+      "ble 47f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "udot z8.s, z6.b, z0.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "udot z12.s, z6.b, z1.b[3]\n"
+      "udot z16.s, z6.b, z2.b[3]\n"
+      "udot z20.s, z6.b, z3.b[3]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[3]\n"
+      "udot z13.s, z7.b, z1.b[3]\n"
+      "udot z17.s, z7.b, z2.b[3]\n"
+      "udot z21.s, z7.b, z3.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "udot z10.s, z6.b, z0.b[3]\n"
+      "udot z14.s, z6.b, z1.b[3]\n"
+      "udot z18.s, z6.b, z2.b[3]\n"
+      "udot z22.s, z6.b, z3.b[3]\n"
+      "udot z11.s, z7.b, z0.b[3]\n"
+      "udot z15.s, z7.b, z1.b[3]\n"
+      "udot z19.s, z7.b, z2.b[3]\n"
+      "udot z23.s, z7.b, z3.b[3]\n"
+      "47:"  // Height 4: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 42b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "st1w { z8.s }, p4, [x13]\n"
+      "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+      "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+      "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "st1w { z12.s }, p4, [x9]\n"
+      "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+      "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+      "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+      "addvl x9, x9, #4\n"
+      "st1w { z16.s }, p4, [x27]\n"
+      "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+      "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+      "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+      "addvl x27, x27, #4\n"
+      "st1w { z20.s }, p4, [x25]\n"
+      "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+      "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+      "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+      "addvl x25, x25, #4\n"
+      "48:"  // Height 4: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x15, x15, x19\n"
+      "bgt 39b\n"
+      "b 74f\n"
+      "49:"  // Height 5
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 50f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "ldr x23, [%x[output_ptr], #0x20]\n"
+      "add x27, x27, x19, LSL #2\n"
+      "add x25, x25, x19, LSL #2\n"
+      "add x23, x23, x19, LSL #2\n"
+      "b 51f\n"
+      "50:"  // Height 5: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "add x23, x25, x19, LSL #2\n"
+      "51:"  // Height 5: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p3.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p2.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p1.s, x19, x15\n"
+      "tbz %x[flags], #0, 52f\n"
+      "ld1w { z8.s }, p4/Z, [x13]\n"
+      "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x9]\n"
+      "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x27]\n"
+      "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x25]\n"
+      "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+      "ld1w { z24.s }, p4/Z, [x23]\n"
+      "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "b 53f\n"
+      "52:"  // Height 5: no accumulate
+      "mov z8.s, #0x0\n"
+      "mov z9.s, #0x0\n"
+      "mov z10.s, #0x0\n"
+      "mov z11.s, #0x0\n"
+      "mov z12.s, #0x0\n"
+      "mov z13.s, #0x0\n"
+      "mov z14.s, #0x0\n"
+      "mov z15.s, #0x0\n"
+      "mov z16.s, #0x0\n"
+      "mov z17.s, #0x0\n"
+      "mov z18.s, #0x0\n"
+      "mov z19.s, #0x0\n"
+      "mov z20.s, #0x0\n"
+      "mov z21.s, #0x0\n"
+      "mov z22.s, #0x0\n"
+      "mov z23.s, #0x0\n"
+      "mov z24.s, #0x0\n"
+      "mov z25.s, #0x0\n"
+      "mov z26.s, #0x0\n"
+      "mov z27.s, #0x0\n"
+      "53:"  // Height 5: setup done
+      "mov x12, #0x0\n"
+      "54:"  // Height 5: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 55f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "cbnz x12, 56f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "add x22, x22, x19\n"
+      "b 56f\n"
+      "55:"  // Height 5: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "add x26, x28, x19\n"
+      "add x24, x26, x19\n"
+      "add x22, x24, x19\n"
+      "56:"  // Height 5: input setup done
+      "cmp x11, #0x10\n"
+      "ble 58f\n"
+      "57:"  // Height 5: Multiply loop: Main loop head
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "sub x11, x11, #0x10\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "udot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "udot z9.s, z7.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "udot z12.s, z6.b, z1.b[0]\n"
+      "ld1rqb { z3.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "udot z16.s, z6.b, z2.b[0]\n"
+      "ld1rqb { z4.b }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "udot z13.s, z7.b, z1.b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x22, x22, #0x10\n"
+      "udot z20.s, z6.b, z3.b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "cmp x11, #0x10\n"
+      "udot z24.s, z6.b, z4.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "udot z17.s, z7.b, z2.b[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "udot z21.s, z7.b, z3.b[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "udot z25.s, z7.b, z4.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "udot z10.s, z6.b, z0.b[0]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "udot z14.s, z6.b, z1.b[0]\n"
+      "udot z18.s, z6.b, z2.b[0]\n"
+      "udot z22.s, z6.b, z3.b[0]\n"
+      "udot z26.s, z6.b, z4.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+      "udot z11.s, z7.b, z0.b[0]\n"
+      "udot z15.s, z7.b, z1.b[0]\n"
+      "udot z19.s, z7.b, z2.b[0]\n"
+      "udot z23.s, z7.b, z3.b[0]\n"
+      "udot z27.s, z7.b, z4.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+      "udot z8.s, z6.b, z0.b[1]\n"
+      "udot z12.s, z6.b, z1.b[1]\n"
+      "udot z16.s, z6.b, z2.b[1]\n"
+      "udot z20.s, z6.b, z3.b[1]\n"
+      "udot z24.s, z6.b, z4.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[1]\n"
+      "udot z13.s, z7.b, z1.b[1]\n"
+      "udot z17.s, z7.b, z2.b[1]\n"
+      "udot z21.s, z7.b, z3.b[1]\n"
+      "udot z25.s, z7.b, z4.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+      "addvl x14, x14, #16\n"
+      "udot z10.s, z6.b, z0.b[1]\n"
+      "udot z14.s, z6.b, z1.b[1]\n"
+      "udot z18.s, z6.b, z2.b[1]\n"
+      "udot z22.s, z6.b, z3.b[1]\n"
+      "udot z26.s, z6.b, z4.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+      "udot z11.s, z7.b, z0.b[1]\n"
+      "udot z15.s, z7.b, z1.b[1]\n"
+      "udot z19.s, z7.b, z2.b[1]\n"
+      "udot z23.s, z7.b, z3.b[1]\n"
+      "udot z27.s, z7.b, z4.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+      "udot z8.s, z6.b, z0.b[2]\n"
+      "udot z12.s, z6.b, z1.b[2]\n"
+      "udot z16.s, z6.b, z2.b[2]\n"
+      "udot z20.s, z6.b, z3.b[2]\n"
+      "udot z24.s, z6.b, z4.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[2]\n"
+      "udot z13.s, z7.b, z1.b[2]\n"
+      "udot z17.s, z7.b, z2.b[2]\n"
+      "udot z21.s, z7.b, z3.b[2]\n"
+      "udot z25.s, z7.b, z4.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+      "udot z10.s, z6.b, z0.b[2]\n"
+      "udot z14.s, z6.b, z1.b[2]\n"
+      "udot z18.s, z6.b, z2.b[2]\n"
+      "udot z22.s, z6.b, z3.b[2]\n"
+      "udot z26.s, z6.b, z4.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+      "udot z11.s, z7.b, z0.b[2]\n"
+      "udot z15.s, z7.b, z1.b[2]\n"
+      "udot z19.s, z7.b, z2.b[2]\n"
+      "udot z23.s, z7.b, z3.b[2]\n"
+      "udot z27.s, z7.b, z4.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+      "udot z8.s, z6.b, z0.b[3]\n"
+      "udot z12.s, z6.b, z1.b[3]\n"
+      "udot z16.s, z6.b, z2.b[3]\n"
+      "udot z20.s, z6.b, z3.b[3]\n"
+      "udot z24.s, z6.b, z4.b[3]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[3]\n"
+      "udot z13.s, z7.b, z1.b[3]\n"
+      "udot z17.s, z7.b, z2.b[3]\n"
+      "udot z21.s, z7.b, z3.b[3]\n"
+      "udot z25.s, z7.b, z4.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+      "udot z10.s, z6.b, z0.b[3]\n"
+      "udot z14.s, z6.b, z1.b[3]\n"
+      "udot z18.s, z6.b, z2.b[3]\n"
+      "udot z22.s, z6.b, z3.b[3]\n"
+      "udot z26.s, z6.b, z4.b[3]\n"
+      "udot z11.s, z7.b, z0.b[3]\n"
+      "udot z15.s, z7.b, z1.b[3]\n"
+      "udot z19.s, z7.b, z2.b[3]\n"
+      "udot z23.s, z7.b, z3.b[3]\n"
+      "udot z27.s, z7.b, z4.b[3]\n"
+      "bgt 57b\n"
+      "58:"  // Height 5: Multiply loop: Single iteration only
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "udot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "udot z9.s, z7.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "udot z12.s, z6.b, z1.b[0]\n"
+      "ld1rqb { z3.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "udot z16.s, z6.b, z2.b[0]\n"
+      "ld1rqb { z4.b }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "udot z13.s, z7.b, z1.b[0]\n"
+      "add x22, x22, #0x10\n"
+      "udot z17.s, z7.b, z2.b[0]\n"
+      "udot z20.s, z6.b, z3.b[0]\n"
+      "udot z24.s, z6.b, z4.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "udot z21.s, z7.b, z3.b[0]\n"
+      "udot z25.s, z7.b, z4.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "udot z10.s, z6.b, z0.b[0]\n"
+      "udot z14.s, z6.b, z1.b[0]\n"
+      "udot z18.s, z6.b, z2.b[0]\n"
+      "udot z22.s, z6.b, z3.b[0]\n"
+      "udot z26.s, z6.b, z4.b[0]\n"
+      "udot z11.s, z7.b, z0.b[0]\n"
+      "udot z15.s, z7.b, z1.b[0]\n"
+      "udot z19.s, z7.b, z2.b[0]\n"
+      "udot z23.s, z7.b, z3.b[0]\n"
+      "udot z27.s, z7.b, z4.b[0]\n"
+      "ble 59f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "udot z8.s, z6.b, z0.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "udot z12.s, z6.b, z1.b[1]\n"
+      "udot z16.s, z6.b, z2.b[1]\n"
+      "udot z20.s, z6.b, z3.b[1]\n"
+      "udot z24.s, z6.b, z4.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[1]\n"
+      "udot z13.s, z7.b, z1.b[1]\n"
+      "udot z17.s, z7.b, z2.b[1]\n"
+      "udot z21.s, z7.b, z3.b[1]\n"
+      "udot z25.s, z7.b, z4.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "udot z10.s, z6.b, z0.b[1]\n"
+      "udot z14.s, z6.b, z1.b[1]\n"
+      "udot z18.s, z6.b, z2.b[1]\n"
+      "udot z22.s, z6.b, z3.b[1]\n"
+      "udot z26.s, z6.b, z4.b[1]\n"
+      "udot z11.s, z7.b, z0.b[1]\n"
+      "udot z15.s, z7.b, z1.b[1]\n"
+      "udot z19.s, z7.b, z2.b[1]\n"
+      "udot z23.s, z7.b, z3.b[1]\n"
+      "udot z27.s, z7.b, z4.b[1]\n"
+      "ble 59f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "udot z8.s, z6.b, z0.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "udot z12.s, z6.b, z1.b[2]\n"
+      "udot z16.s, z6.b, z2.b[2]\n"
+      "udot z20.s, z6.b, z3.b[2]\n"
+      "udot z24.s, z6.b, z4.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[2]\n"
+      "udot z13.s, z7.b, z1.b[2]\n"
+      "udot z17.s, z7.b, z2.b[2]\n"
+      "udot z21.s, z7.b, z3.b[2]\n"
+      "udot z25.s, z7.b, z4.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "udot z10.s, z6.b, z0.b[2]\n"
+      "udot z14.s, z6.b, z1.b[2]\n"
+      "udot z18.s, z6.b, z2.b[2]\n"
+      "udot z22.s, z6.b, z3.b[2]\n"
+      "udot z26.s, z6.b, z4.b[2]\n"
+      "udot z11.s, z7.b, z0.b[2]\n"
+      "udot z15.s, z7.b, z1.b[2]\n"
+      "udot z19.s, z7.b, z2.b[2]\n"
+      "udot z23.s, z7.b, z3.b[2]\n"
+      "udot z27.s, z7.b, z4.b[2]\n"
+      "ble 59f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "udot z8.s, z6.b, z0.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "udot z12.s, z6.b, z1.b[3]\n"
+      "udot z16.s, z6.b, z2.b[3]\n"
+      "udot z20.s, z6.b, z3.b[3]\n"
+      "udot z24.s, z6.b, z4.b[3]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[3]\n"
+      "udot z13.s, z7.b, z1.b[3]\n"
+      "udot z17.s, z7.b, z2.b[3]\n"
+      "udot z21.s, z7.b, z3.b[3]\n"
+      "udot z25.s, z7.b, z4.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "udot z10.s, z6.b, z0.b[3]\n"
+      "udot z14.s, z6.b, z1.b[3]\n"
+      "udot z18.s, z6.b, z2.b[3]\n"
+      "udot z22.s, z6.b, z3.b[3]\n"
+      "udot z26.s, z6.b, z4.b[3]\n"
+      "udot z11.s, z7.b, z0.b[3]\n"
+      "udot z15.s, z7.b, z1.b[3]\n"
+      "udot z19.s, z7.b, z2.b[3]\n"
+      "udot z23.s, z7.b, z3.b[3]\n"
+      "udot z27.s, z7.b, z4.b[3]\n"
+      "59:"  // Height 5: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 54b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "st1w { z8.s }, p4, [x13]\n"
+      "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+      "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+      "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "st1w { z12.s }, p4, [x9]\n"
+      "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+      "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+      "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+      "addvl x9, x9, #4\n"
+      "st1w { z16.s }, p4, [x27]\n"
+      "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+      "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+      "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+      "addvl x27, x27, #4\n"
+      "st1w { z20.s }, p4, [x25]\n"
+      "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+      "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+      "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+      "addvl x25, x25, #4\n"
+      "st1w { z24.s }, p4, [x23]\n"
+      "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
+      "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
+      "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+      "addvl x23, x23, #4\n"
+      "60:"  // Height 5: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x15, x15, x19\n"
+      "bgt 51b\n"
+      "b 74f\n"
+      "61:"  // Height 6
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 62f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "ldr x9, [%x[output_ptr], #0x8]\n"
+      "ldr x27, [%x[output_ptr], #0x10]\n"
+      "add x9, x9, x19, LSL #2\n"
+      "ldr x25, [%x[output_ptr], #0x18]\n"
+      "ldr x23, [%x[output_ptr], #0x20]\n"
+      "add x27, x27, x19, LSL #2\n"
+      "ldr x21, [%x[output_ptr], #0x28]\n"
+      "add %x[output_ptr], %x[output_ptr], #0x30\n"
+      "add x25, x25, x19, LSL #2\n"
+      "add x23, x23, x19, LSL #2\n"
+      "add x21, x21, x19, LSL #2\n"
+      "b 63f\n"
+      "62:"  // Height 6: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "add x9, x13, x19, LSL #2\n"
+      "add x27, x9, x19, LSL #2\n"
+      "add x25, x27, x19, LSL #2\n"
+      "add x23, x25, x19, LSL #2\n"
+      "add x21, x23, x19, LSL #2\n"
+      "add %x[output_ptr], x21, x19, LSL #2\n"
+      "63:"  // Height 6: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p3.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p2.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p1.s, x19, x15\n"
+      "tbz %x[flags], #0, 64f\n"
+      "ld1w { z8.s }, p4/Z, [x13]\n"
+      "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+      "ld1w { z12.s }, p4/Z, [x9]\n"
+      "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+      "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+      "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+      "ld1w { z16.s }, p4/Z, [x27]\n"
+      "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+      "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+      "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+      "ld1w { z20.s }, p4/Z, [x25]\n"
+      "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+      "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+      "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+      "ld1w { z24.s }, p4/Z, [x23]\n"
+      "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
+      "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
+      "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
+      "ld1w { z28.s }, p4/Z, [x21]\n"
+      "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
+      "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
+      "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
+      "b 65f\n"
+      "64:"  // Height 6: no accumulate
+      "mov z8.s, #0x0\n"
+      "mov z9.s, #0x0\n"
+      "mov z10.s, #0x0\n"
+      "mov z11.s, #0x0\n"
+      "mov z12.s, #0x0\n"
+      "mov z13.s, #0x0\n"
+      "mov z14.s, #0x0\n"
+      "mov z15.s, #0x0\n"
+      "mov z16.s, #0x0\n"
+      "mov z17.s, #0x0\n"
+      "mov z18.s, #0x0\n"
+      "mov z19.s, #0x0\n"
+      "mov z20.s, #0x0\n"
+      "mov z21.s, #0x0\n"
+      "mov z22.s, #0x0\n"
+      "mov z23.s, #0x0\n"
+      "mov z24.s, #0x0\n"
+      "mov z25.s, #0x0\n"
+      "mov z26.s, #0x0\n"
+      "mov z27.s, #0x0\n"
+      "mov z28.s, #0x0\n"
+      "mov z29.s, #0x0\n"
+      "mov z30.s, #0x0\n"
+      "mov z31.s, #0x0\n"
+      "65:"  // Height 6: setup done
+      "mov x12, #0x0\n"
+      "66:"  // Height 6: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 67f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "ldr x28, [x20, #0x8]\n"
+      "ldr x26, [x20, #0x10]\n"
+      "ldr x24, [x20, #0x18]\n"
+      "ldr x22, [x20, #0x20]\n"
+      "ldr x20, [x20, #0x28]\n"
+      "cbnz x12, 68f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "add x28, x28, x19\n"
+      "add x26, x26, x19\n"
+      "add x24, x24, x19\n"
+      "add x22, x22, x19\n"
+      "add x20, x20, x19\n"
+      "b 68f\n"
+      "67:"  // Height 6: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "add x28, x10, x19\n"
+      "add x26, x28, x19\n"
+      "add x24, x26, x19\n"
+      "add x22, x24, x19\n"
+      "add x20, x22, x19\n"
+      "68:"  // Height 6: input setup done
+      "cmp x11, #0x10\n"
+      "ble 70f\n"
+      "69:"  // Height 6: Multiply loop: Main loop head
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "sub x11, x11, #0x10\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "udot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "udot z9.s, z7.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "udot z12.s, z6.b, z1.b[0]\n"
+      "ld1rqb { z3.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "udot z16.s, z6.b, z2.b[0]\n"
+      "ld1rqb { z4.b }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "udot z13.s, z7.b, z1.b[0]\n"
+      "ld1rqb { z5.b }, p0/Z, [x20]\n"
+      "add x22, x22, #0x10\n"
+      "udot z20.s, z6.b, z3.b[0]\n"
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x20, x20, #0x10\n"
+      "udot z24.s, z6.b, z4.b[0]\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "cmp x11, #0x10\n"
+      "udot z28.s, z6.b, z5.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "udot z17.s, z7.b, z2.b[0]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "udot z21.s, z7.b, z3.b[0]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "udot z25.s, z7.b, z4.b[0]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "udot z29.s, z7.b, z5.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "udot z10.s, z6.b, z0.b[0]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "udot z14.s, z6.b, z1.b[0]\n"
+      "udot z18.s, z6.b, z2.b[0]\n"
+      "udot z22.s, z6.b, z3.b[0]\n"
+      "udot z26.s, z6.b, z4.b[0]\n"
+      "udot z30.s, z6.b, z5.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+      "udot z11.s, z7.b, z0.b[0]\n"
+      "udot z15.s, z7.b, z1.b[0]\n"
+      "udot z19.s, z7.b, z2.b[0]\n"
+      "udot z23.s, z7.b, z3.b[0]\n"
+      "udot z27.s, z7.b, z4.b[0]\n"
+      "udot z31.s, z7.b, z5.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+      "udot z8.s, z6.b, z0.b[1]\n"
+      "udot z12.s, z6.b, z1.b[1]\n"
+      "udot z16.s, z6.b, z2.b[1]\n"
+      "udot z20.s, z6.b, z3.b[1]\n"
+      "udot z24.s, z6.b, z4.b[1]\n"
+      "udot z28.s, z6.b, z5.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[1]\n"
+      "udot z13.s, z7.b, z1.b[1]\n"
+      "udot z17.s, z7.b, z2.b[1]\n"
+      "udot z21.s, z7.b, z3.b[1]\n"
+      "udot z25.s, z7.b, z4.b[1]\n"
+      "udot z29.s, z7.b, z5.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+      "addvl x14, x14, #16\n"
+      "udot z10.s, z6.b, z0.b[1]\n"
+      "udot z14.s, z6.b, z1.b[1]\n"
+      "udot z18.s, z6.b, z2.b[1]\n"
+      "udot z22.s, z6.b, z3.b[1]\n"
+      "udot z26.s, z6.b, z4.b[1]\n"
+      "udot z30.s, z6.b, z5.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+      "udot z11.s, z7.b, z0.b[1]\n"
+      "udot z15.s, z7.b, z1.b[1]\n"
+      "udot z19.s, z7.b, z2.b[1]\n"
+      "udot z23.s, z7.b, z3.b[1]\n"
+      "udot z27.s, z7.b, z4.b[1]\n"
+      "udot z31.s, z7.b, z5.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+      "udot z8.s, z6.b, z0.b[2]\n"
+      "udot z12.s, z6.b, z1.b[2]\n"
+      "udot z16.s, z6.b, z2.b[2]\n"
+      "udot z20.s, z6.b, z3.b[2]\n"
+      "udot z24.s, z6.b, z4.b[2]\n"
+      "udot z28.s, z6.b, z5.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[2]\n"
+      "udot z13.s, z7.b, z1.b[2]\n"
+      "udot z17.s, z7.b, z2.b[2]\n"
+      "udot z21.s, z7.b, z3.b[2]\n"
+      "udot z25.s, z7.b, z4.b[2]\n"
+      "udot z29.s, z7.b, z5.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+      "udot z10.s, z6.b, z0.b[2]\n"
+      "udot z14.s, z6.b, z1.b[2]\n"
+      "udot z18.s, z6.b, z2.b[2]\n"
+      "udot z22.s, z6.b, z3.b[2]\n"
+      "udot z26.s, z6.b, z4.b[2]\n"
+      "udot z30.s, z6.b, z5.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+      "udot z11.s, z7.b, z0.b[2]\n"
+      "udot z15.s, z7.b, z1.b[2]\n"
+      "udot z19.s, z7.b, z2.b[2]\n"
+      "udot z23.s, z7.b, z3.b[2]\n"
+      "udot z27.s, z7.b, z4.b[2]\n"
+      "udot z31.s, z7.b, z5.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+      "udot z8.s, z6.b, z0.b[3]\n"
+      "udot z12.s, z6.b, z1.b[3]\n"
+      "udot z16.s, z6.b, z2.b[3]\n"
+      "udot z20.s, z6.b, z3.b[3]\n"
+      "udot z24.s, z6.b, z4.b[3]\n"
+      "udot z28.s, z6.b, z5.b[3]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[3]\n"
+      "udot z13.s, z7.b, z1.b[3]\n"
+      "udot z17.s, z7.b, z2.b[3]\n"
+      "udot z21.s, z7.b, z3.b[3]\n"
+      "udot z25.s, z7.b, z4.b[3]\n"
+      "udot z29.s, z7.b, z5.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+      "udot z10.s, z6.b, z0.b[3]\n"
+      "udot z14.s, z6.b, z1.b[3]\n"
+      "udot z18.s, z6.b, z2.b[3]\n"
+      "udot z22.s, z6.b, z3.b[3]\n"
+      "udot z26.s, z6.b, z4.b[3]\n"
+      "udot z30.s, z6.b, z5.b[3]\n"
+      "udot z11.s, z7.b, z0.b[3]\n"
+      "udot z15.s, z7.b, z1.b[3]\n"
+      "udot z19.s, z7.b, z2.b[3]\n"
+      "udot z23.s, z7.b, z3.b[3]\n"
+      "udot z27.s, z7.b, z4.b[3]\n"
+      "udot z31.s, z7.b, z5.b[3]\n"
+      "bgt 69b\n"
+      "70:"  // Height 6: Multiply loop: Single iteration only
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "udot z8.s, z6.b, z0.b[0]\n"
+      "ld1rqb { z1.b }, p0/Z, [x28]\n"
+      "add x10, x10, #0x10\n"
+      "udot z9.s, z7.b, z0.b[0]\n"
+      "ld1rqb { z2.b }, p0/Z, [x26]\n"
+      "add x28, x28, #0x10\n"
+      "udot z12.s, z6.b, z1.b[0]\n"
+      "ld1rqb { z3.b }, p0/Z, [x24]\n"
+      "add x26, x26, #0x10\n"
+      "udot z16.s, z6.b, z2.b[0]\n"
+      "ld1rqb { z4.b }, p0/Z, [x22]\n"
+      "add x24, x24, #0x10\n"
+      "udot z13.s, z7.b, z1.b[0]\n"
+      "ld1rqb { z5.b }, p0/Z, [x20]\n"
+      "add x22, x22, #0x10\n"
+      "udot z20.s, z6.b, z3.b[0]\n"
+      "add x20, x20, #0x10\n"
+      "udot z17.s, z7.b, z2.b[0]\n"
+      "udot z24.s, z6.b, z4.b[0]\n"
+      "udot z28.s, z6.b, z5.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "udot z21.s, z7.b, z3.b[0]\n"
+      "udot z25.s, z7.b, z4.b[0]\n"
+      "udot z29.s, z7.b, z5.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "udot z10.s, z6.b, z0.b[0]\n"
+      "udot z14.s, z6.b, z1.b[0]\n"
+      "udot z18.s, z6.b, z2.b[0]\n"
+      "udot z22.s, z6.b, z3.b[0]\n"
+      "udot z26.s, z6.b, z4.b[0]\n"
+      "udot z30.s, z6.b, z5.b[0]\n"
+      "udot z11.s, z7.b, z0.b[0]\n"
+      "udot z15.s, z7.b, z1.b[0]\n"
+      "udot z19.s, z7.b, z2.b[0]\n"
+      "udot z23.s, z7.b, z3.b[0]\n"
+      "udot z27.s, z7.b, z4.b[0]\n"
+      "udot z31.s, z7.b, z5.b[0]\n"
+      "ble 71f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "udot z8.s, z6.b, z0.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "udot z12.s, z6.b, z1.b[1]\n"
+      "udot z16.s, z6.b, z2.b[1]\n"
+      "udot z20.s, z6.b, z3.b[1]\n"
+      "udot z24.s, z6.b, z4.b[1]\n"
+      "udot z28.s, z6.b, z5.b[1]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[1]\n"
+      "udot z13.s, z7.b, z1.b[1]\n"
+      "udot z17.s, z7.b, z2.b[1]\n"
+      "udot z21.s, z7.b, z3.b[1]\n"
+      "udot z25.s, z7.b, z4.b[1]\n"
+      "udot z29.s, z7.b, z5.b[1]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "udot z10.s, z6.b, z0.b[1]\n"
+      "udot z14.s, z6.b, z1.b[1]\n"
+      "udot z18.s, z6.b, z2.b[1]\n"
+      "udot z22.s, z6.b, z3.b[1]\n"
+      "udot z26.s, z6.b, z4.b[1]\n"
+      "udot z30.s, z6.b, z5.b[1]\n"
+      "udot z11.s, z7.b, z0.b[1]\n"
+      "udot z15.s, z7.b, z1.b[1]\n"
+      "udot z19.s, z7.b, z2.b[1]\n"
+      "udot z23.s, z7.b, z3.b[1]\n"
+      "udot z27.s, z7.b, z4.b[1]\n"
+      "udot z31.s, z7.b, z5.b[1]\n"
+      "ble 71f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "udot z8.s, z6.b, z0.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "subs x11, x11, #0x4\n"
+      "udot z12.s, z6.b, z1.b[2]\n"
+      "udot z16.s, z6.b, z2.b[2]\n"
+      "udot z20.s, z6.b, z3.b[2]\n"
+      "udot z24.s, z6.b, z4.b[2]\n"
+      "udot z28.s, z6.b, z5.b[2]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[2]\n"
+      "udot z13.s, z7.b, z1.b[2]\n"
+      "udot z17.s, z7.b, z2.b[2]\n"
+      "udot z21.s, z7.b, z3.b[2]\n"
+      "udot z25.s, z7.b, z4.b[2]\n"
+      "udot z29.s, z7.b, z5.b[2]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "udot z10.s, z6.b, z0.b[2]\n"
+      "udot z14.s, z6.b, z1.b[2]\n"
+      "udot z18.s, z6.b, z2.b[2]\n"
+      "udot z22.s, z6.b, z3.b[2]\n"
+      "udot z26.s, z6.b, z4.b[2]\n"
+      "udot z30.s, z6.b, z5.b[2]\n"
+      "udot z11.s, z7.b, z0.b[2]\n"
+      "udot z15.s, z7.b, z1.b[2]\n"
+      "udot z19.s, z7.b, z2.b[2]\n"
+      "udot z23.s, z7.b, z3.b[2]\n"
+      "udot z27.s, z7.b, z4.b[2]\n"
+      "udot z31.s, z7.b, z5.b[2]\n"
+      "ble 71f\n"
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "udot z8.s, z6.b, z0.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "udot z12.s, z6.b, z1.b[3]\n"
+      "udot z16.s, z6.b, z2.b[3]\n"
+      "udot z20.s, z6.b, z3.b[3]\n"
+      "udot z24.s, z6.b, z4.b[3]\n"
+      "udot z28.s, z6.b, z5.b[3]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "udot z9.s, z7.b, z0.b[3]\n"
+      "udot z13.s, z7.b, z1.b[3]\n"
+      "udot z17.s, z7.b, z2.b[3]\n"
+      "udot z21.s, z7.b, z3.b[3]\n"
+      "udot z25.s, z7.b, z4.b[3]\n"
+      "udot z29.s, z7.b, z5.b[3]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "addvl x14, x14, #4\n"
+      "udot z10.s, z6.b, z0.b[3]\n"
+      "udot z14.s, z6.b, z1.b[3]\n"
+      "udot z18.s, z6.b, z2.b[3]\n"
+      "udot z22.s, z6.b, z3.b[3]\n"
+      "udot z26.s, z6.b, z4.b[3]\n"
+      "udot z30.s, z6.b, z5.b[3]\n"
+      "udot z11.s, z7.b, z0.b[3]\n"
+      "udot z15.s, z7.b, z1.b[3]\n"
+      "udot z19.s, z7.b, z2.b[3]\n"
+      "udot z23.s, z7.b, z3.b[3]\n"
+      "udot z27.s, z7.b, z4.b[3]\n"
+      "udot z31.s, z7.b, z5.b[3]\n"
+      "71:"  // Height 6: Multiply loop: multiply skip
+      "prfm pldl1keep, [x10, #0x80]\n"
+      "add x12, x12, #0x1\n"
+      "prfm pldl1keep, [x28, #0x80]\n"
+      "prfm pldl1keep, [x26, #0x80]\n"
+      "prfm pldl1keep, [x24, #0x80]\n"
+      "prfm pldl1keep, [x22, #0x80]\n"
+      "prfm pldl1keep, [x20, #0x80]\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "cmp x12, x19\n"
+      "bne 66b\n"
+      "prfm pstl1keep, [x13, #0x0]\n"
+      "prfm pstl1keep, [x9, #0x0]\n"
+      "prfm pstl1keep, [x27, #0x0]\n"
+      "prfm pstl1keep, [x25, #0x0]\n"
+      "prfm pstl1keep, [x23, #0x0]\n"
+      "prfm pstl1keep, [x21, #0x0]\n"
+      "st1w { z8.s }, p4, [x13]\n"
+      "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+      "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+      "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+      "addvl x13, x13, #4\n"
+      "st1w { z12.s }, p4, [x9]\n"
+      "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+      "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+      "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+      "addvl x9, x9, #4\n"
+      "st1w { z16.s }, p4, [x27]\n"
+      "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+      "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+      "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+      "addvl x27, x27, #4\n"
+      "st1w { z20.s }, p4, [x25]\n"
+      "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+      "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+      "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+      "addvl x25, x25, #4\n"
+      "st1w { z24.s }, p4, [x23]\n"
+      "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
+      "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
+      "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+      "addvl x23, x23, #4\n"
+      "st1w { z28.s }, p4, [x21]\n"
+      "st1w { z29.s }, p3, [x21, #1, MUL VL]\n"
+      "st1w { z30.s }, p2, [x21, #2, MUL VL]\n"
+      "st1w { z31.s }, p1, [x21, #3, MUL VL]\n"
+      "addvl x21, x21, #4\n"
+      "72:"  // Height 6: Writeback done
+      "mov x19, #0x0\n"
+      "incw x19, ALL, MUL #4\n"
+      "subs x15, x15, x19\n"
+      "bgt 63b\n"
+      "subs %x[M], %x[M], #0x6\n"
+      "beq 74f\n"
+      "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "tbz %x[flags], #3, 73f\n"
+      "add x20, x20, #0x6\n"
+      "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "b 1b\n"
+      "73:"  // Update direct input
+      "mov x19, #0x6\n"
+      "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+      "b 1b\n"
+      "74:"  // Exit
+
+      : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+      : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+      : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+    );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp
index 43107e45fa..12bb758b68 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp
@@ -31,9 +31,9 @@
 namespace arm_gemm {
 
 // Actual kernel implementations
-void sve_interleaved_bf16fp32_dot_3VLx8(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+void sve_interleaved_bf16fp32_dot_8x3VL(const bfloat16 *, const bfloat16 *, float *, int, int, int);
 
-class interleaved_bf16fp32_dot_3VLx8 {
+class cls_sve_interleaved_bf16fp32_dot_8x3VL {
 public:
     typedef bfloat16 operand_type;
     typedef float result_type;
@@ -59,9 +59,9 @@ public:
     // Use the standard fixed size transforms.
     StdTransformsSVE<operand_type, result_type, 8, 3, 2, 1> transforms = {};
 
-    kern_type kernel=sve_interleaved_bf16fp32_dot_3VLx8;
+    kern_type kernel=sve_interleaved_bf16fp32_dot_8x3VL;
 
-    interleaved_bf16fp32_dot_3VLx8(const CPUInfo *)
+    cls_sve_interleaved_bf16fp32_dot_8x3VL(const CPUInfo *)
     {
 
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp
index 7e20ed0971..adee900337 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp
@@ -28,7 +28,7 @@
 
 namespace arm_gemm {
 
-void sve_interleaved_bf16fp32_dot_3VLx8(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void sve_interleaved_bf16fp32_dot_8x3VL(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
     const bfloat16 *a_ptr = Apanel;
     float *c_ptr = Cpanel;
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
index f1353e2086..2889dd7f0f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
@@ -31,9 +31,9 @@
 namespace arm_gemm {
 
 // Actual kernel implementations
-void sve_interleaved_bf16fp32_mmla_3VLx8(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+void sve_interleaved_bf16fp32_mmla_8x3VL(const bfloat16 *, const bfloat16 *, float *, int, int, int);
 
-class interleaved_bf16fp32_mmla_3VLx8 {
+class cls_sve_interleaved_bf16fp32_mmla_8x3VL {
 public:
     typedef bfloat16 operand_type;
     typedef float result_type;
@@ -59,9 +59,9 @@ public:
     // Use the standard fixed size transforms.
     StdTransformsSVE<operand_type, result_type, 8, 6, 4, 2> transforms = {};
 
-    kern_type kernel=sve_interleaved_bf16fp32_mmla_3VLx8;
+    kern_type kernel=sve_interleaved_bf16fp32_mmla_8x3VL;
 
-    interleaved_bf16fp32_mmla_3VLx8(const CPUInfo *)
+    cls_sve_interleaved_bf16fp32_mmla_8x3VL(const CPUInfo *)
     {
 
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp
index 16cc69b2a6..e43404e608 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp
@@ -28,7 +28,7 @@
 
 namespace arm_gemm {
 
-void sve_interleaved_bf16fp32_mmla_3VLx8(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void sve_interleaved_bf16fp32_mmla_8x3VL(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
     const bfloat16 *a_ptr = Apanel;
     float *c_ptr = Cpanel;
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp
index 816c0cd095..eb946d9dfa 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp
@@ -31,9 +31,9 @@
 namespace arm_gemm {
 
 // Actual kernel implementations
-void sve_interleaved_fp16_mla_3VLx8(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+void sve_interleaved_fp16_mla_8x3VL(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
 
-class interleaved_fp16_mla_3VLx8 {
+class cls_sve_interleaved_fp16_mla_8x3VL {
 public:
     typedef __fp16 operand_type;
     typedef __fp16 result_type;
@@ -59,9 +59,9 @@ public:
     // Use the standard fixed size transforms.
     StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1> transforms = {};
 
-    kern_type kernel=sve_interleaved_fp16_mla_3VLx8;
+    kern_type kernel=sve_interleaved_fp16_mla_8x3VL;
 
-    interleaved_fp16_mla_3VLx8(const CPUInfo *)
+    cls_sve_interleaved_fp16_mla_8x3VL(const CPUInfo *)
     {
 
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp
index f2050cbd56..46b8770409 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp
@@ -28,7 +28,7 @@
 
 namespace arm_gemm {
 
-void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
+void sve_interleaved_fp16_mla_8x3VL(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
     const __fp16 *a_ptr = Apanel;
     __fp16 *c_ptr = Cpanel;
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp
index cce90fb135..b84ba83b6a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp
@@ -31,9 +31,9 @@
 namespace arm_gemm {
 
 // Actual kernel implementations
-void sve_interleaved_fp32_mla_3VLx8(const float *, const float *, float *, int, int, int);
+void sve_interleaved_fp32_mla_8x3VL(const float *, const float *, float *, int, int, int);
 
-class interleaved_fp32_mla_3VLx8 {
+class cls_sve_interleaved_fp32_mla_8x3VL {
 public:
     typedef float operand_type;
     typedef float result_type;
@@ -59,9 +59,9 @@ public:
     // Use the standard fixed size transforms.
     StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1> transforms = {};
 
-    kern_type kernel=sve_interleaved_fp32_mla_3VLx8;
+    kern_type kernel=sve_interleaved_fp32_mla_8x3VL;
 
-    interleaved_fp32_mla_3VLx8(const CPUInfo *)
+    cls_sve_interleaved_fp32_mla_8x3VL(const CPUInfo *)
     {
 
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp
index cd178c478a..1e05a308b5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp
@@ -28,7 +28,7 @@
 
 namespace arm_gemm {
 
-void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void sve_interleaved_fp32_mla_8x3VL(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
     const float *a_ptr = Apanel;
     float *c_ptr = Cpanel;
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL.hpp
index 4ca43cd5c9..96216960ff 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL.hpp
@@ -31,9 +31,9 @@
 namespace arm_gemm {
 
 // Actual kernel implementations
-void sve_interleaved_fp32_mmla_3VLx8(const float *, const float *, float *, int, int, int);
+void sve_interleaved_fp32_mmla_8x3VL(const float *, const float *, float *, int, int, int);
 
-class interleaved_fp32_mmla_3VLx8 {
+class cls_sve_interleaved_fp32_mmla_8x3VL {
 public:
     typedef float operand_type;
     typedef float result_type;
@@ -59,9 +59,9 @@ public:
     // Use the standard fixed size transforms.
     StdTransformsSVE<operand_type, result_type, 8, 6, 2, 2> transforms = {};
 
-    kern_type kernel=sve_interleaved_fp32_mmla_3VLx8;
+    kern_type kernel=sve_interleaved_fp32_mmla_8x3VL;
 
-    interleaved_fp32_mmla_3VLx8(const CPUInfo *)
+    cls_sve_interleaved_fp32_mmla_8x3VL(const CPUInfo *)
     {
 
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp
index a404ae9c82..39daf0ff20 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp
@@ -28,7 +28,7 @@
 
 namespace arm_gemm {
 
-void sve_interleaved_fp32_mmla_3VLx8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void sve_interleaved_fp32_mmla_8x3VL(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
     const float *a_ptr = Apanel;
     float *c_ptr = Cpanel;
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
index e40ba215b4..3e16915cd4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
@@ -31,9 +31,9 @@
 namespace arm_gemm {
 
 // Actual kernel implementations
-void sve_interleaved_s8s32_dot_3VLx8(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void sve_interleaved_s8s32_dot_8x3VL(const int8_t *, const int8_t *, int32_t *, int, int, int);
 
-class interleaved_s8s32_dot_3VLx8 {
+class cls_sve_interleaved_s8s32_dot_8x3VL {
 public:
     typedef int8_t operand_type;
     typedef int32_t result_type;
@@ -58,10 +58,11 @@ public:
 
     // Use the standard fixed size transforms.
     StdTransformsSVE<operand_type, result_type, 8, 3, 4, 1> transforms = {};
+    StdTransformsSVE<operand_type, result_type, 8, 3, 4, 1, true> transforms_quantized = {};
 
-    kern_type kernel=sve_interleaved_s8s32_dot_3VLx8;
+    kern_type kernel=sve_interleaved_s8s32_dot_8x3VL;
 
-    interleaved_s8s32_dot_3VLx8(const CPUInfo *)
+    cls_sve_interleaved_s8s32_dot_8x3VL(const CPUInfo *)
     {
 
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp
index cdc70705c5..674c2400bf 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp
@@ -28,7 +28,7 @@
 
 namespace arm_gemm {
 
-void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
+void sve_interleaved_s8s32_dot_8x3VL(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
     const int8_t *a_ptr = Apanel;
     int32_t *c_ptr = Cpanel;
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
index 361598d594..02b3451c54 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
@@ -31,9 +31,9 @@
 namespace arm_gemm {
 
 // Actual kernel implementations
-void sve_interleaved_s8s32_mmla_3VLx8(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void sve_interleaved_s8s32_mmla_8x3VL(const int8_t *, const int8_t *, int32_t *, int, int, int);
 
-class interleaved_s8s32_mmla_3VLx8 {
+class cls_sve_interleaved_s8s32_mmla_8x3VL {
 public:
     typedef int8_t operand_type;
     typedef int32_t result_type;
@@ -58,10 +58,11 @@ public:
 
     // Use the standard fixed size transforms.
     StdTransformsSVE<operand_type, result_type, 8, 6, 8, 2> transforms = {};
+    StdTransformsSVE<operand_type, result_type, 8, 6, 8, 2, true> transforms_quantized = {};
 
-    kern_type kernel=sve_interleaved_s8s32_mmla_3VLx8;
+    kern_type kernel=sve_interleaved_s8s32_mmla_8x3VL;
 
-    interleaved_s8s32_mmla_3VLx8(const CPUInfo *)
+    cls_sve_interleaved_s8s32_mmla_8x3VL(const CPUInfo *)
     {
 
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp
index cde9ec32e9..578aa01732 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp
@@ -28,7 +28,7 @@
 
 namespace arm_gemm {
 
-void sve_interleaved_s8s32_mmla_3VLx8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
+void sve_interleaved_s8s32_mmla_8x3VL(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
     const int8_t *a_ptr = Apanel;
     int32_t *c_ptr = Cpanel;
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
index 252f38ec63..832a224199 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
@@ -31,9 +31,9 @@
 namespace arm_gemm {
 
 // Actual kernel implementations
-void sve_interleaved_u8u32_dot_3VLx8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void sve_interleaved_u8u32_dot_8x3VL(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
 
-class interleaved_u8u32_dot_3VLx8 {
+class cls_sve_interleaved_u8u32_dot_8x3VL {
 public:
     typedef uint8_t operand_type;
     typedef uint32_t result_type;
@@ -58,10 +58,11 @@ public:
 
     // Use the standard fixed size transforms.
     StdTransformsSVE<operand_type, result_type, 8, 3, 4, 1> transforms = {};
+    StdTransformsSVE<operand_type, result_type, 8, 3, 4, 1, true> transforms_quantized = {};
 
-    kern_type kernel=sve_interleaved_u8u32_dot_3VLx8;
+    kern_type kernel=sve_interleaved_u8u32_dot_8x3VL;
 
-    interleaved_u8u32_dot_3VLx8(const CPUInfo *)
+    cls_sve_interleaved_u8u32_dot_8x3VL(const CPUInfo *)
     {
 
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp
index 6626f8463b..891869c767 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp
@@ -28,7 +28,7 @@
 
 namespace arm_gemm {
 
-void sve_interleaved_u8u32_dot_3VLx8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+void sve_interleaved_u8u32_dot_8x3VL(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
     const uint8_t *a_ptr = Apanel;
     uint32_t *c_ptr = Cpanel;
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
index ed44a9d8fc..4fdaab84bd 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
@@ -31,9 +31,9 @@
 namespace arm_gemm {
 
 // Actual kernel implementations
-void sve_interleaved_u8u32_mmla_3VLx8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void sve_interleaved_u8u32_mmla_8x3VL(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
 
-class interleaved_u8u32_mmla_3VLx8 {
+class cls_sve_interleaved_u8u32_mmla_8x3VL {
 public:
     typedef uint8_t operand_type;
     typedef uint32_t result_type;
@@ -58,10 +58,11 @@ public:
 
     // Use the standard fixed size transforms.
     StdTransformsSVE<operand_type, result_type, 8, 6, 8, 2> transforms = {};
+    StdTransformsSVE<operand_type, result_type, 8, 6, 8, 2, true> transforms_quantized = {};
 
-    kern_type kernel=sve_interleaved_u8u32_mmla_3VLx8;
+    kern_type kernel=sve_interleaved_u8u32_mmla_8x3VL;
 
-    interleaved_u8u32_mmla_3VLx8(const CPUInfo *)
+    cls_sve_interleaved_u8u32_mmla_8x3VL(const CPUInfo *)
     {
 
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp
index 81a1dbcf51..fa08a9d091 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp
@@ -28,7 +28,7 @@
 
 namespace arm_gemm {
 
-void sve_interleaved_u8u32_mmla_3VLx8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+void sve_interleaved_u8u32_mmla_8x3VL(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
     const uint8_t *a_ptr = Apanel;
     uint32_t *c_ptr = Cpanel;
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL.hpp
index b555066195..2097d76a54 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL.hpp
@@ -31,9 +31,9 @@ namespace arm_gemm
 {
 
 // Actual kernel implementations
-void sve_smallK_hybrid_fp32_mla_1VLx8(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+void sve_smallK_hybrid_fp32_mla_8x1VL(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
 
-class smallK_hybrid_fp32_mla_1VLx8
+class cls_sve_smallK_hybrid_fp32_mla_8x1VL
 {
 public:
     typedef float operand_type;
@@ -75,9 +75,9 @@ public:
     StdTransformsSVE<operand_type, result_type, 8, 1, 1> transforms = {};
 
     // Default to the generic kernel
-    kern_type kernel=sve_smallK_hybrid_fp32_mla_1VLx8;
+    kern_type kernel=sve_smallK_hybrid_fp32_mla_8x1VL;
 
-    smallK_hybrid_fp32_mla_1VLx8(const CPUInfo *)
+    cls_sve_smallK_hybrid_fp32_mla_8x1VL(const CPUInfo *)
     {
 
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp
index 5501688054..e07cfa8218 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,7 +33,7 @@
 
 namespace arm_gemm {
 
-void sve_smallK_hybrid_fp32_mla_1VLx8(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) {
+void sve_smallK_hybrid_fp32_mla_8x1VL(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) {
     const long loops_count = iceildiv(N, (int)get_vector_length<float>()) - 1;
     const long ldab = lda * sizeof(float);
     const long ldcb = ldc * sizeof(float);
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp
index eef1e4cc65..e50c05ba39 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp
@@ -31,9 +31,9 @@ namespace arm_gemm
 {
 
 // Actual kernel implementations
-void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
 
-class smallK_hybrid_s8s32_dot_1VLx8
+class cls_sve_smallK_hybrid_s8s32_dot_8x1VL
 {
 public:
     typedef int8_t operand_type;
@@ -75,9 +75,9 @@ public:
     StdTransformsSVE<operand_type, result_type, 8, 1, 4> transforms = {};
 
     // Default to the generic kernel
-    kern_type kernel=sve_smallK_hybrid_s8s32_dot_1VLx8;
+    kern_type kernel=sve_smallK_hybrid_s8s32_dot_8x1VL;
 
-    smallK_hybrid_s8s32_dot_1VLx8(const CPUInfo *)
+    cls_sve_smallK_hybrid_s8s32_dot_8x1VL(const CPUInfo *)
     {
 
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp
index e2fbdcb61b..5770076d04 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,7 +33,7 @@
 
 namespace arm_gemm {
 
-void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
+void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *bias, Activation act, bool) {
     const long loops_count = iceildiv(N, (int)get_vector_length<int32_t>()) - 1;
     const long ldab = lda * sizeof(int8_t);
     const long ldcb = ldc * sizeof(int32_t);
@@ -112,55 +112,54 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #1\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7]\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "mov z25.s, #0\n"
+                    "mov z26.s, #0\n"
                     "mov z27.s, #0\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
                     "mov z29.s, #0\n"
-                    "ld1rqb z1.b, p6/z, [a_ptr1]\n"
                     "mov z30.s, #0\n"
-                    "ld1rqb z2.b, p6/z, [a_ptr2]\n"
                     "mov z31.s, #0\n"
-                    "ld1rqb z3.b, p6/z, [a_ptr3]\n"
                     "sdot z24.s, z16.b, z0.b[0]\n"
-                    "ld1rqb z4.b, p6/z, [a_ptr4]\n"
                     "sdot z25.s, z16.b, z1.b[0]\n"
-                    "ld1rqb z5.b, p6/z, [a_ptr5]\n"
                     "sdot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p6/z, [a_ptr6]\n"
                     "sdot z27.s, z16.b, z3.b[0]\n"
-                    "ld1rqb z7.b, p6/z, [a_ptr7]\n"
                     "sdot z28.s, z16.b, z4.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #1\n"
-                    "cbz %[loops], 2f\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #1\n"
                     "b.eq 3f\n"
                     "4:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #1\n"
                     "sdot z24.s, z16.b, z0.b[0]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
                     "mov z26.s, #0\n"
@@ -186,10 +185,9 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "mov z31.s, #0\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #1\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
@@ -201,6 +199,8 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #1\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
@@ -230,23 +230,34 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "addvl c_ptr7, c_ptr7, #1\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
+                    "b 5f\n"
                     "2:\n"
+                    "mov z24.s, #0\n"
+                    "mov z25.s, #0\n"
+                    "mov z26.s, #0\n"
+                    "mov z27.s, #0\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z31.s, z16.b, z7.b[0]\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -325,112 +336,112 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #2\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "mov z25.s, #0\n"
+                    "mov z26.s, #0\n"
+                    "mov z27.s, #0\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
                     "mov z29.s, #0\n"
-                    "ld1rqb z1.b, p6/z, [a_ptr1]\n"
                     "mov z30.s, #0\n"
-                    "ld1rqb z2.b, p6/z, [a_ptr2]\n"
                     "mov z31.s, #0\n"
-                    "ld1rqb z3.b, p6/z, [a_ptr3]\n"
                     "sdot z24.s, z16.b, z0.b[0]\n"
-                    "ld1rqb z4.b, p6/z, [a_ptr4]\n"
                     "sdot z25.s, z16.b, z1.b[0]\n"
-                    "ld1rqb z5.b, p6/z, [a_ptr5]\n"
                     "sdot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p6/z, [a_ptr6]\n"
                     "sdot z27.s, z16.b, z3.b[0]\n"
-                    "ld1rqb z7.b, p6/z, [a_ptr7]\n"
                     "sdot z28.s, z16.b, z4.b[0]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "sdot z24.s, z17.b, z0.b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #2\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
                     "sdot z29.s, z17.b, z5.b[1]\n"
                     "sdot z30.s, z17.b, z6.b[1]\n"
                     "sdot z31.s, z17.b, z7.b[1]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
                     "b.eq 3f\n"
                     "4:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #2\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
-                    "mov z25.s, #0\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
+                    "mov z25.s, #0\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #2\n"
                     "sdot z24.s, z16.b, z0.b[0]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
                     "mov z26.s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
                     "sdot z25.s, z16.b, z1.b[0]\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
                     "mov z27.s, #0\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
                     "sdot z26.s, z16.b, z2.b[0]\n"
                     "st1w z28.s, p7, [c_ptr4]\n"
                     "mov z28.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "sdot z27.s, z16.b, z3.b[0]\n"
                     "st1w z29.s, p7, [c_ptr5]\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
                     "sdot z28.s, z16.b, z4.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
-                    "sdot z30.s, z16.b, z6.b[0]\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
-                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
-                    "sdot z26.s, z17.b, z2.b[1]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
-                    "sdot z27.s, z17.b, z3.b[1]\n"
+                    "sdot z26.s, z17.b, z2.b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
-                    "sdot z28.s, z17.b, z4.b[1]\n"
+                    "sdot z27.s, z17.b, z3.b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
-                    "sdot z29.s, z17.b, z5.b[1]\n"
+                    "sdot z28.s, z17.b, z4.b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
-                    "sdot z30.s, z17.b, z6.b[1]\n"
+                    "sdot z29.s, z17.b, z5.b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+                    "sdot z30.s, z17.b, z6.b[1]\n"
                     "sdot z31.s, z17.b, z7.b[1]\n"
                     "b.ne 4b\n"
                     "3:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "addvl %[b_ptr0], %[b_ptr0], #2\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
@@ -470,23 +481,42 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z17.b, z5.b[1]\n"
                     "sdot z30.s, z17.b, z6.b[1]\n"
                     "sdot z31.s, z17.b, z7.b[1]\n"
+                    "b 5f\n"
                     "2:\n"
+                    "mov z24.s, #0\n"
+                    "mov z25.s, #0\n"
+                    "mov z26.s, #0\n"
+                    "mov z27.s, #0\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z31.s, z16.b, z7.b[0]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "sdot z26.s, z17.b, z2.b[1]\n"
+                    "sdot z27.s, z17.b, z3.b[1]\n"
+                    "sdot z28.s, z17.b, z4.b[1]\n"
+                    "sdot z29.s, z17.b, z5.b[1]\n"
+                    "sdot z30.s, z17.b, z6.b[1]\n"
+                    "sdot z31.s, z17.b, z7.b[1]\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -565,48 +595,50 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #3\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "mov z25.s, #0\n"
+                    "mov z26.s, #0\n"
+                    "mov z27.s, #0\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
                     "mov z29.s, #0\n"
-                    "ld1rqb z1.b, p6/z, [a_ptr1]\n"
                     "mov z30.s, #0\n"
-                    "ld1rqb z2.b, p6/z, [a_ptr2]\n"
                     "mov z31.s, #0\n"
-                    "ld1rqb z3.b, p6/z, [a_ptr3]\n"
                     "sdot z24.s, z16.b, z0.b[0]\n"
-                    "ld1rqb z4.b, p6/z, [a_ptr4]\n"
                     "sdot z25.s, z16.b, z1.b[0]\n"
-                    "ld1rqb z5.b, p6/z, [a_ptr5]\n"
                     "sdot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p6/z, [a_ptr6]\n"
                     "sdot z27.s, z16.b, z3.b[0]\n"
-                    "ld1rqb z7.b, p6/z, [a_ptr7]\n"
                     "sdot z28.s, z16.b, z4.b[0]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "sdot z24.s, z17.b, z0.b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #3\n"
                     "sdot z29.s, z17.b, z5.b[1]\n"
                     "sdot z30.s, z17.b, z6.b[1]\n"
                     "sdot z31.s, z17.b, z7.b[1]\n"
@@ -618,49 +650,46 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z18.b, z5.b[2]\n"
                     "sdot z30.s, z18.b, z6.b[2]\n"
                     "sdot z31.s, z18.b, z7.b[2]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "b.eq 3f\n"
                     "4:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "mov z24.s, #0\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #3\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
-                    "mov z25.s, #0\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
+                    "mov z25.s, #0\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "sdot z24.s, z16.b, z0.b[0]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
                     "mov z26.s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
                     "sdot z25.s, z16.b, z1.b[0]\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
                     "mov z27.s, #0\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
                     "sdot z26.s, z16.b, z2.b[0]\n"
                     "st1w z28.s, p7, [c_ptr4]\n"
                     "mov z28.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "sdot z27.s, z16.b, z3.b[0]\n"
                     "st1w z29.s, p7, [c_ptr5]\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
                     "sdot z28.s, z16.b, z4.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
-                    "sdot z30.s, z16.b, z6.b[0]\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #3\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "sdot z24.s, z17.b, z0.b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
@@ -676,7 +705,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z30.s, z17.b, z6.b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "sdot z31.s, z17.b, z7.b[1]\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "sdot z24.s, z18.b, z0.b[2]\n"
                     "sdot z25.s, z18.b, z1.b[2]\n"
                     "sdot z26.s, z18.b, z2.b[2]\n"
@@ -690,11 +718,12 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "mov z24.s, #0\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #3\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "sdot z24.s, z16.b, z0.b[0]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
                     "mov z26.s, #0\n"
@@ -720,8 +749,9 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "mov z31.s, #0\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #3\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
@@ -737,23 +767,50 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z18.b, z5.b[2]\n"
                     "sdot z30.s, z18.b, z6.b[2]\n"
                     "sdot z31.s, z18.b, z7.b[2]\n"
+                    "b 5f\n"
                     "2:\n"
+                    "mov z24.s, #0\n"
+                    "mov z25.s, #0\n"
+                    "mov z26.s, #0\n"
+                    "mov z27.s, #0\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z31.s, z16.b, z7.b[0]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "sdot z26.s, z17.b, z2.b[1]\n"
+                    "sdot z27.s, z17.b, z3.b[1]\n"
+                    "sdot z28.s, z17.b, z4.b[1]\n"
+                    "sdot z29.s, z17.b, z5.b[1]\n"
+                    "sdot z30.s, z17.b, z6.b[1]\n"
+                    "sdot z31.s, z17.b, z7.b[1]\n"
+                    "sdot z24.s, z18.b, z0.b[2]\n"
+                    "sdot z25.s, z18.b, z1.b[2]\n"
+                    "sdot z26.s, z18.b, z2.b[2]\n"
+                    "sdot z27.s, z18.b, z3.b[2]\n"
+                    "sdot z28.s, z18.b, z4.b[2]\n"
+                    "sdot z29.s, z18.b, z5.b[2]\n"
+                    "sdot z30.s, z18.b, z6.b[2]\n"
+                    "sdot z31.s, z18.b, z7.b[2]\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -832,50 +889,52 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "mov z25.s, #0\n"
+                    "mov z26.s, #0\n"
+                    "mov z27.s, #0\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
                     "mov z29.s, #0\n"
-                    "ld1rqb z1.b, p6/z, [a_ptr1]\n"
                     "mov z30.s, #0\n"
-                    "ld1rqb z2.b, p6/z, [a_ptr2]\n"
                     "mov z31.s, #0\n"
-                    "ld1rqb z3.b, p6/z, [a_ptr3]\n"
                     "sdot z24.s, z16.b, z0.b[0]\n"
-                    "ld1rqb z4.b, p6/z, [a_ptr4]\n"
                     "sdot z25.s, z16.b, z1.b[0]\n"
-                    "ld1rqb z5.b, p6/z, [a_ptr5]\n"
                     "sdot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p6/z, [a_ptr6]\n"
                     "sdot z27.s, z16.b, z3.b[0]\n"
-                    "ld1rqb z7.b, p6/z, [a_ptr7]\n"
                     "sdot z28.s, z16.b, z4.b[0]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "sdot z24.s, z17.b, z0.b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "sdot z29.s, z17.b, z5.b[1]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #4\n"
                     "sdot z30.s, z17.b, z6.b[1]\n"
                     "sdot z31.s, z17.b, z7.b[1]\n"
                     "sdot z24.s, z18.b, z0.b[2]\n"
@@ -894,50 +953,47 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z19.b, z5.b[3]\n"
                     "sdot z30.s, z19.b, z6.b[3]\n"
                     "sdot z31.s, z19.b, z7.b[3]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "b.eq 3f\n"
                     "4:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "mov z24.s, #0\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
-                    "mov z25.s, #0\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
+                    "mov z25.s, #0\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "sdot z24.s, z16.b, z0.b[0]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
                     "mov z26.s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
                     "sdot z25.s, z16.b, z1.b[0]\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
                     "mov z27.s, #0\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
                     "sdot z26.s, z16.b, z2.b[0]\n"
                     "st1w z28.s, p7, [c_ptr4]\n"
                     "mov z28.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "sdot z27.s, z16.b, z3.b[0]\n"
                     "st1w z29.s, p7, [c_ptr5]\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
                     "sdot z28.s, z16.b, z4.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
-                    "sdot z30.s, z16.b, z6.b[0]\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #4\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "sdot z24.s, z17.b, z0.b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
@@ -953,7 +1009,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z30.s, z17.b, z6.b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "sdot z31.s, z17.b, z7.b[1]\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "sdot z24.s, z18.b, z0.b[2]\n"
                     "sdot z25.s, z18.b, z1.b[2]\n"
                     "sdot z26.s, z18.b, z2.b[2]\n"
@@ -962,7 +1017,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z18.b, z5.b[2]\n"
                     "sdot z30.s, z18.b, z6.b[2]\n"
                     "sdot z31.s, z18.b, z7.b[2]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "sdot z24.s, z19.b, z0.b[3]\n"
                     "sdot z25.s, z19.b, z1.b[3]\n"
                     "sdot z26.s, z19.b, z2.b[3]\n"
@@ -976,14 +1030,16 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "mov z24.s, #0\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "sdot z24.s, z16.b, z0.b[0]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
                     "mov z26.s, #0\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "addvl c_ptr2, c_ptr2, #1\n"
                     "sdot z25.s, z16.b, z1.b[0]\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
@@ -1006,8 +1062,9 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "mov z31.s, #0\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #4\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
@@ -1031,23 +1088,58 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z19.b, z5.b[3]\n"
                     "sdot z30.s, z19.b, z6.b[3]\n"
                     "sdot z31.s, z19.b, z7.b[3]\n"
+                    "b 5f\n"
                     "2:\n"
+                    "mov z24.s, #0\n"
+                    "mov z25.s, #0\n"
+                    "mov z26.s, #0\n"
+                    "mov z27.s, #0\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z31.s, z16.b, z7.b[0]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "sdot z26.s, z17.b, z2.b[1]\n"
+                    "sdot z27.s, z17.b, z3.b[1]\n"
+                    "sdot z28.s, z17.b, z4.b[1]\n"
+                    "sdot z29.s, z17.b, z5.b[1]\n"
+                    "sdot z30.s, z17.b, z6.b[1]\n"
+                    "sdot z31.s, z17.b, z7.b[1]\n"
+                    "sdot z24.s, z18.b, z0.b[2]\n"
+                    "sdot z25.s, z18.b, z1.b[2]\n"
+                    "sdot z26.s, z18.b, z2.b[2]\n"
+                    "sdot z27.s, z18.b, z3.b[2]\n"
+                    "sdot z28.s, z18.b, z4.b[2]\n"
+                    "sdot z29.s, z18.b, z5.b[2]\n"
+                    "sdot z30.s, z18.b, z6.b[2]\n"
+                    "sdot z31.s, z18.b, z7.b[2]\n"
+                    "sdot z24.s, z19.b, z0.b[3]\n"
+                    "sdot z25.s, z19.b, z1.b[3]\n"
+                    "sdot z26.s, z19.b, z2.b[3]\n"
+                    "sdot z27.s, z19.b, z3.b[3]\n"
+                    "sdot z28.s, z19.b, z4.b[3]\n"
+                    "sdot z29.s, z19.b, z5.b[3]\n"
+                    "sdot z30.s, z19.b, z6.b[3]\n"
+                    "sdot z31.s, z19.b, z7.b[3]\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -1126,46 +1218,48 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "mov z28.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "mov z29.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "mov z30.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "mov z31.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #5\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "mov z25.s, #0\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "mov z26.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
-                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "mov z27.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
-                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "mov z31.s, #0\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
                     "sdot z28.s, z16.b, z4.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "sdot z24.s, z17.b, z0.b[1]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #5\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
@@ -1205,84 +1299,79 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z20.b, z5.b[0]\n"
                     "sdot z30.s, z20.b, z6.b[0]\n"
                     "sdot z31.s, z20.b, z7.b[0]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "b.eq 3f\n"
                     "4:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "addvl c_ptr1, c_ptr1, #1\n"
+                    "mov z25.s, #0\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
+                    "mov z26.s, #0\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                     "addvl %[b_ptr0], %[b_ptr0], #5\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
-                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
-                    "mov z25.s, #0\n"
+                    "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "mov z27.s, #0\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "sdot z24.s, z16.b, z0.b[0]\n"
-                    "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
+                    "st1w z28.s, p7, [c_ptr4]\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "sdot z25.s, z16.b, z1.b[0]\n"
-                    "st1w z27.s, p7, [c_ptr3]\n"
-                    "mov z27.s, #0\n"
+                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "sdot z26.s, z16.b, z2.b[0]\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
-                    "st1w z28.s, p7, [c_ptr4]\n"
-                    "mov z28.s, #0\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "sdot z27.s, z16.b, z3.b[0]\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
-                    "sdot z25.s, z17.b, z1.b[1]\n"
-                    "st1w z29.s, p7, [c_ptr5]\n"
-                    "mov z29.s, #0\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
-                    "sdot z28.s, z16.b, z4.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
-                    "sdot z30.s, z16.b, z6.b[0]\n"
                     "addvl c_ptr5, c_ptr5, #1\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "sdot z29.s, z17.b, z5.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "sdot z30.s, z17.b, z6.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "sdot z31.s, z17.b, z7.b[1]\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "sdot z24.s, z18.b, z0.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "sdot z25.s, z18.b, z1.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "sdot z26.s, z18.b, z2.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "sdot z27.s, z18.b, z3.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "sdot z28.s, z18.b, z4.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "sdot z29.s, z18.b, z5.b[2]\n"
                     "sdot z30.s, z18.b, z6.b[2]\n"
                     "sdot z31.s, z18.b, z7.b[2]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "sdot z24.s, z19.b, z0.b[3]\n"
                     "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
                     "sdot z25.s, z19.b, z1.b[3]\n"
@@ -1300,7 +1389,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z31.s, z19.b, z7.b[3]\n"
                     "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
                     "sdot z24.s, z20.b, z0.b[0]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "sdot z25.s, z20.b, z1.b[0]\n"
                     "sdot z26.s, z20.b, z2.b[0]\n"
                     "sdot z27.s, z20.b, z3.b[0]\n"
@@ -1313,39 +1401,41 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "mov z24.s, #0\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #5\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
                     "mov z26.s, #0\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #5\n"
+                    "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "mov z27.s, #0\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "st1w z28.s, p7, [c_ptr4]\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "sdot z25.s, z16.b, z1.b[0]\n"
-                    "st1w z27.s, p7, [c_ptr3]\n"
-                    "mov z27.s, #0\n"
+                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "sdot z26.s, z16.b, z2.b[0]\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
-                    "st1w z28.s, p7, [c_ptr4]\n"
-                    "mov z28.s, #0\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "sdot z27.s, z16.b, z3.b[0]\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
-                    "sdot z25.s, z17.b, z1.b[1]\n"
-                    "st1w z29.s, p7, [c_ptr5]\n"
-                    "mov z29.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "sdot z28.s, z16.b, z4.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
@@ -1354,6 +1444,8 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "addvl c_ptr6, c_ptr6, #1\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
@@ -1392,23 +1484,82 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z20.b, z5.b[0]\n"
                     "sdot z30.s, z20.b, z6.b[0]\n"
                     "sdot z31.s, z20.b, z7.b[0]\n"
+                    "b 5f\n"
                     "2:\n"
+                    "mov z24.s, #0\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "mov z25.s, #0\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "mov z26.s, #0\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "mov z29.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "mov z30.s, #0\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z31.s, z16.b, z7.b[0]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "sdot z26.s, z17.b, z2.b[1]\n"
+                    "sdot z27.s, z17.b, z3.b[1]\n"
+                    "sdot z28.s, z17.b, z4.b[1]\n"
+                    "sdot z29.s, z17.b, z5.b[1]\n"
+                    "sdot z30.s, z17.b, z6.b[1]\n"
+                    "sdot z31.s, z17.b, z7.b[1]\n"
+                    "sdot z24.s, z18.b, z0.b[2]\n"
+                    "sdot z25.s, z18.b, z1.b[2]\n"
+                    "sdot z26.s, z18.b, z2.b[2]\n"
+                    "sdot z27.s, z18.b, z3.b[2]\n"
+                    "sdot z28.s, z18.b, z4.b[2]\n"
+                    "sdot z29.s, z18.b, z5.b[2]\n"
+                    "sdot z30.s, z18.b, z6.b[2]\n"
+                    "sdot z31.s, z18.b, z7.b[2]\n"
+                    "sdot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                    "sdot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+                    "sdot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+                    "sdot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+                    "sdot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
+                    "sdot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
+                    "sdot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
+                    "sdot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
+                    "sdot z24.s, z20.b, z0.b[0]\n"
+                    "sdot z25.s, z20.b, z1.b[0]\n"
+                    "sdot z26.s, z20.b, z2.b[0]\n"
+                    "sdot z27.s, z20.b, z3.b[0]\n"
+                    "sdot z28.s, z20.b, z4.b[0]\n"
+                    "sdot z29.s, z20.b, z5.b[0]\n"
+                    "sdot z30.s, z20.b, z6.b[0]\n"
+                    "sdot z31.s, z20.b, z7.b[0]\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -1487,48 +1638,50 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "mov z28.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "mov z29.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "mov z30.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "mov z31.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #6\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "mov z25.s, #0\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
-                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "mov z26.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
-                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "mov z31.s, #0\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
                     "sdot z28.s, z16.b, z4.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #6\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
@@ -1575,85 +1728,80 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z21.b, z5.b[1]\n"
                     "sdot z30.s, z21.b, z6.b[1]\n"
                     "sdot z31.s, z21.b, z7.b[1]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "b.eq 3f\n"
                     "4:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "mov z24.s, #0\n"
-                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #6\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
                     "mov z26.s, #0\n"
-                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
-                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #6\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
                     "mov z27.s, #0\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "sdot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z27.s, z16.b, z3.b[0]\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
-                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
                     "st1w z29.s, p7, [c_ptr5]\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
-                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
-                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
                     "addvl c_ptr5, c_ptr5, #1\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "sdot z29.s, z17.b, z5.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "sdot z30.s, z17.b, z6.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "sdot z31.s, z17.b, z7.b[1]\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "sdot z24.s, z18.b, z0.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "sdot z25.s, z18.b, z1.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "sdot z26.s, z18.b, z2.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "sdot z27.s, z18.b, z3.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "sdot z28.s, z18.b, z4.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "sdot z29.s, z18.b, z5.b[2]\n"
                     "sdot z30.s, z18.b, z6.b[2]\n"
                     "sdot z31.s, z18.b, z7.b[2]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "sdot z24.s, z19.b, z0.b[3]\n"
                     "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
                     "sdot z25.s, z19.b, z1.b[3]\n"
@@ -1671,7 +1819,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z31.s, z19.b, z7.b[3]\n"
                     "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
                     "sdot z24.s, z20.b, z0.b[0]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "sdot z25.s, z20.b, z1.b[0]\n"
                     "sdot z26.s, z20.b, z2.b[0]\n"
                     "sdot z27.s, z20.b, z3.b[0]\n"
@@ -1679,7 +1826,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z20.b, z5.b[0]\n"
                     "sdot z30.s, z20.b, z6.b[0]\n"
                     "sdot z31.s, z20.b, z7.b[0]\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "sdot z24.s, z21.b, z0.b[1]\n"
                     "sdot z25.s, z21.b, z1.b[1]\n"
                     "sdot z26.s, z21.b, z2.b[1]\n"
@@ -1693,47 +1839,52 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "mov z24.s, #0\n"
-                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #6\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
-                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
                     "addvl c_ptr2, c_ptr2, #1\n"
-                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "mov z26.s, #0\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #6\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
                     "mov z27.s, #0\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "sdot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z27.s, z16.b, z3.b[0]\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
-                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
                     "st1w z29.s, p7, [c_ptr5]\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
-                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
                     "addvl c_ptr5, c_ptr5, #1\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
                     "addvl c_ptr6, c_ptr6, #1\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
@@ -1780,23 +1931,90 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z21.b, z5.b[1]\n"
                     "sdot z30.s, z21.b, z6.b[1]\n"
                     "sdot z31.s, z21.b, z7.b[1]\n"
+                    "b 5f\n"
                     "2:\n"
+                    "mov z24.s, #0\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "mov z25.s, #0\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "mov z26.s, #0\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "mov z29.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "mov z30.s, #0\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z31.s, z16.b, z7.b[0]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "sdot z26.s, z17.b, z2.b[1]\n"
+                    "sdot z27.s, z17.b, z3.b[1]\n"
+                    "sdot z28.s, z17.b, z4.b[1]\n"
+                    "sdot z29.s, z17.b, z5.b[1]\n"
+                    "sdot z30.s, z17.b, z6.b[1]\n"
+                    "sdot z31.s, z17.b, z7.b[1]\n"
+                    "sdot z24.s, z18.b, z0.b[2]\n"
+                    "sdot z25.s, z18.b, z1.b[2]\n"
+                    "sdot z26.s, z18.b, z2.b[2]\n"
+                    "sdot z27.s, z18.b, z3.b[2]\n"
+                    "sdot z28.s, z18.b, z4.b[2]\n"
+                    "sdot z29.s, z18.b, z5.b[2]\n"
+                    "sdot z30.s, z18.b, z6.b[2]\n"
+                    "sdot z31.s, z18.b, z7.b[2]\n"
+                    "sdot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                    "sdot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+                    "sdot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+                    "sdot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+                    "sdot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
+                    "sdot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
+                    "sdot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
+                    "sdot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
+                    "sdot z24.s, z20.b, z0.b[0]\n"
+                    "sdot z25.s, z20.b, z1.b[0]\n"
+                    "sdot z26.s, z20.b, z2.b[0]\n"
+                    "sdot z27.s, z20.b, z3.b[0]\n"
+                    "sdot z28.s, z20.b, z4.b[0]\n"
+                    "sdot z29.s, z20.b, z5.b[0]\n"
+                    "sdot z30.s, z20.b, z6.b[0]\n"
+                    "sdot z31.s, z20.b, z7.b[0]\n"
+                    "sdot z24.s, z21.b, z0.b[1]\n"
+                    "sdot z25.s, z21.b, z1.b[1]\n"
+                    "sdot z26.s, z21.b, z2.b[1]\n"
+                    "sdot z27.s, z21.b, z3.b[1]\n"
+                    "sdot z28.s, z21.b, z4.b[1]\n"
+                    "sdot z29.s, z21.b, z5.b[1]\n"
+                    "sdot z30.s, z21.b, z6.b[1]\n"
+                    "sdot z31.s, z21.b, z7.b[1]\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -1875,48 +2093,50 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "mov z28.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "mov z29.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "mov z30.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "mov z31.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #7\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+                    "mov z25.s, #0\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
-                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "mov z26.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
-                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "mov z31.s, #0\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
                     "sdot z28.s, z16.b, z4.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #7\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
@@ -1972,86 +2192,81 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z22.b, z5.b[2]\n"
                     "sdot z30.s, z22.b, z6.b[2]\n"
                     "sdot z31.s, z22.b, z7.b[2]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "b.eq 3f\n"
+                    "4:\n"
+                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "addvl c_ptr1, c_ptr1, #1\n"
+                    "mov z25.s, #0\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
+                    "mov z26.s, #0\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                    "b.eq 3f\n"
-                    "4:\n"
-                    "st1w z24.s, p7, [%[c_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "mov z24.s, #0\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "mov z27.s, #0\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                     "addvl %[b_ptr0], %[b_ptr0], #7\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
-                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
-                    "mov z25.s, #0\n"
+                    "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "sdot z24.s, z16.b, z0.b[0]\n"
-                    "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
+                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
                     "sdot z25.s, z16.b, z1.b[0]\n"
-                    "st1w z27.s, p7, [c_ptr3]\n"
-                    "mov z27.s, #0\n"
+                    "st1w z30.s, p7, [c_ptr6]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "sdot z26.s, z16.b, z2.b[0]\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
-                    "st1w z28.s, p7, [c_ptr4]\n"
-                    "mov z28.s, #0\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "sdot z27.s, z16.b, z3.b[0]\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
-                    "sdot z25.s, z17.b, z1.b[1]\n"
-                    "st1w z29.s, p7, [c_ptr5]\n"
-                    "mov z29.s, #0\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
-                    "sdot z28.s, z16.b, z4.b[0]\n"
-                    "st1w z30.s, p7, [c_ptr6]\n"
-                    "mov z30.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "sdot z29.s, z16.b, z5.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "sdot z29.s, z17.b, z5.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "sdot z30.s, z17.b, z6.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "sdot z31.s, z17.b, z7.b[1]\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "sdot z24.s, z18.b, z0.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "sdot z25.s, z18.b, z1.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "sdot z26.s, z18.b, z2.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "sdot z27.s, z18.b, z3.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "sdot z28.s, z18.b, z4.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "sdot z29.s, z18.b, z5.b[2]\n"
                     "sdot z30.s, z18.b, z6.b[2]\n"
                     "sdot z31.s, z18.b, z7.b[2]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "sdot z24.s, z19.b, z0.b[3]\n"
                     "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
                     "sdot z25.s, z19.b, z1.b[3]\n"
@@ -2069,7 +2284,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z31.s, z19.b, z7.b[3]\n"
                     "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
                     "sdot z24.s, z20.b, z0.b[0]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "sdot z25.s, z20.b, z1.b[0]\n"
                     "sdot z26.s, z20.b, z2.b[0]\n"
                     "sdot z27.s, z20.b, z3.b[0]\n"
@@ -2077,7 +2291,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z20.b, z5.b[0]\n"
                     "sdot z30.s, z20.b, z6.b[0]\n"
                     "sdot z31.s, z20.b, z7.b[0]\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "sdot z24.s, z21.b, z0.b[1]\n"
                     "sdot z25.s, z21.b, z1.b[1]\n"
                     "sdot z26.s, z21.b, z2.b[1]\n"
@@ -2086,7 +2299,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z21.b, z5.b[1]\n"
                     "sdot z30.s, z21.b, z6.b[1]\n"
                     "sdot z31.s, z21.b, z7.b[1]\n"
-                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "sdot z24.s, z22.b, z0.b[2]\n"
                     "sdot z25.s, z22.b, z1.b[2]\n"
                     "sdot z26.s, z22.b, z2.b[2]\n"
@@ -2100,47 +2312,135 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "mov z24.s, #0\n"
-                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #7\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
                     "mov z26.s, #0\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "mov z27.s, #0\n"
+                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #7\n"
+                    "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
                     "sdot z25.s, z16.b, z1.b[0]\n"
-                    "st1w z27.s, p7, [c_ptr3]\n"
-                    "mov z27.s, #0\n"
+                    "st1w z30.s, p7, [c_ptr6]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "sdot z26.s, z16.b, z2.b[0]\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
-                    "st1w z28.s, p7, [c_ptr4]\n"
-                    "mov z28.s, #0\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "sdot z27.s, z16.b, z3.b[0]\n"
+                    "st1w z31.s, p7, [c_ptr7]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "sdot z31.s, z16.b, z7.b[0]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
-                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "sdot z26.s, z17.b, z2.b[1]\n"
+                    "sdot z27.s, z17.b, z3.b[1]\n"
+                    "sdot z28.s, z17.b, z4.b[1]\n"
+                    "sdot z29.s, z17.b, z5.b[1]\n"
+                    "sdot z30.s, z17.b, z6.b[1]\n"
+                    "sdot z31.s, z17.b, z7.b[1]\n"
+                    "sdot z24.s, z18.b, z0.b[2]\n"
+                    "sdot z25.s, z18.b, z1.b[2]\n"
+                    "sdot z26.s, z18.b, z2.b[2]\n"
+                    "sdot z27.s, z18.b, z3.b[2]\n"
+                    "sdot z28.s, z18.b, z4.b[2]\n"
+                    "sdot z29.s, z18.b, z5.b[2]\n"
+                    "sdot z30.s, z18.b, z6.b[2]\n"
+                    "sdot z31.s, z18.b, z7.b[2]\n"
+                    "sdot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                    "sdot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+                    "sdot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+                    "sdot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+                    "sdot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
+                    "sdot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
+                    "sdot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
+                    "sdot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
+                    "sdot z24.s, z20.b, z0.b[0]\n"
+                    "sdot z25.s, z20.b, z1.b[0]\n"
+                    "sdot z26.s, z20.b, z2.b[0]\n"
+                    "sdot z27.s, z20.b, z3.b[0]\n"
+                    "sdot z28.s, z20.b, z4.b[0]\n"
+                    "sdot z29.s, z20.b, z5.b[0]\n"
+                    "sdot z30.s, z20.b, z6.b[0]\n"
+                    "sdot z31.s, z20.b, z7.b[0]\n"
+                    "sdot z24.s, z21.b, z0.b[1]\n"
+                    "sdot z25.s, z21.b, z1.b[1]\n"
+                    "sdot z26.s, z21.b, z2.b[1]\n"
+                    "sdot z27.s, z21.b, z3.b[1]\n"
+                    "sdot z28.s, z21.b, z4.b[1]\n"
+                    "sdot z29.s, z21.b, z5.b[1]\n"
+                    "sdot z30.s, z21.b, z6.b[1]\n"
+                    "sdot z31.s, z21.b, z7.b[1]\n"
+                    "sdot z24.s, z22.b, z0.b[2]\n"
+                    "sdot z25.s, z22.b, z1.b[2]\n"
+                    "sdot z26.s, z22.b, z2.b[2]\n"
+                    "sdot z27.s, z22.b, z3.b[2]\n"
+                    "sdot z28.s, z22.b, z4.b[2]\n"
+                    "sdot z29.s, z22.b, z5.b[2]\n"
+                    "sdot z30.s, z22.b, z6.b[2]\n"
+                    "sdot z31.s, z22.b, z7.b[2]\n"
+                    "b 5f\n"
+                    "2:\n"
+                    "mov z24.s, #0\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "mov z25.s, #0\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "mov z26.s, #0\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "sdot z28.s, z16.b, z4.b[0]\n"
-                    "st1w z30.s, p7, [c_ptr6]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
-                    "sdot z29.s, z16.b, z5.b[0]\n"
-                    "st1w z31.s, p7, [c_ptr7]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "mov z31.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
@@ -2195,23 +2495,16 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z22.b, z5.b[2]\n"
                     "sdot z30.s, z22.b, z6.b[2]\n"
                     "sdot z31.s, z22.b, z7.b[2]\n"
-                    "2:\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -2290,49 +2583,51 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "mov z28.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "mov z29.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "mov z30.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "mov z31.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "mov z25.s, #0\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
-                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "mov z26.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
-                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "mov z31.s, #0\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
                     "sdot z28.s, z16.b, z4.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
@@ -2396,87 +2691,82 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z23.b, z5.b[3]\n"
                     "sdot z30.s, z23.b, z6.b[3]\n"
                     "sdot z31.s, z23.b, z7.b[3]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "b.eq 3f\n"
                     "4:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "mov z24.s, #0\n"
-                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
                     "mov z26.s, #0\n"
-                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
-                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
                     "mov z27.s, #0\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "sdot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z27.s, z16.b, z3.b[0]\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
-                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
-                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "sdot z29.s, z17.b, z5.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "sdot z30.s, z17.b, z6.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "sdot z31.s, z17.b, z7.b[1]\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "sdot z24.s, z18.b, z0.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "sdot z25.s, z18.b, z1.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "sdot z26.s, z18.b, z2.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "sdot z27.s, z18.b, z3.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "sdot z28.s, z18.b, z4.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "sdot z29.s, z18.b, z5.b[2]\n"
                     "sdot z30.s, z18.b, z6.b[2]\n"
                     "sdot z31.s, z18.b, z7.b[2]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "sdot z24.s, z19.b, z0.b[3]\n"
                     "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
                     "sdot z25.s, z19.b, z1.b[3]\n"
@@ -2494,7 +2784,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z31.s, z19.b, z7.b[3]\n"
                     "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
                     "sdot z24.s, z20.b, z0.b[0]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "sdot z25.s, z20.b, z1.b[0]\n"
                     "sdot z26.s, z20.b, z2.b[0]\n"
                     "sdot z27.s, z20.b, z3.b[0]\n"
@@ -2502,7 +2791,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z20.b, z5.b[0]\n"
                     "sdot z30.s, z20.b, z6.b[0]\n"
                     "sdot z31.s, z20.b, z7.b[0]\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "sdot z24.s, z21.b, z0.b[1]\n"
                     "sdot z25.s, z21.b, z1.b[1]\n"
                     "sdot z26.s, z21.b, z2.b[1]\n"
@@ -2511,7 +2799,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z21.b, z5.b[1]\n"
                     "sdot z30.s, z21.b, z6.b[1]\n"
                     "sdot z31.s, z21.b, z7.b[1]\n"
-                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "sdot z24.s, z22.b, z0.b[2]\n"
                     "sdot z25.s, z22.b, z1.b[2]\n"
                     "sdot z26.s, z22.b, z2.b[2]\n"
@@ -2520,7 +2807,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z22.b, z5.b[2]\n"
                     "sdot z30.s, z22.b, z6.b[2]\n"
                     "sdot z31.s, z22.b, z7.b[2]\n"
-                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "sdot z24.s, z23.b, z0.b[3]\n"
                     "sdot z25.s, z23.b, z1.b[3]\n"
                     "sdot z26.s, z23.b, z2.b[3]\n"
@@ -2534,47 +2820,144 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "mov z24.s, #0\n"
-                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
                     "mov z26.s, #0\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "mov z27.s, #0\n"
+                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                    "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "st1w z30.s, p7, [c_ptr6]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "sdot z25.s, z16.b, z1.b[0]\n"
-                    "st1w z27.s, p7, [c_ptr3]\n"
-                    "mov z27.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "sdot z26.s, z16.b, z2.b[0]\n"
+                    "st1w z31.s, p7, [c_ptr7]\n"
+                    "mov z31.s, #0\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
-                    "st1w z28.s, p7, [c_ptr4]\n"
-                    "mov z28.s, #0\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "sdot z27.s, z16.b, z3.b[0]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z31.s, z16.b, z7.b[0]\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
-                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "sdot z26.s, z17.b, z2.b[1]\n"
+                    "sdot z27.s, z17.b, z3.b[1]\n"
+                    "sdot z28.s, z17.b, z4.b[1]\n"
+                    "sdot z29.s, z17.b, z5.b[1]\n"
+                    "sdot z30.s, z17.b, z6.b[1]\n"
+                    "sdot z31.s, z17.b, z7.b[1]\n"
+                    "sdot z24.s, z18.b, z0.b[2]\n"
+                    "sdot z25.s, z18.b, z1.b[2]\n"
+                    "sdot z26.s, z18.b, z2.b[2]\n"
+                    "sdot z27.s, z18.b, z3.b[2]\n"
+                    "sdot z28.s, z18.b, z4.b[2]\n"
+                    "sdot z29.s, z18.b, z5.b[2]\n"
+                    "sdot z30.s, z18.b, z6.b[2]\n"
+                    "sdot z31.s, z18.b, z7.b[2]\n"
+                    "sdot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                    "sdot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+                    "sdot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+                    "sdot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+                    "sdot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
+                    "sdot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
+                    "sdot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
+                    "sdot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
+                    "sdot z24.s, z20.b, z0.b[0]\n"
+                    "sdot z25.s, z20.b, z1.b[0]\n"
+                    "sdot z26.s, z20.b, z2.b[0]\n"
+                    "sdot z27.s, z20.b, z3.b[0]\n"
+                    "sdot z28.s, z20.b, z4.b[0]\n"
+                    "sdot z29.s, z20.b, z5.b[0]\n"
+                    "sdot z30.s, z20.b, z6.b[0]\n"
+                    "sdot z31.s, z20.b, z7.b[0]\n"
+                    "sdot z24.s, z21.b, z0.b[1]\n"
+                    "sdot z25.s, z21.b, z1.b[1]\n"
+                    "sdot z26.s, z21.b, z2.b[1]\n"
+                    "sdot z27.s, z21.b, z3.b[1]\n"
+                    "sdot z28.s, z21.b, z4.b[1]\n"
+                    "sdot z29.s, z21.b, z5.b[1]\n"
+                    "sdot z30.s, z21.b, z6.b[1]\n"
+                    "sdot z31.s, z21.b, z7.b[1]\n"
+                    "sdot z24.s, z22.b, z0.b[2]\n"
+                    "sdot z25.s, z22.b, z1.b[2]\n"
+                    "sdot z26.s, z22.b, z2.b[2]\n"
+                    "sdot z27.s, z22.b, z3.b[2]\n"
+                    "sdot z28.s, z22.b, z4.b[2]\n"
+                    "sdot z29.s, z22.b, z5.b[2]\n"
+                    "sdot z30.s, z22.b, z6.b[2]\n"
+                    "sdot z31.s, z22.b, z7.b[2]\n"
+                    "sdot z24.s, z23.b, z0.b[3]\n"
+                    "sdot z25.s, z23.b, z1.b[3]\n"
+                    "sdot z26.s, z23.b, z2.b[3]\n"
+                    "sdot z27.s, z23.b, z3.b[3]\n"
+                    "sdot z28.s, z23.b, z4.b[3]\n"
+                    "sdot z29.s, z23.b, z5.b[3]\n"
+                    "sdot z30.s, z23.b, z6.b[3]\n"
+                    "sdot z31.s, z23.b, z7.b[3]\n"
+                    "b 5f\n"
+                    "2:\n"
+                    "mov z24.s, #0\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "mov z25.s, #0\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "mov z26.s, #0\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "sdot z28.s, z16.b, z4.b[0]\n"
-                    "st1w z30.s, p7, [c_ptr6]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
-                    "sdot z29.s, z16.b, z5.b[0]\n"
-                    "st1w z31.s, p7, [c_ptr7]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "mov z31.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
@@ -2637,23 +3020,16 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z23.b, z5.b[3]\n"
                     "sdot z30.s, z23.b, z6.b[3]\n"
                     "sdot z31.s, z23.b, z7.b[3]\n"
-                    "2:\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -2732,54 +3108,56 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "mov z28.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "mov z29.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "mov z30.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "mov z31.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "mov z25.s, #0\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
-                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "mov z26.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
-                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "mov z31.s, #0\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
                     "sdot z28.s, z16.b, z4.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #1\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #1\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
                     "sdot z29.s, z17.b, z5.b[1]\n"
                     "sdot z30.s, z17.b, z6.b[1]\n"
@@ -2856,88 +3234,84 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z16.b, z5.b[0]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "b.eq 3f\n"
+                    "4:\n"
+                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "addvl c_ptr1, c_ptr1, #1\n"
+                    "mov z25.s, #0\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
+                    "mov z26.s, #0\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "mov z27.s, #0\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "b.eq 3f\n"
-                    "4:\n"
-                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "mov z24.s, #0\n"
+                    "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
-                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
-                    "mov z25.s, #0\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                     "sdot z24.s, z16.b, z0.b[0]\n"
-                    "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
+                    "st1w z30.s, p7, [c_ptr6]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "sdot z25.s, z16.b, z1.b[0]\n"
-                    "st1w z27.s, p7, [c_ptr3]\n"
-                    "mov z27.s, #0\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "sdot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
-                    "st1w z28.s, p7, [c_ptr4]\n"
-                    "mov z28.s, #0\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "st1w z31.s, p7, [c_ptr7]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
                     "sdot z27.s, z16.b, z3.b[0]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
-                    "sdot z25.s, z17.b, z1.b[1]\n"
-                    "st1w z29.s, p7, [c_ptr5]\n"
-                    "mov z29.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "sdot z28.s, z16.b, z4.b[0]\n"
-                    "st1w z30.s, p7, [c_ptr6]\n"
-                    "mov z30.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
-                    "st1w z31.s, p7, [c_ptr7]\n"
-                    "mov z31.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #1\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #1\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "sdot z29.s, z17.b, z5.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "sdot z30.s, z17.b, z6.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "sdot z31.s, z17.b, z7.b[1]\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "sdot z24.s, z18.b, z0.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "sdot z25.s, z18.b, z1.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "sdot z26.s, z18.b, z2.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "sdot z27.s, z18.b, z3.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "sdot z28.s, z18.b, z4.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "sdot z29.s, z18.b, z5.b[2]\n"
                     "sdot z30.s, z18.b, z6.b[2]\n"
                     "sdot z31.s, z18.b, z7.b[2]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "sdot z24.s, z19.b, z0.b[3]\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
                     "sdot z25.s, z19.b, z1.b[3]\n"
@@ -2955,7 +3329,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z31.s, z19.b, z7.b[3]\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
                     "sdot z24.s, z20.b, z0.b[0]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "sdot z25.s, z20.b, z1.b[0]\n"
                     "sdot z26.s, z20.b, z2.b[0]\n"
                     "sdot z27.s, z20.b, z3.b[0]\n"
@@ -2963,7 +3336,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z20.b, z5.b[0]\n"
                     "sdot z30.s, z20.b, z6.b[0]\n"
                     "sdot z31.s, z20.b, z7.b[0]\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "sdot z24.s, z21.b, z0.b[1]\n"
                     "sdot z25.s, z21.b, z1.b[1]\n"
                     "sdot z26.s, z21.b, z2.b[1]\n"
@@ -2972,7 +3344,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z21.b, z5.b[1]\n"
                     "sdot z30.s, z21.b, z6.b[1]\n"
                     "sdot z31.s, z21.b, z7.b[1]\n"
-                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "sdot z24.s, z22.b, z0.b[2]\n"
                     "sdot z25.s, z22.b, z1.b[2]\n"
                     "sdot z26.s, z22.b, z2.b[2]\n"
@@ -2981,7 +3352,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z22.b, z5.b[2]\n"
                     "sdot z30.s, z22.b, z6.b[2]\n"
                     "sdot z31.s, z22.b, z7.b[2]\n"
-                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "sdot z24.s, z23.b, z0.b[3]\n"
                     "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
                     "sdot z25.s, z23.b, z1.b[3]\n"
@@ -2999,7 +3369,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z31.s, z23.b, z7.b[3]\n"
                     "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
                     "sdot z24.s, z16.b, z0.b[0]\n"
-                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
                     "sdot z25.s, z16.b, z1.b[0]\n"
                     "sdot z26.s, z16.b, z2.b[0]\n"
                     "sdot z27.s, z16.b, z3.b[0]\n"
@@ -3007,55 +3376,62 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z16.b, z5.b[0]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "b.ne 4b\n"
                     "3:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "mov z24.s, #0\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
-                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "addvl c_ptr2, c_ptr2, #1\n"
-                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "mov z26.s, #0\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
                     "mov z27.s, #0\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "sdot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
-                    "sdot z27.s, z16.b, z3.b[0]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
-                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
-                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
                     "addvl c_ptr6, c_ptr6, #1\n"
-                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "sdot z26.s, z17.b, z2.b[1]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
                     "addvl %[b_ptr0], %[b_ptr0], #1\n"
+                    "sdot z26.s, z17.b, z2.b[1]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
                     "sdot z29.s, z17.b, z5.b[1]\n"
@@ -3133,23 +3509,124 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z16.b, z5.b[0]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
+                    "b 5f\n"
                     "2:\n"
+                    "mov z24.s, #0\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "mov z25.s, #0\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "mov z26.s, #0\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "mov z29.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "mov z30.s, #0\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #1\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "sdot z26.s, z17.b, z2.b[1]\n"
+                    "sdot z27.s, z17.b, z3.b[1]\n"
+                    "sdot z28.s, z17.b, z4.b[1]\n"
+                    "sdot z29.s, z17.b, z5.b[1]\n"
+                    "sdot z30.s, z17.b, z6.b[1]\n"
+                    "sdot z31.s, z17.b, z7.b[1]\n"
+                    "sdot z24.s, z18.b, z0.b[2]\n"
+                    "sdot z25.s, z18.b, z1.b[2]\n"
+                    "sdot z26.s, z18.b, z2.b[2]\n"
+                    "sdot z27.s, z18.b, z3.b[2]\n"
+                    "sdot z28.s, z18.b, z4.b[2]\n"
+                    "sdot z29.s, z18.b, z5.b[2]\n"
+                    "sdot z30.s, z18.b, z6.b[2]\n"
+                    "sdot z31.s, z18.b, z7.b[2]\n"
+                    "sdot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+                    "sdot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+                    "sdot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+                    "sdot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+                    "sdot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+                    "sdot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+                    "sdot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+                    "sdot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+                    "sdot z24.s, z20.b, z0.b[0]\n"
+                    "sdot z25.s, z20.b, z1.b[0]\n"
+                    "sdot z26.s, z20.b, z2.b[0]\n"
+                    "sdot z27.s, z20.b, z3.b[0]\n"
+                    "sdot z28.s, z20.b, z4.b[0]\n"
+                    "sdot z29.s, z20.b, z5.b[0]\n"
+                    "sdot z30.s, z20.b, z6.b[0]\n"
+                    "sdot z31.s, z20.b, z7.b[0]\n"
+                    "sdot z24.s, z21.b, z0.b[1]\n"
+                    "sdot z25.s, z21.b, z1.b[1]\n"
+                    "sdot z26.s, z21.b, z2.b[1]\n"
+                    "sdot z27.s, z21.b, z3.b[1]\n"
+                    "sdot z28.s, z21.b, z4.b[1]\n"
+                    "sdot z29.s, z21.b, z5.b[1]\n"
+                    "sdot z30.s, z21.b, z6.b[1]\n"
+                    "sdot z31.s, z21.b, z7.b[1]\n"
+                    "sdot z24.s, z22.b, z0.b[2]\n"
+                    "sdot z25.s, z22.b, z1.b[2]\n"
+                    "sdot z26.s, z22.b, z2.b[2]\n"
+                    "sdot z27.s, z22.b, z3.b[2]\n"
+                    "sdot z28.s, z22.b, z4.b[2]\n"
+                    "sdot z29.s, z22.b, z5.b[2]\n"
+                    "sdot z30.s, z22.b, z6.b[2]\n"
+                    "sdot z31.s, z22.b, z7.b[2]\n"
+                    "sdot z24.s, z23.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
+                    "sdot z25.s, z23.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
+                    "sdot z26.s, z23.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
+                    "sdot z27.s, z23.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
+                    "sdot z28.s, z23.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
+                    "sdot z29.s, z23.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
+                    "sdot z30.s, z23.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
+                    "sdot z31.s, z23.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z31.s, z16.b, z7.b[0]\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -3228,52 +3705,54 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "mov z28.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "mov z29.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "mov z30.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "mov z31.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "mov z25.s, #0\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
-                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "mov z26.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
-                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "mov z31.s, #0\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
                     "sdot z28.s, z16.b, z4.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
                     "sdot z29.s, z17.b, z5.b[1]\n"
@@ -3361,88 +3840,85 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z17.b, z5.b[1]\n"
                     "sdot z30.s, z17.b, z6.b[1]\n"
                     "sdot z31.s, z17.b, z7.b[1]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "b.eq 3f\n"
+                    "4:\n"
+                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "addvl c_ptr1, c_ptr1, #1\n"
+                    "mov z25.s, #0\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
+                    "mov z26.s, #0\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "mov z27.s, #0\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "b.eq 3f\n"
-                    "4:\n"
-                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "mov z24.s, #0\n"
+                    "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
-                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
-                    "mov z25.s, #0\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                     "sdot z24.s, z16.b, z0.b[0]\n"
-                    "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
+                    "st1w z30.s, p7, [c_ptr6]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "sdot z25.s, z16.b, z1.b[0]\n"
-                    "st1w z27.s, p7, [c_ptr3]\n"
-                    "mov z27.s, #0\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "sdot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
-                    "st1w z28.s, p7, [c_ptr4]\n"
-                    "mov z28.s, #0\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "st1w z31.s, p7, [c_ptr7]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
                     "sdot z27.s, z16.b, z3.b[0]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
-                    "sdot z25.s, z17.b, z1.b[1]\n"
-                    "st1w z29.s, p7, [c_ptr5]\n"
-                    "mov z29.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "sdot z28.s, z16.b, z4.b[0]\n"
-                    "st1w z30.s, p7, [c_ptr6]\n"
-                    "mov z30.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
-                    "st1w z31.s, p7, [c_ptr7]\n"
-                    "mov z31.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "sdot z29.s, z17.b, z5.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "sdot z30.s, z17.b, z6.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "sdot z31.s, z17.b, z7.b[1]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "sdot z24.s, z18.b, z0.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
-                    "sdot z25.s, z18.b, z1.b[2]\n"
                     "addvl %[b_ptr0], %[b_ptr0], #2\n"
+                    "sdot z25.s, z18.b, z1.b[2]\n"
                     "sdot z26.s, z18.b, z2.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "sdot z27.s, z18.b, z3.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "sdot z28.s, z18.b, z4.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "sdot z29.s, z18.b, z5.b[2]\n"
                     "sdot z30.s, z18.b, z6.b[2]\n"
                     "sdot z31.s, z18.b, z7.b[2]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "sdot z24.s, z19.b, z0.b[3]\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
                     "sdot z25.s, z19.b, z1.b[3]\n"
@@ -3460,7 +3936,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z31.s, z19.b, z7.b[3]\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
                     "sdot z24.s, z20.b, z0.b[0]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "sdot z25.s, z20.b, z1.b[0]\n"
                     "sdot z26.s, z20.b, z2.b[0]\n"
                     "sdot z27.s, z20.b, z3.b[0]\n"
@@ -3468,7 +3943,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z20.b, z5.b[0]\n"
                     "sdot z30.s, z20.b, z6.b[0]\n"
                     "sdot z31.s, z20.b, z7.b[0]\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "sdot z24.s, z21.b, z0.b[1]\n"
                     "sdot z25.s, z21.b, z1.b[1]\n"
                     "sdot z26.s, z21.b, z2.b[1]\n"
@@ -3477,7 +3951,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z21.b, z5.b[1]\n"
                     "sdot z30.s, z21.b, z6.b[1]\n"
                     "sdot z31.s, z21.b, z7.b[1]\n"
-                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "sdot z24.s, z22.b, z0.b[2]\n"
                     "sdot z25.s, z22.b, z1.b[2]\n"
                     "sdot z26.s, z22.b, z2.b[2]\n"
@@ -3486,7 +3959,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z22.b, z5.b[2]\n"
                     "sdot z30.s, z22.b, z6.b[2]\n"
                     "sdot z31.s, z22.b, z7.b[2]\n"
-                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "sdot z24.s, z23.b, z0.b[3]\n"
                     "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
                     "sdot z25.s, z23.b, z1.b[3]\n"
@@ -3504,7 +3976,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z31.s, z23.b, z7.b[3]\n"
                     "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
                     "sdot z24.s, z16.b, z0.b[0]\n"
-                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
                     "sdot z25.s, z16.b, z1.b[0]\n"
                     "sdot z26.s, z16.b, z2.b[0]\n"
                     "sdot z27.s, z16.b, z3.b[0]\n"
@@ -3512,7 +3983,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z16.b, z5.b[0]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "sdot z24.s, z17.b, z0.b[1]\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
@@ -3521,53 +3991,60 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z17.b, z5.b[1]\n"
                     "sdot z30.s, z17.b, z6.b[1]\n"
                     "sdot z31.s, z17.b, z7.b[1]\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "b.ne 4b\n"
                     "3:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "mov z24.s, #0\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
-                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "addvl c_ptr2, c_ptr2, #1\n"
-                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "mov z26.s, #0\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
                     "mov z27.s, #0\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "sdot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
-                    "sdot z27.s, z16.b, z3.b[0]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
-                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
-                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
                     "addvl c_ptr6, c_ptr6, #1\n"
-                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
@@ -3656,23 +4133,133 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z17.b, z5.b[1]\n"
                     "sdot z30.s, z17.b, z6.b[1]\n"
                     "sdot z31.s, z17.b, z7.b[1]\n"
+                    "b 5f\n"
                     "2:\n"
+                    "mov z24.s, #0\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "mov z25.s, #0\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "mov z26.s, #0\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "mov z29.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "mov z30.s, #0\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "sdot z26.s, z17.b, z2.b[1]\n"
+                    "sdot z27.s, z17.b, z3.b[1]\n"
+                    "sdot z28.s, z17.b, z4.b[1]\n"
+                    "sdot z29.s, z17.b, z5.b[1]\n"
+                    "sdot z30.s, z17.b, z6.b[1]\n"
+                    "sdot z31.s, z17.b, z7.b[1]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "sdot z24.s, z18.b, z0.b[2]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #2\n"
+                    "sdot z25.s, z18.b, z1.b[2]\n"
+                    "sdot z26.s, z18.b, z2.b[2]\n"
+                    "sdot z27.s, z18.b, z3.b[2]\n"
+                    "sdot z28.s, z18.b, z4.b[2]\n"
+                    "sdot z29.s, z18.b, z5.b[2]\n"
+                    "sdot z30.s, z18.b, z6.b[2]\n"
+                    "sdot z31.s, z18.b, z7.b[2]\n"
+                    "sdot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+                    "sdot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+                    "sdot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+                    "sdot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+                    "sdot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+                    "sdot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+                    "sdot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+                    "sdot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+                    "sdot z24.s, z20.b, z0.b[0]\n"
+                    "sdot z25.s, z20.b, z1.b[0]\n"
+                    "sdot z26.s, z20.b, z2.b[0]\n"
+                    "sdot z27.s, z20.b, z3.b[0]\n"
+                    "sdot z28.s, z20.b, z4.b[0]\n"
+                    "sdot z29.s, z20.b, z5.b[0]\n"
+                    "sdot z30.s, z20.b, z6.b[0]\n"
+                    "sdot z31.s, z20.b, z7.b[0]\n"
+                    "sdot z24.s, z21.b, z0.b[1]\n"
+                    "sdot z25.s, z21.b, z1.b[1]\n"
+                    "sdot z26.s, z21.b, z2.b[1]\n"
+                    "sdot z27.s, z21.b, z3.b[1]\n"
+                    "sdot z28.s, z21.b, z4.b[1]\n"
+                    "sdot z29.s, z21.b, z5.b[1]\n"
+                    "sdot z30.s, z21.b, z6.b[1]\n"
+                    "sdot z31.s, z21.b, z7.b[1]\n"
+                    "sdot z24.s, z22.b, z0.b[2]\n"
+                    "sdot z25.s, z22.b, z1.b[2]\n"
+                    "sdot z26.s, z22.b, z2.b[2]\n"
+                    "sdot z27.s, z22.b, z3.b[2]\n"
+                    "sdot z28.s, z22.b, z4.b[2]\n"
+                    "sdot z29.s, z22.b, z5.b[2]\n"
+                    "sdot z30.s, z22.b, z6.b[2]\n"
+                    "sdot z31.s, z22.b, z7.b[2]\n"
+                    "sdot z24.s, z23.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
+                    "sdot z25.s, z23.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
+                    "sdot z26.s, z23.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
+                    "sdot z27.s, z23.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
+                    "sdot z28.s, z23.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
+                    "sdot z29.s, z23.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
+                    "sdot z30.s, z23.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
+                    "sdot z31.s, z23.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z31.s, z16.b, z7.b[0]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "sdot z26.s, z17.b, z2.b[1]\n"
+                    "sdot z27.s, z17.b, z3.b[1]\n"
+                    "sdot z28.s, z17.b, z4.b[1]\n"
+                    "sdot z29.s, z17.b, z5.b[1]\n"
+                    "sdot z30.s, z17.b, z6.b[1]\n"
+                    "sdot z31.s, z17.b, z7.b[1]\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -3751,52 +4338,54 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "mov z28.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "mov z29.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "mov z30.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "mov z31.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "mov z25.s, #0\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
-                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "mov z26.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
-                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "mov z31.s, #0\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
                     "sdot z28.s, z16.b, z4.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
                     "sdot z29.s, z17.b, z5.b[1]\n"
@@ -3893,82 +4482,80 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z18.b, z5.b[2]\n"
                     "sdot z30.s, z18.b, z6.b[2]\n"
                     "sdot z31.s, z18.b, z7.b[2]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "b.eq 3f\n"
+                    "4:\n"
+                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "addvl c_ptr1, c_ptr1, #1\n"
+                    "mov z25.s, #0\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
+                    "mov z26.s, #0\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "mov z27.s, #0\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "b.eq 3f\n"
-                    "4:\n"
-                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "mov z24.s, #0\n"
+                    "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
-                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
-                    "mov z25.s, #0\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                     "sdot z24.s, z16.b, z0.b[0]\n"
-                    "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
+                    "st1w z30.s, p7, [c_ptr6]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "sdot z25.s, z16.b, z1.b[0]\n"
-                    "st1w z27.s, p7, [c_ptr3]\n"
-                    "mov z27.s, #0\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "sdot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
-                    "st1w z28.s, p7, [c_ptr4]\n"
-                    "mov z28.s, #0\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "st1w z31.s, p7, [c_ptr7]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
                     "sdot z27.s, z16.b, z3.b[0]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
-                    "sdot z25.s, z17.b, z1.b[1]\n"
-                    "st1w z29.s, p7, [c_ptr5]\n"
-                    "mov z29.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "sdot z28.s, z16.b, z4.b[0]\n"
-                    "st1w z30.s, p7, [c_ptr6]\n"
-                    "mov z30.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
-                    "st1w z31.s, p7, [c_ptr7]\n"
-                    "mov z31.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "sdot z29.s, z17.b, z5.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "sdot z30.s, z17.b, z6.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "sdot z31.s, z17.b, z7.b[1]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "sdot z24.s, z18.b, z0.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "sdot z25.s, z18.b, z1.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "sdot z26.s, z18.b, z2.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "sdot z27.s, z18.b, z3.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "sdot z28.s, z18.b, z4.b[2]\n"
                     "sdot z29.s, z18.b, z5.b[2]\n"
                     "sdot z30.s, z18.b, z6.b[2]\n"
@@ -3995,12 +4582,10 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z25.s, z20.b, z1.b[0]\n"
                     "sdot z26.s, z20.b, z2.b[0]\n"
                     "sdot z27.s, z20.b, z3.b[0]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "sdot z28.s, z20.b, z4.b[0]\n"
                     "sdot z29.s, z20.b, z5.b[0]\n"
                     "sdot z30.s, z20.b, z6.b[0]\n"
                     "sdot z31.s, z20.b, z7.b[0]\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "sdot z24.s, z21.b, z0.b[1]\n"
                     "sdot z25.s, z21.b, z1.b[1]\n"
                     "sdot z26.s, z21.b, z2.b[1]\n"
@@ -4009,7 +4594,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z21.b, z5.b[1]\n"
                     "sdot z30.s, z21.b, z6.b[1]\n"
                     "sdot z31.s, z21.b, z7.b[1]\n"
-                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "sdot z24.s, z22.b, z0.b[2]\n"
                     "sdot z25.s, z22.b, z1.b[2]\n"
                     "sdot z26.s, z22.b, z2.b[2]\n"
@@ -4018,7 +4602,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z22.b, z5.b[2]\n"
                     "sdot z30.s, z22.b, z6.b[2]\n"
                     "sdot z31.s, z22.b, z7.b[2]\n"
-                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "sdot z24.s, z23.b, z0.b[3]\n"
                     "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
                     "sdot z25.s, z23.b, z1.b[3]\n"
@@ -4036,7 +4619,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z31.s, z23.b, z7.b[3]\n"
                     "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
                     "sdot z24.s, z16.b, z0.b[0]\n"
-                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
                     "sdot z25.s, z16.b, z1.b[0]\n"
                     "sdot z26.s, z16.b, z2.b[0]\n"
                     "sdot z27.s, z16.b, z3.b[0]\n"
@@ -4044,7 +4626,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z16.b, z5.b[0]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "sdot z24.s, z17.b, z0.b[1]\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
@@ -4053,7 +4634,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z17.b, z5.b[1]\n"
                     "sdot z30.s, z17.b, z6.b[1]\n"
                     "sdot z31.s, z17.b, z7.b[1]\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "sdot z24.s, z18.b, z0.b[2]\n"
                     "sdot z25.s, z18.b, z1.b[2]\n"
                     "sdot z26.s, z18.b, z2.b[2]\n"
@@ -4062,53 +4642,60 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z18.b, z5.b[2]\n"
                     "sdot z30.s, z18.b, z6.b[2]\n"
                     "sdot z31.s, z18.b, z7.b[2]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "b.ne 4b\n"
                     "3:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "mov z24.s, #0\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
-                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "addvl c_ptr2, c_ptr2, #1\n"
-                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "mov z26.s, #0\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
                     "mov z27.s, #0\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "sdot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
-                    "sdot z27.s, z16.b, z3.b[0]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
-                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
-                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
                     "addvl c_ptr6, c_ptr6, #1\n"
-                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
@@ -4206,23 +4793,142 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z18.b, z5.b[2]\n"
                     "sdot z30.s, z18.b, z6.b[2]\n"
                     "sdot z31.s, z18.b, z7.b[2]\n"
+                    "b 5f\n"
                     "2:\n"
+                    "mov z24.s, #0\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "mov z25.s, #0\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "mov z26.s, #0\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "mov z29.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "mov z30.s, #0\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "sdot z26.s, z17.b, z2.b[1]\n"
+                    "sdot z27.s, z17.b, z3.b[1]\n"
+                    "sdot z28.s, z17.b, z4.b[1]\n"
+                    "sdot z29.s, z17.b, z5.b[1]\n"
+                    "sdot z30.s, z17.b, z6.b[1]\n"
+                    "sdot z31.s, z17.b, z7.b[1]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "sdot z24.s, z18.b, z0.b[2]\n"
+                    "sdot z25.s, z18.b, z1.b[2]\n"
+                    "sdot z26.s, z18.b, z2.b[2]\n"
+                    "sdot z27.s, z18.b, z3.b[2]\n"
+                    "sdot z28.s, z18.b, z4.b[2]\n"
+                    "sdot z29.s, z18.b, z5.b[2]\n"
+                    "sdot z30.s, z18.b, z6.b[2]\n"
+                    "sdot z31.s, z18.b, z7.b[2]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "sdot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+                    "sdot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+                    "sdot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+                    "sdot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+                    "sdot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+                    "sdot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+                    "sdot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+                    "sdot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+                    "sdot z24.s, z20.b, z0.b[0]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #3\n"
+                    "sdot z25.s, z20.b, z1.b[0]\n"
+                    "sdot z26.s, z20.b, z2.b[0]\n"
+                    "sdot z27.s, z20.b, z3.b[0]\n"
+                    "sdot z28.s, z20.b, z4.b[0]\n"
+                    "sdot z29.s, z20.b, z5.b[0]\n"
+                    "sdot z30.s, z20.b, z6.b[0]\n"
+                    "sdot z31.s, z20.b, z7.b[0]\n"
+                    "sdot z24.s, z21.b, z0.b[1]\n"
+                    "sdot z25.s, z21.b, z1.b[1]\n"
+                    "sdot z26.s, z21.b, z2.b[1]\n"
+                    "sdot z27.s, z21.b, z3.b[1]\n"
+                    "sdot z28.s, z21.b, z4.b[1]\n"
+                    "sdot z29.s, z21.b, z5.b[1]\n"
+                    "sdot z30.s, z21.b, z6.b[1]\n"
+                    "sdot z31.s, z21.b, z7.b[1]\n"
+                    "sdot z24.s, z22.b, z0.b[2]\n"
+                    "sdot z25.s, z22.b, z1.b[2]\n"
+                    "sdot z26.s, z22.b, z2.b[2]\n"
+                    "sdot z27.s, z22.b, z3.b[2]\n"
+                    "sdot z28.s, z22.b, z4.b[2]\n"
+                    "sdot z29.s, z22.b, z5.b[2]\n"
+                    "sdot z30.s, z22.b, z6.b[2]\n"
+                    "sdot z31.s, z22.b, z7.b[2]\n"
+                    "sdot z24.s, z23.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
+                    "sdot z25.s, z23.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
+                    "sdot z26.s, z23.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
+                    "sdot z27.s, z23.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
+                    "sdot z28.s, z23.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
+                    "sdot z29.s, z23.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
+                    "sdot z30.s, z23.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
+                    "sdot z31.s, z23.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z31.s, z16.b, z7.b[0]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "sdot z26.s, z17.b, z2.b[1]\n"
+                    "sdot z27.s, z17.b, z3.b[1]\n"
+                    "sdot z28.s, z17.b, z4.b[1]\n"
+                    "sdot z29.s, z17.b, z5.b[1]\n"
+                    "sdot z30.s, z17.b, z6.b[1]\n"
+                    "sdot z31.s, z17.b, z7.b[1]\n"
+                    "sdot z24.s, z18.b, z0.b[2]\n"
+                    "sdot z25.s, z18.b, z1.b[2]\n"
+                    "sdot z26.s, z18.b, z2.b[2]\n"
+                    "sdot z27.s, z18.b, z3.b[2]\n"
+                    "sdot z28.s, z18.b, z4.b[2]\n"
+                    "sdot z29.s, z18.b, z5.b[2]\n"
+                    "sdot z30.s, z18.b, z6.b[2]\n"
+                    "sdot z31.s, z18.b, z7.b[2]\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -4301,52 +5007,54 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "mov z28.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "mov z29.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "mov z30.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "mov z31.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "mov z25.s, #0\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
-                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "mov z26.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
-                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "mov z31.s, #0\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
                     "sdot z28.s, z16.b, z4.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
                     "sdot z29.s, z17.b, z5.b[1]\n"
@@ -4452,82 +5160,80 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z19.b, z5.b[3]\n"
                     "sdot z30.s, z19.b, z6.b[3]\n"
                     "sdot z31.s, z19.b, z7.b[3]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "b.eq 3f\n"
+                    "4:\n"
+                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "addvl c_ptr1, c_ptr1, #1\n"
+                    "mov z25.s, #0\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
+                    "mov z26.s, #0\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "mov z27.s, #0\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "b.eq 3f\n"
-                    "4:\n"
-                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "mov z24.s, #0\n"
+                    "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
-                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
-                    "mov z25.s, #0\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                     "sdot z24.s, z16.b, z0.b[0]\n"
-                    "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
+                    "st1w z30.s, p7, [c_ptr6]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "sdot z25.s, z16.b, z1.b[0]\n"
-                    "st1w z27.s, p7, [c_ptr3]\n"
-                    "mov z27.s, #0\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "sdot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
-                    "st1w z28.s, p7, [c_ptr4]\n"
-                    "mov z28.s, #0\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "st1w z31.s, p7, [c_ptr7]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
                     "sdot z27.s, z16.b, z3.b[0]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
-                    "sdot z25.s, z17.b, z1.b[1]\n"
-                    "st1w z29.s, p7, [c_ptr5]\n"
-                    "mov z29.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "sdot z28.s, z16.b, z4.b[0]\n"
-                    "st1w z30.s, p7, [c_ptr6]\n"
-                    "mov z30.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
-                    "st1w z31.s, p7, [c_ptr7]\n"
-                    "mov z31.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "sdot z29.s, z17.b, z5.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "sdot z30.s, z17.b, z6.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "sdot z31.s, z17.b, z7.b[1]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "sdot z24.s, z18.b, z0.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "sdot z25.s, z18.b, z1.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "sdot z26.s, z18.b, z2.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "sdot z27.s, z18.b, z3.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "sdot z28.s, z18.b, z4.b[2]\n"
                     "sdot z29.s, z18.b, z5.b[2]\n"
                     "sdot z30.s, z18.b, z6.b[2]\n"
@@ -4559,7 +5265,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z20.b, z5.b[0]\n"
                     "sdot z30.s, z20.b, z6.b[0]\n"
                     "sdot z31.s, z20.b, z7.b[0]\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "sdot z24.s, z21.b, z0.b[1]\n"
                     "sdot z25.s, z21.b, z1.b[1]\n"
                     "sdot z26.s, z21.b, z2.b[1]\n"
@@ -4568,7 +5273,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z21.b, z5.b[1]\n"
                     "sdot z30.s, z21.b, z6.b[1]\n"
                     "sdot z31.s, z21.b, z7.b[1]\n"
-                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "sdot z24.s, z22.b, z0.b[2]\n"
                     "sdot z25.s, z22.b, z1.b[2]\n"
                     "sdot z26.s, z22.b, z2.b[2]\n"
@@ -4577,7 +5281,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z22.b, z5.b[2]\n"
                     "sdot z30.s, z22.b, z6.b[2]\n"
                     "sdot z31.s, z22.b, z7.b[2]\n"
-                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "sdot z24.s, z23.b, z0.b[3]\n"
                     "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
                     "sdot z25.s, z23.b, z1.b[3]\n"
@@ -4595,7 +5298,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z31.s, z23.b, z7.b[3]\n"
                     "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
                     "sdot z24.s, z16.b, z0.b[0]\n"
-                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
                     "sdot z25.s, z16.b, z1.b[0]\n"
                     "sdot z26.s, z16.b, z2.b[0]\n"
                     "sdot z27.s, z16.b, z3.b[0]\n"
@@ -4603,7 +5305,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z16.b, z5.b[0]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "sdot z24.s, z17.b, z0.b[1]\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
@@ -4612,7 +5313,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z17.b, z5.b[1]\n"
                     "sdot z30.s, z17.b, z6.b[1]\n"
                     "sdot z31.s, z17.b, z7.b[1]\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "sdot z24.s, z18.b, z0.b[2]\n"
                     "sdot z25.s, z18.b, z1.b[2]\n"
                     "sdot z26.s, z18.b, z2.b[2]\n"
@@ -4621,7 +5321,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z18.b, z5.b[2]\n"
                     "sdot z30.s, z18.b, z6.b[2]\n"
                     "sdot z31.s, z18.b, z7.b[2]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "sdot z24.s, z19.b, z0.b[3]\n"
                     "sdot z25.s, z19.b, z1.b[3]\n"
                     "sdot z26.s, z19.b, z2.b[3]\n"
@@ -4630,53 +5329,60 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z19.b, z5.b[3]\n"
                     "sdot z30.s, z19.b, z6.b[3]\n"
                     "sdot z31.s, z19.b, z7.b[3]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "b.ne 4b\n"
                     "3:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "mov z24.s, #0\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
-                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "addvl c_ptr2, c_ptr2, #1\n"
-                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "mov z26.s, #0\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
                     "mov z27.s, #0\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "sdot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
-                    "sdot z27.s, z16.b, z3.b[0]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
-                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
-                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
                     "addvl c_ptr6, c_ptr6, #1\n"
-                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
@@ -4783,23 +5489,151 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z19.b, z5.b[3]\n"
                     "sdot z30.s, z19.b, z6.b[3]\n"
                     "sdot z31.s, z19.b, z7.b[3]\n"
+                    "b 5f\n"
                     "2:\n"
+                    "mov z24.s, #0\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "mov z25.s, #0\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "mov z26.s, #0\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "mov z29.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "mov z30.s, #0\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "sdot z26.s, z17.b, z2.b[1]\n"
+                    "sdot z27.s, z17.b, z3.b[1]\n"
+                    "sdot z28.s, z17.b, z4.b[1]\n"
+                    "sdot z29.s, z17.b, z5.b[1]\n"
+                    "sdot z30.s, z17.b, z6.b[1]\n"
+                    "sdot z31.s, z17.b, z7.b[1]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "sdot z24.s, z18.b, z0.b[2]\n"
+                    "sdot z25.s, z18.b, z1.b[2]\n"
+                    "sdot z26.s, z18.b, z2.b[2]\n"
+                    "sdot z27.s, z18.b, z3.b[2]\n"
+                    "sdot z28.s, z18.b, z4.b[2]\n"
+                    "sdot z29.s, z18.b, z5.b[2]\n"
+                    "sdot z30.s, z18.b, z6.b[2]\n"
+                    "sdot z31.s, z18.b, z7.b[2]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "sdot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+                    "sdot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+                    "sdot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+                    "sdot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+                    "sdot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+                    "sdot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+                    "sdot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+                    "sdot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+                    "sdot z24.s, z20.b, z0.b[0]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "sdot z25.s, z20.b, z1.b[0]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                    "sdot z26.s, z20.b, z2.b[0]\n"
+                    "sdot z27.s, z20.b, z3.b[0]\n"
+                    "sdot z28.s, z20.b, z4.b[0]\n"
+                    "sdot z29.s, z20.b, z5.b[0]\n"
+                    "sdot z30.s, z20.b, z6.b[0]\n"
+                    "sdot z31.s, z20.b, z7.b[0]\n"
+                    "sdot z24.s, z21.b, z0.b[1]\n"
+                    "sdot z25.s, z21.b, z1.b[1]\n"
+                    "sdot z26.s, z21.b, z2.b[1]\n"
+                    "sdot z27.s, z21.b, z3.b[1]\n"
+                    "sdot z28.s, z21.b, z4.b[1]\n"
+                    "sdot z29.s, z21.b, z5.b[1]\n"
+                    "sdot z30.s, z21.b, z6.b[1]\n"
+                    "sdot z31.s, z21.b, z7.b[1]\n"
+                    "sdot z24.s, z22.b, z0.b[2]\n"
+                    "sdot z25.s, z22.b, z1.b[2]\n"
+                    "sdot z26.s, z22.b, z2.b[2]\n"
+                    "sdot z27.s, z22.b, z3.b[2]\n"
+                    "sdot z28.s, z22.b, z4.b[2]\n"
+                    "sdot z29.s, z22.b, z5.b[2]\n"
+                    "sdot z30.s, z22.b, z6.b[2]\n"
+                    "sdot z31.s, z22.b, z7.b[2]\n"
+                    "sdot z24.s, z23.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
+                    "sdot z25.s, z23.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
+                    "sdot z26.s, z23.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
+                    "sdot z27.s, z23.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
+                    "sdot z28.s, z23.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
+                    "sdot z29.s, z23.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
+                    "sdot z30.s, z23.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
+                    "sdot z31.s, z23.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z31.s, z16.b, z7.b[0]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "sdot z26.s, z17.b, z2.b[1]\n"
+                    "sdot z27.s, z17.b, z3.b[1]\n"
+                    "sdot z28.s, z17.b, z4.b[1]\n"
+                    "sdot z29.s, z17.b, z5.b[1]\n"
+                    "sdot z30.s, z17.b, z6.b[1]\n"
+                    "sdot z31.s, z17.b, z7.b[1]\n"
+                    "sdot z24.s, z18.b, z0.b[2]\n"
+                    "sdot z25.s, z18.b, z1.b[2]\n"
+                    "sdot z26.s, z18.b, z2.b[2]\n"
+                    "sdot z27.s, z18.b, z3.b[2]\n"
+                    "sdot z28.s, z18.b, z4.b[2]\n"
+                    "sdot z29.s, z18.b, z5.b[2]\n"
+                    "sdot z30.s, z18.b, z6.b[2]\n"
+                    "sdot z31.s, z18.b, z7.b[2]\n"
+                    "sdot z24.s, z19.b, z0.b[3]\n"
+                    "sdot z25.s, z19.b, z1.b[3]\n"
+                    "sdot z26.s, z19.b, z2.b[3]\n"
+                    "sdot z27.s, z19.b, z3.b[3]\n"
+                    "sdot z28.s, z19.b, z4.b[3]\n"
+                    "sdot z29.s, z19.b, z5.b[3]\n"
+                    "sdot z30.s, z19.b, z6.b[3]\n"
+                    "sdot z31.s, z19.b, z7.b[3]\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -4878,52 +5712,54 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "mov z28.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "mov z29.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "mov z30.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "mov z31.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "mov z25.s, #0\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
-                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "mov z26.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
-                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "mov z31.s, #0\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
                     "sdot z28.s, z16.b, z4.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
                     "sdot z29.s, z17.b, z5.b[1]\n"
@@ -5046,82 +5882,80 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z20.b, z5.b[0]\n"
                     "sdot z30.s, z20.b, z6.b[0]\n"
                     "sdot z31.s, z20.b, z7.b[0]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "b.eq 3f\n"
+                    "4:\n"
+                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "addvl c_ptr1, c_ptr1, #1\n"
+                    "mov z25.s, #0\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
+                    "mov z26.s, #0\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "mov z27.s, #0\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "b.eq 3f\n"
-                    "4:\n"
-                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "mov z24.s, #0\n"
+                    "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
-                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
-                    "mov z25.s, #0\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                     "sdot z24.s, z16.b, z0.b[0]\n"
-                    "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
+                    "st1w z30.s, p7, [c_ptr6]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "sdot z25.s, z16.b, z1.b[0]\n"
-                    "st1w z27.s, p7, [c_ptr3]\n"
-                    "mov z27.s, #0\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "sdot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
-                    "st1w z28.s, p7, [c_ptr4]\n"
-                    "mov z28.s, #0\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "st1w z31.s, p7, [c_ptr7]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
                     "sdot z27.s, z16.b, z3.b[0]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
-                    "sdot z25.s, z17.b, z1.b[1]\n"
-                    "st1w z29.s, p7, [c_ptr5]\n"
-                    "mov z29.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "sdot z28.s, z16.b, z4.b[0]\n"
-                    "st1w z30.s, p7, [c_ptr6]\n"
-                    "mov z30.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
-                    "st1w z31.s, p7, [c_ptr7]\n"
-                    "mov z31.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "sdot z29.s, z17.b, z5.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "sdot z30.s, z17.b, z6.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "sdot z31.s, z17.b, z7.b[1]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "sdot z24.s, z18.b, z0.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "sdot z25.s, z18.b, z1.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "sdot z26.s, z18.b, z2.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "sdot z27.s, z18.b, z3.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "sdot z28.s, z18.b, z4.b[2]\n"
                     "sdot z29.s, z18.b, z5.b[2]\n"
                     "sdot z30.s, z18.b, z6.b[2]\n"
@@ -5162,7 +5996,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z21.b, z5.b[1]\n"
                     "sdot z30.s, z21.b, z6.b[1]\n"
                     "sdot z31.s, z21.b, z7.b[1]\n"
-                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "sdot z24.s, z22.b, z0.b[2]\n"
                     "sdot z25.s, z22.b, z1.b[2]\n"
                     "sdot z26.s, z22.b, z2.b[2]\n"
@@ -5171,7 +6004,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z22.b, z5.b[2]\n"
                     "sdot z30.s, z22.b, z6.b[2]\n"
                     "sdot z31.s, z22.b, z7.b[2]\n"
-                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "sdot z24.s, z23.b, z0.b[3]\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
                     "sdot z25.s, z23.b, z1.b[3]\n"
@@ -5189,7 +6021,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z31.s, z23.b, z7.b[3]\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
                     "sdot z24.s, z16.b, z0.b[0]\n"
-                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
                     "sdot z25.s, z16.b, z1.b[0]\n"
                     "sdot z26.s, z16.b, z2.b[0]\n"
                     "sdot z27.s, z16.b, z3.b[0]\n"
@@ -5197,7 +6028,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z16.b, z5.b[0]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "sdot z24.s, z17.b, z0.b[1]\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
@@ -5206,7 +6036,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z17.b, z5.b[1]\n"
                     "sdot z30.s, z17.b, z6.b[1]\n"
                     "sdot z31.s, z17.b, z7.b[1]\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "sdot z24.s, z18.b, z0.b[2]\n"
                     "sdot z25.s, z18.b, z1.b[2]\n"
                     "sdot z26.s, z18.b, z2.b[2]\n"
@@ -5215,7 +6044,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z18.b, z5.b[2]\n"
                     "sdot z30.s, z18.b, z6.b[2]\n"
                     "sdot z31.s, z18.b, z7.b[2]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "sdot z24.s, z19.b, z0.b[3]\n"
                     "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
                     "sdot z25.s, z19.b, z1.b[3]\n"
@@ -5233,7 +6061,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z31.s, z19.b, z7.b[3]\n"
                     "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
                     "sdot z24.s, z20.b, z0.b[0]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "sdot z25.s, z20.b, z1.b[0]\n"
                     "sdot z26.s, z20.b, z2.b[0]\n"
                     "sdot z27.s, z20.b, z3.b[0]\n"
@@ -5241,53 +6068,60 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z20.b, z5.b[0]\n"
                     "sdot z30.s, z20.b, z6.b[0]\n"
                     "sdot z31.s, z20.b, z7.b[0]\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "b.ne 4b\n"
                     "3:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "mov z24.s, #0\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
-                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "addvl c_ptr2, c_ptr2, #1\n"
-                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "mov z26.s, #0\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
                     "mov z27.s, #0\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "sdot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
-                    "sdot z27.s, z16.b, z3.b[0]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
-                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
-                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
                     "addvl c_ptr6, c_ptr6, #1\n"
-                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
@@ -5411,23 +6245,168 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z20.b, z5.b[0]\n"
                     "sdot z30.s, z20.b, z6.b[0]\n"
                     "sdot z31.s, z20.b, z7.b[0]\n"
+                    "b 5f\n"
                     "2:\n"
+                    "mov z24.s, #0\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "mov z25.s, #0\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "mov z26.s, #0\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "mov z29.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "mov z30.s, #0\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "sdot z26.s, z17.b, z2.b[1]\n"
+                    "sdot z27.s, z17.b, z3.b[1]\n"
+                    "sdot z28.s, z17.b, z4.b[1]\n"
+                    "sdot z29.s, z17.b, z5.b[1]\n"
+                    "sdot z30.s, z17.b, z6.b[1]\n"
+                    "sdot z31.s, z17.b, z7.b[1]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "sdot z24.s, z18.b, z0.b[2]\n"
+                    "sdot z25.s, z18.b, z1.b[2]\n"
+                    "sdot z26.s, z18.b, z2.b[2]\n"
+                    "sdot z27.s, z18.b, z3.b[2]\n"
+                    "sdot z28.s, z18.b, z4.b[2]\n"
+                    "sdot z29.s, z18.b, z5.b[2]\n"
+                    "sdot z30.s, z18.b, z6.b[2]\n"
+                    "sdot z31.s, z18.b, z7.b[2]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "sdot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+                    "sdot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+                    "sdot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+                    "sdot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+                    "sdot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+                    "sdot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+                    "sdot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+                    "sdot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+                    "sdot z24.s, z20.b, z0.b[0]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "sdot z25.s, z20.b, z1.b[0]\n"
+                    "sdot z26.s, z20.b, z2.b[0]\n"
+                    "sdot z27.s, z20.b, z3.b[0]\n"
+                    "sdot z28.s, z20.b, z4.b[0]\n"
+                    "sdot z29.s, z20.b, z5.b[0]\n"
+                    "sdot z30.s, z20.b, z6.b[0]\n"
+                    "sdot z31.s, z20.b, z7.b[0]\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "sdot z24.s, z21.b, z0.b[1]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #5\n"
+                    "sdot z25.s, z21.b, z1.b[1]\n"
+                    "sdot z26.s, z21.b, z2.b[1]\n"
+                    "sdot z27.s, z21.b, z3.b[1]\n"
+                    "sdot z28.s, z21.b, z4.b[1]\n"
+                    "sdot z29.s, z21.b, z5.b[1]\n"
+                    "sdot z30.s, z21.b, z6.b[1]\n"
+                    "sdot z31.s, z21.b, z7.b[1]\n"
+                    "sdot z24.s, z22.b, z0.b[2]\n"
+                    "sdot z25.s, z22.b, z1.b[2]\n"
+                    "sdot z26.s, z22.b, z2.b[2]\n"
+                    "sdot z27.s, z22.b, z3.b[2]\n"
+                    "sdot z28.s, z22.b, z4.b[2]\n"
+                    "sdot z29.s, z22.b, z5.b[2]\n"
+                    "sdot z30.s, z22.b, z6.b[2]\n"
+                    "sdot z31.s, z22.b, z7.b[2]\n"
+                    "sdot z24.s, z23.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
+                    "sdot z25.s, z23.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
+                    "sdot z26.s, z23.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
+                    "sdot z27.s, z23.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
+                    "sdot z28.s, z23.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
+                    "sdot z29.s, z23.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
+                    "sdot z30.s, z23.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
+                    "sdot z31.s, z23.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z31.s, z16.b, z7.b[0]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "sdot z26.s, z17.b, z2.b[1]\n"
+                    "sdot z27.s, z17.b, z3.b[1]\n"
+                    "sdot z28.s, z17.b, z4.b[1]\n"
+                    "sdot z29.s, z17.b, z5.b[1]\n"
+                    "sdot z30.s, z17.b, z6.b[1]\n"
+                    "sdot z31.s, z17.b, z7.b[1]\n"
+                    "sdot z24.s, z18.b, z0.b[2]\n"
+                    "sdot z25.s, z18.b, z1.b[2]\n"
+                    "sdot z26.s, z18.b, z2.b[2]\n"
+                    "sdot z27.s, z18.b, z3.b[2]\n"
+                    "sdot z28.s, z18.b, z4.b[2]\n"
+                    "sdot z29.s, z18.b, z5.b[2]\n"
+                    "sdot z30.s, z18.b, z6.b[2]\n"
+                    "sdot z31.s, z18.b, z7.b[2]\n"
+                    "sdot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
+                    "sdot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
+                    "sdot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
+                    "sdot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
+                    "sdot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
+                    "sdot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
+                    "sdot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
+                    "sdot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
+                    "sdot z24.s, z20.b, z0.b[0]\n"
+                    "sdot z25.s, z20.b, z1.b[0]\n"
+                    "sdot z26.s, z20.b, z2.b[0]\n"
+                    "sdot z27.s, z20.b, z3.b[0]\n"
+                    "sdot z28.s, z20.b, z4.b[0]\n"
+                    "sdot z29.s, z20.b, z5.b[0]\n"
+                    "sdot z30.s, z20.b, z6.b[0]\n"
+                    "sdot z31.s, z20.b, z7.b[0]\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -5506,52 +6485,54 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "mov z28.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "mov z29.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "mov z30.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "mov z31.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "mov z25.s, #0\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
-                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "mov z26.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
-                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "mov z31.s, #0\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
                     "sdot z28.s, z16.b, z4.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
                     "sdot z29.s, z17.b, z5.b[1]\n"
@@ -5683,82 +6664,80 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z21.b, z5.b[1]\n"
                     "sdot z30.s, z21.b, z6.b[1]\n"
                     "sdot z31.s, z21.b, z7.b[1]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "b.eq 3f\n"
+                    "4:\n"
+                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "addvl c_ptr1, c_ptr1, #1\n"
+                    "mov z25.s, #0\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
+                    "mov z26.s, #0\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "mov z27.s, #0\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "b.eq 3f\n"
-                    "4:\n"
-                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "mov z24.s, #0\n"
+                    "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
-                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
-                    "mov z25.s, #0\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                     "sdot z24.s, z16.b, z0.b[0]\n"
-                    "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
+                    "st1w z30.s, p7, [c_ptr6]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "sdot z25.s, z16.b, z1.b[0]\n"
-                    "st1w z27.s, p7, [c_ptr3]\n"
-                    "mov z27.s, #0\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "sdot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
-                    "st1w z28.s, p7, [c_ptr4]\n"
-                    "mov z28.s, #0\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "st1w z31.s, p7, [c_ptr7]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
                     "sdot z27.s, z16.b, z3.b[0]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
-                    "sdot z25.s, z17.b, z1.b[1]\n"
-                    "st1w z29.s, p7, [c_ptr5]\n"
-                    "mov z29.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "sdot z28.s, z16.b, z4.b[0]\n"
-                    "st1w z30.s, p7, [c_ptr6]\n"
-                    "mov z30.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
-                    "st1w z31.s, p7, [c_ptr7]\n"
-                    "mov z31.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "sdot z29.s, z17.b, z5.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "sdot z30.s, z17.b, z6.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "sdot z31.s, z17.b, z7.b[1]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "sdot z24.s, z18.b, z0.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "sdot z25.s, z18.b, z1.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "sdot z26.s, z18.b, z2.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "sdot z27.s, z18.b, z3.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "sdot z28.s, z18.b, z4.b[2]\n"
                     "sdot z29.s, z18.b, z5.b[2]\n"
                     "sdot z30.s, z18.b, z6.b[2]\n"
@@ -5808,7 +6787,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z22.b, z5.b[2]\n"
                     "sdot z30.s, z22.b, z6.b[2]\n"
                     "sdot z31.s, z22.b, z7.b[2]\n"
-                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "sdot z24.s, z23.b, z0.b[3]\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
                     "sdot z25.s, z23.b, z1.b[3]\n"
@@ -5826,7 +6804,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z31.s, z23.b, z7.b[3]\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
                     "sdot z24.s, z16.b, z0.b[0]\n"
-                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
                     "sdot z25.s, z16.b, z1.b[0]\n"
                     "sdot z26.s, z16.b, z2.b[0]\n"
                     "sdot z27.s, z16.b, z3.b[0]\n"
@@ -5834,7 +6811,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z16.b, z5.b[0]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "sdot z24.s, z17.b, z0.b[1]\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
@@ -5843,7 +6819,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z17.b, z5.b[1]\n"
                     "sdot z30.s, z17.b, z6.b[1]\n"
                     "sdot z31.s, z17.b, z7.b[1]\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "sdot z24.s, z18.b, z0.b[2]\n"
                     "sdot z25.s, z18.b, z1.b[2]\n"
                     "sdot z26.s, z18.b, z2.b[2]\n"
@@ -5852,7 +6827,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z18.b, z5.b[2]\n"
                     "sdot z30.s, z18.b, z6.b[2]\n"
                     "sdot z31.s, z18.b, z7.b[2]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "sdot z24.s, z19.b, z0.b[3]\n"
                     "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
                     "sdot z25.s, z19.b, z1.b[3]\n"
@@ -5870,7 +6844,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z31.s, z19.b, z7.b[3]\n"
                     "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
                     "sdot z24.s, z20.b, z0.b[0]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "sdot z25.s, z20.b, z1.b[0]\n"
                     "sdot z26.s, z20.b, z2.b[0]\n"
                     "sdot z27.s, z20.b, z3.b[0]\n"
@@ -5878,7 +6851,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z20.b, z5.b[0]\n"
                     "sdot z30.s, z20.b, z6.b[0]\n"
                     "sdot z31.s, z20.b, z7.b[0]\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "sdot z24.s, z21.b, z0.b[1]\n"
                     "sdot z25.s, z21.b, z1.b[1]\n"
                     "sdot z26.s, z21.b, z2.b[1]\n"
@@ -5887,53 +6859,60 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z21.b, z5.b[1]\n"
                     "sdot z30.s, z21.b, z6.b[1]\n"
                     "sdot z31.s, z21.b, z7.b[1]\n"
-                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "b.ne 4b\n"
                     "3:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "mov z24.s, #0\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
-                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "addvl c_ptr2, c_ptr2, #1\n"
-                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "mov z26.s, #0\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
                     "mov z27.s, #0\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "sdot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
-                    "sdot z27.s, z16.b, z3.b[0]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
-                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
-                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
                     "addvl c_ptr6, c_ptr6, #1\n"
-                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
@@ -6066,23 +7045,177 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z21.b, z5.b[1]\n"
                     "sdot z30.s, z21.b, z6.b[1]\n"
                     "sdot z31.s, z21.b, z7.b[1]\n"
+                    "b 5f\n"
                     "2:\n"
+                    "mov z24.s, #0\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "mov z25.s, #0\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "mov z26.s, #0\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "mov z29.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "mov z30.s, #0\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "sdot z26.s, z17.b, z2.b[1]\n"
+                    "sdot z27.s, z17.b, z3.b[1]\n"
+                    "sdot z28.s, z17.b, z4.b[1]\n"
+                    "sdot z29.s, z17.b, z5.b[1]\n"
+                    "sdot z30.s, z17.b, z6.b[1]\n"
+                    "sdot z31.s, z17.b, z7.b[1]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "sdot z24.s, z18.b, z0.b[2]\n"
+                    "sdot z25.s, z18.b, z1.b[2]\n"
+                    "sdot z26.s, z18.b, z2.b[2]\n"
+                    "sdot z27.s, z18.b, z3.b[2]\n"
+                    "sdot z28.s, z18.b, z4.b[2]\n"
+                    "sdot z29.s, z18.b, z5.b[2]\n"
+                    "sdot z30.s, z18.b, z6.b[2]\n"
+                    "sdot z31.s, z18.b, z7.b[2]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "sdot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+                    "sdot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+                    "sdot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+                    "sdot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+                    "sdot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+                    "sdot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+                    "sdot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+                    "sdot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+                    "sdot z24.s, z20.b, z0.b[0]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "sdot z25.s, z20.b, z1.b[0]\n"
+                    "sdot z26.s, z20.b, z2.b[0]\n"
+                    "sdot z27.s, z20.b, z3.b[0]\n"
+                    "sdot z28.s, z20.b, z4.b[0]\n"
+                    "sdot z29.s, z20.b, z5.b[0]\n"
+                    "sdot z30.s, z20.b, z6.b[0]\n"
+                    "sdot z31.s, z20.b, z7.b[0]\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "sdot z24.s, z21.b, z0.b[1]\n"
+                    "sdot z25.s, z21.b, z1.b[1]\n"
+                    "sdot z26.s, z21.b, z2.b[1]\n"
+                    "sdot z27.s, z21.b, z3.b[1]\n"
+                    "sdot z28.s, z21.b, z4.b[1]\n"
+                    "sdot z29.s, z21.b, z5.b[1]\n"
+                    "sdot z30.s, z21.b, z6.b[1]\n"
+                    "sdot z31.s, z21.b, z7.b[1]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "sdot z24.s, z22.b, z0.b[2]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #6\n"
+                    "sdot z25.s, z22.b, z1.b[2]\n"
+                    "sdot z26.s, z22.b, z2.b[2]\n"
+                    "sdot z27.s, z22.b, z3.b[2]\n"
+                    "sdot z28.s, z22.b, z4.b[2]\n"
+                    "sdot z29.s, z22.b, z5.b[2]\n"
+                    "sdot z30.s, z22.b, z6.b[2]\n"
+                    "sdot z31.s, z22.b, z7.b[2]\n"
+                    "sdot z24.s, z23.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
+                    "sdot z25.s, z23.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
+                    "sdot z26.s, z23.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
+                    "sdot z27.s, z23.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
+                    "sdot z28.s, z23.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
+                    "sdot z29.s, z23.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
+                    "sdot z30.s, z23.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
+                    "sdot z31.s, z23.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z31.s, z16.b, z7.b[0]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "sdot z26.s, z17.b, z2.b[1]\n"
+                    "sdot z27.s, z17.b, z3.b[1]\n"
+                    "sdot z28.s, z17.b, z4.b[1]\n"
+                    "sdot z29.s, z17.b, z5.b[1]\n"
+                    "sdot z30.s, z17.b, z6.b[1]\n"
+                    "sdot z31.s, z17.b, z7.b[1]\n"
+                    "sdot z24.s, z18.b, z0.b[2]\n"
+                    "sdot z25.s, z18.b, z1.b[2]\n"
+                    "sdot z26.s, z18.b, z2.b[2]\n"
+                    "sdot z27.s, z18.b, z3.b[2]\n"
+                    "sdot z28.s, z18.b, z4.b[2]\n"
+                    "sdot z29.s, z18.b, z5.b[2]\n"
+                    "sdot z30.s, z18.b, z6.b[2]\n"
+                    "sdot z31.s, z18.b, z7.b[2]\n"
+                    "sdot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
+                    "sdot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
+                    "sdot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
+                    "sdot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
+                    "sdot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
+                    "sdot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
+                    "sdot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
+                    "sdot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
+                    "sdot z24.s, z20.b, z0.b[0]\n"
+                    "sdot z25.s, z20.b, z1.b[0]\n"
+                    "sdot z26.s, z20.b, z2.b[0]\n"
+                    "sdot z27.s, z20.b, z3.b[0]\n"
+                    "sdot z28.s, z20.b, z4.b[0]\n"
+                    "sdot z29.s, z20.b, z5.b[0]\n"
+                    "sdot z30.s, z20.b, z6.b[0]\n"
+                    "sdot z31.s, z20.b, z7.b[0]\n"
+                    "sdot z24.s, z21.b, z0.b[1]\n"
+                    "sdot z25.s, z21.b, z1.b[1]\n"
+                    "sdot z26.s, z21.b, z2.b[1]\n"
+                    "sdot z27.s, z21.b, z3.b[1]\n"
+                    "sdot z28.s, z21.b, z4.b[1]\n"
+                    "sdot z29.s, z21.b, z5.b[1]\n"
+                    "sdot z30.s, z21.b, z6.b[1]\n"
+                    "sdot z31.s, z21.b, z7.b[1]\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -6161,52 +7294,54 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "mov z28.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "mov z29.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "mov z30.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "mov z31.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "mov z25.s, #0\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
-                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "mov z26.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
-                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "mov z31.s, #0\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
                     "sdot z28.s, z16.b, z4.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
                     "sdot z29.s, z17.b, z5.b[1]\n"
@@ -6347,82 +7482,80 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z22.b, z5.b[2]\n"
                     "sdot z30.s, z22.b, z6.b[2]\n"
                     "sdot z31.s, z22.b, z7.b[2]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "b.eq 3f\n"
+                    "4:\n"
+                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "addvl c_ptr1, c_ptr1, #1\n"
+                    "mov z25.s, #0\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
+                    "mov z26.s, #0\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "mov z27.s, #0\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "b.eq 3f\n"
-                    "4:\n"
-                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "mov z24.s, #0\n"
+                    "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
-                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
-                    "mov z25.s, #0\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                     "sdot z24.s, z16.b, z0.b[0]\n"
-                    "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
+                    "st1w z30.s, p7, [c_ptr6]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "sdot z25.s, z16.b, z1.b[0]\n"
-                    "st1w z27.s, p7, [c_ptr3]\n"
-                    "mov z27.s, #0\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "sdot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
-                    "st1w z28.s, p7, [c_ptr4]\n"
-                    "mov z28.s, #0\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "st1w z31.s, p7, [c_ptr7]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
                     "sdot z27.s, z16.b, z3.b[0]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
-                    "sdot z25.s, z17.b, z1.b[1]\n"
-                    "st1w z29.s, p7, [c_ptr5]\n"
-                    "mov z29.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "sdot z28.s, z16.b, z4.b[0]\n"
-                    "st1w z30.s, p7, [c_ptr6]\n"
-                    "mov z30.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
-                    "st1w z31.s, p7, [c_ptr7]\n"
-                    "mov z31.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "sdot z29.s, z17.b, z5.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "sdot z30.s, z17.b, z6.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "sdot z31.s, z17.b, z7.b[1]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "sdot z24.s, z18.b, z0.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "sdot z25.s, z18.b, z1.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "sdot z26.s, z18.b, z2.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "sdot z27.s, z18.b, z3.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "sdot z28.s, z18.b, z4.b[2]\n"
                     "sdot z29.s, z18.b, z5.b[2]\n"
                     "sdot z30.s, z18.b, z6.b[2]\n"
@@ -6493,12 +7626,10 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z25.s, z16.b, z1.b[0]\n"
                     "sdot z26.s, z16.b, z2.b[0]\n"
                     "sdot z27.s, z16.b, z3.b[0]\n"
-                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
                     "sdot z28.s, z16.b, z4.b[0]\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "sdot z24.s, z17.b, z0.b[1]\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
@@ -6507,7 +7638,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z17.b, z5.b[1]\n"
                     "sdot z30.s, z17.b, z6.b[1]\n"
                     "sdot z31.s, z17.b, z7.b[1]\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "sdot z24.s, z18.b, z0.b[2]\n"
                     "sdot z25.s, z18.b, z1.b[2]\n"
                     "sdot z26.s, z18.b, z2.b[2]\n"
@@ -6516,7 +7646,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z18.b, z5.b[2]\n"
                     "sdot z30.s, z18.b, z6.b[2]\n"
                     "sdot z31.s, z18.b, z7.b[2]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "sdot z24.s, z19.b, z0.b[3]\n"
                     "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
                     "sdot z25.s, z19.b, z1.b[3]\n"
@@ -6534,7 +7663,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z31.s, z19.b, z7.b[3]\n"
                     "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
                     "sdot z24.s, z20.b, z0.b[0]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "sdot z25.s, z20.b, z1.b[0]\n"
                     "sdot z26.s, z20.b, z2.b[0]\n"
                     "sdot z27.s, z20.b, z3.b[0]\n"
@@ -6542,7 +7670,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z20.b, z5.b[0]\n"
                     "sdot z30.s, z20.b, z6.b[0]\n"
                     "sdot z31.s, z20.b, z7.b[0]\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "sdot z24.s, z21.b, z0.b[1]\n"
                     "sdot z25.s, z21.b, z1.b[1]\n"
                     "sdot z26.s, z21.b, z2.b[1]\n"
@@ -6551,7 +7678,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z21.b, z5.b[1]\n"
                     "sdot z30.s, z21.b, z6.b[1]\n"
                     "sdot z31.s, z21.b, z7.b[1]\n"
-                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "sdot z24.s, z22.b, z0.b[2]\n"
                     "sdot z25.s, z22.b, z1.b[2]\n"
                     "sdot z26.s, z22.b, z2.b[2]\n"
@@ -6560,53 +7686,60 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z22.b, z5.b[2]\n"
                     "sdot z30.s, z22.b, z6.b[2]\n"
                     "sdot z31.s, z22.b, z7.b[2]\n"
-                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "b.ne 4b\n"
                     "3:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "mov z24.s, #0\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
-                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "addvl c_ptr2, c_ptr2, #1\n"
-                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "mov z26.s, #0\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
                     "mov z27.s, #0\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "sdot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
-                    "sdot z27.s, z16.b, z3.b[0]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
-                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
-                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
                     "addvl c_ptr6, c_ptr6, #1\n"
-                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
@@ -6748,23 +7881,186 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z22.b, z5.b[2]\n"
                     "sdot z30.s, z22.b, z6.b[2]\n"
                     "sdot z31.s, z22.b, z7.b[2]\n"
+                    "b 5f\n"
                     "2:\n"
+                    "mov z24.s, #0\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "mov z25.s, #0\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "mov z26.s, #0\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "mov z29.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "mov z30.s, #0\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "sdot z26.s, z17.b, z2.b[1]\n"
+                    "sdot z27.s, z17.b, z3.b[1]\n"
+                    "sdot z28.s, z17.b, z4.b[1]\n"
+                    "sdot z29.s, z17.b, z5.b[1]\n"
+                    "sdot z30.s, z17.b, z6.b[1]\n"
+                    "sdot z31.s, z17.b, z7.b[1]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "sdot z24.s, z18.b, z0.b[2]\n"
+                    "sdot z25.s, z18.b, z1.b[2]\n"
+                    "sdot z26.s, z18.b, z2.b[2]\n"
+                    "sdot z27.s, z18.b, z3.b[2]\n"
+                    "sdot z28.s, z18.b, z4.b[2]\n"
+                    "sdot z29.s, z18.b, z5.b[2]\n"
+                    "sdot z30.s, z18.b, z6.b[2]\n"
+                    "sdot z31.s, z18.b, z7.b[2]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "sdot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+                    "sdot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+                    "sdot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+                    "sdot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+                    "sdot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+                    "sdot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+                    "sdot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+                    "sdot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+                    "sdot z24.s, z20.b, z0.b[0]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "sdot z25.s, z20.b, z1.b[0]\n"
+                    "sdot z26.s, z20.b, z2.b[0]\n"
+                    "sdot z27.s, z20.b, z3.b[0]\n"
+                    "sdot z28.s, z20.b, z4.b[0]\n"
+                    "sdot z29.s, z20.b, z5.b[0]\n"
+                    "sdot z30.s, z20.b, z6.b[0]\n"
+                    "sdot z31.s, z20.b, z7.b[0]\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "sdot z24.s, z21.b, z0.b[1]\n"
+                    "sdot z25.s, z21.b, z1.b[1]\n"
+                    "sdot z26.s, z21.b, z2.b[1]\n"
+                    "sdot z27.s, z21.b, z3.b[1]\n"
+                    "sdot z28.s, z21.b, z4.b[1]\n"
+                    "sdot z29.s, z21.b, z5.b[1]\n"
+                    "sdot z30.s, z21.b, z6.b[1]\n"
+                    "sdot z31.s, z21.b, z7.b[1]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "sdot z24.s, z22.b, z0.b[2]\n"
+                    "sdot z25.s, z22.b, z1.b[2]\n"
+                    "sdot z26.s, z22.b, z2.b[2]\n"
+                    "sdot z27.s, z22.b, z3.b[2]\n"
+                    "sdot z28.s, z22.b, z4.b[2]\n"
+                    "sdot z29.s, z22.b, z5.b[2]\n"
+                    "sdot z30.s, z22.b, z6.b[2]\n"
+                    "sdot z31.s, z22.b, z7.b[2]\n"
+                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "sdot z24.s, z23.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
+                    "sdot z25.s, z23.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
+                    "sdot z26.s, z23.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
+                    "sdot z27.s, z23.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
+                    "sdot z28.s, z23.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
+                    "sdot z29.s, z23.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
+                    "sdot z30.s, z23.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
+                    "sdot z31.s, z23.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #7\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z31.s, z16.b, z7.b[0]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "sdot z26.s, z17.b, z2.b[1]\n"
+                    "sdot z27.s, z17.b, z3.b[1]\n"
+                    "sdot z28.s, z17.b, z4.b[1]\n"
+                    "sdot z29.s, z17.b, z5.b[1]\n"
+                    "sdot z30.s, z17.b, z6.b[1]\n"
+                    "sdot z31.s, z17.b, z7.b[1]\n"
+                    "sdot z24.s, z18.b, z0.b[2]\n"
+                    "sdot z25.s, z18.b, z1.b[2]\n"
+                    "sdot z26.s, z18.b, z2.b[2]\n"
+                    "sdot z27.s, z18.b, z3.b[2]\n"
+                    "sdot z28.s, z18.b, z4.b[2]\n"
+                    "sdot z29.s, z18.b, z5.b[2]\n"
+                    "sdot z30.s, z18.b, z6.b[2]\n"
+                    "sdot z31.s, z18.b, z7.b[2]\n"
+                    "sdot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
+                    "sdot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
+                    "sdot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
+                    "sdot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
+                    "sdot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
+                    "sdot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
+                    "sdot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
+                    "sdot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
+                    "sdot z24.s, z20.b, z0.b[0]\n"
+                    "sdot z25.s, z20.b, z1.b[0]\n"
+                    "sdot z26.s, z20.b, z2.b[0]\n"
+                    "sdot z27.s, z20.b, z3.b[0]\n"
+                    "sdot z28.s, z20.b, z4.b[0]\n"
+                    "sdot z29.s, z20.b, z5.b[0]\n"
+                    "sdot z30.s, z20.b, z6.b[0]\n"
+                    "sdot z31.s, z20.b, z7.b[0]\n"
+                    "sdot z24.s, z21.b, z0.b[1]\n"
+                    "sdot z25.s, z21.b, z1.b[1]\n"
+                    "sdot z26.s, z21.b, z2.b[1]\n"
+                    "sdot z27.s, z21.b, z3.b[1]\n"
+                    "sdot z28.s, z21.b, z4.b[1]\n"
+                    "sdot z29.s, z21.b, z5.b[1]\n"
+                    "sdot z30.s, z21.b, z6.b[1]\n"
+                    "sdot z31.s, z21.b, z7.b[1]\n"
+                    "sdot z24.s, z22.b, z0.b[2]\n"
+                    "sdot z25.s, z22.b, z1.b[2]\n"
+                    "sdot z26.s, z22.b, z2.b[2]\n"
+                    "sdot z27.s, z22.b, z3.b[2]\n"
+                    "sdot z28.s, z22.b, z4.b[2]\n"
+                    "sdot z29.s, z22.b, z5.b[2]\n"
+                    "sdot z30.s, z22.b, z6.b[2]\n"
+                    "sdot z31.s, z22.b, z7.b[2]\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -6844,52 +8140,54 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "mov z28.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "mov z29.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "mov z30.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "mov z31.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "mov z25.s, #0\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
-                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "mov z26.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
-                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "mov z31.s, #0\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
                     "sdot z28.s, z16.b, z4.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
                     "sdot z29.s, z17.b, z5.b[1]\n"
@@ -7039,83 +8337,81 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z23.b, z5.b[3]\n"
                     "sdot z30.s, z23.b, z6.b[3]\n"
                     "sdot z31.s, z23.b, z7.b[3]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "b.eq 3f\n"
                     "4:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "mov z24.s, #0\n"
-                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
                     "mov z26.s, #0\n"
-                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
-                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
                     "mov z27.s, #0\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "sdot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "sdot z27.s, z16.b, z3.b[0]\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
-                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
-                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "sdot z29.s, z17.b, z5.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "sdot z30.s, z17.b, z6.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "sdot z31.s, z17.b, z7.b[1]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "sdot z24.s, z18.b, z0.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "sdot z25.s, z18.b, z1.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "sdot z26.s, z18.b, z2.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "sdot z27.s, z18.b, z3.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "sdot z28.s, z18.b, z4.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "sdot z29.s, z18.b, z5.b[2]\n"
                     "sdot z30.s, z18.b, z6.b[2]\n"
                     "sdot z31.s, z18.b, z7.b[2]\n"
@@ -7190,7 +8486,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z16.b, z5.b[0]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "sdot z24.s, z17.b, z0.b[1]\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
@@ -7199,7 +8494,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z17.b, z5.b[1]\n"
                     "sdot z30.s, z17.b, z6.b[1]\n"
                     "sdot z31.s, z17.b, z7.b[1]\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "sdot z24.s, z18.b, z0.b[2]\n"
                     "sdot z25.s, z18.b, z1.b[2]\n"
                     "sdot z26.s, z18.b, z2.b[2]\n"
@@ -7208,7 +8502,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z18.b, z5.b[2]\n"
                     "sdot z30.s, z18.b, z6.b[2]\n"
                     "sdot z31.s, z18.b, z7.b[2]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "sdot z24.s, z19.b, z0.b[3]\n"
                     "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
                     "sdot z25.s, z19.b, z1.b[3]\n"
@@ -7226,7 +8519,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z31.s, z19.b, z7.b[3]\n"
                     "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
                     "sdot z24.s, z20.b, z0.b[0]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "sdot z25.s, z20.b, z1.b[0]\n"
                     "sdot z26.s, z20.b, z2.b[0]\n"
                     "sdot z27.s, z20.b, z3.b[0]\n"
@@ -7234,7 +8526,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z20.b, z5.b[0]\n"
                     "sdot z30.s, z20.b, z6.b[0]\n"
                     "sdot z31.s, z20.b, z7.b[0]\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "sdot z24.s, z21.b, z0.b[1]\n"
                     "sdot z25.s, z21.b, z1.b[1]\n"
                     "sdot z26.s, z21.b, z2.b[1]\n"
@@ -7243,7 +8534,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z21.b, z5.b[1]\n"
                     "sdot z30.s, z21.b, z6.b[1]\n"
                     "sdot z31.s, z21.b, z7.b[1]\n"
-                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "sdot z24.s, z22.b, z0.b[2]\n"
                     "sdot z25.s, z22.b, z1.b[2]\n"
                     "sdot z26.s, z22.b, z2.b[2]\n"
@@ -7252,7 +8542,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z22.b, z5.b[2]\n"
                     "sdot z30.s, z22.b, z6.b[2]\n"
                     "sdot z31.s, z22.b, z7.b[2]\n"
-                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "sdot z24.s, z23.b, z0.b[3]\n"
                     "sdot z25.s, z23.b, z1.b[3]\n"
                     "sdot z26.s, z23.b, z2.b[3]\n"
@@ -7266,49 +8555,235 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "mov z24.s, #0\n"
-                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
                     "mov z26.s, #0\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "mov z27.s, #0\n"
+                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                    "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "st1w z30.s, p7, [c_ptr6]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "sdot z25.s, z16.b, z1.b[0]\n"
-                    "st1w z27.s, p7, [c_ptr3]\n"
-                    "mov z27.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "sdot z26.s, z16.b, z2.b[0]\n"
+                    "st1w z31.s, p7, [c_ptr7]\n"
+                    "mov z31.s, #0\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "sdot z24.s, z17.b, z0.b[1]\n"
-                    "st1w z28.s, p7, [c_ptr4]\n"
-                    "mov z28.s, #0\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "sdot z27.s, z16.b, z3.b[0]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "sdot z25.s, z17.b, z1.b[1]\n"
-                    "st1w z29.s, p7, [c_ptr5]\n"
-                    "mov z29.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "sdot z26.s, z17.b, z2.b[1]\n"
+                    "sdot z27.s, z17.b, z3.b[1]\n"
+                    "sdot z28.s, z17.b, z4.b[1]\n"
+                    "sdot z29.s, z17.b, z5.b[1]\n"
+                    "sdot z30.s, z17.b, z6.b[1]\n"
+                    "sdot z31.s, z17.b, z7.b[1]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "sdot z24.s, z18.b, z0.b[2]\n"
+                    "sdot z25.s, z18.b, z1.b[2]\n"
+                    "sdot z26.s, z18.b, z2.b[2]\n"
+                    "sdot z27.s, z18.b, z3.b[2]\n"
+                    "sdot z28.s, z18.b, z4.b[2]\n"
+                    "sdot z29.s, z18.b, z5.b[2]\n"
+                    "sdot z30.s, z18.b, z6.b[2]\n"
+                    "sdot z31.s, z18.b, z7.b[2]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "sdot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+                    "sdot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+                    "sdot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+                    "sdot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+                    "sdot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+                    "sdot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+                    "sdot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+                    "sdot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+                    "sdot z24.s, z20.b, z0.b[0]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "sdot z25.s, z20.b, z1.b[0]\n"
+                    "sdot z26.s, z20.b, z2.b[0]\n"
+                    "sdot z27.s, z20.b, z3.b[0]\n"
+                    "sdot z28.s, z20.b, z4.b[0]\n"
+                    "sdot z29.s, z20.b, z5.b[0]\n"
+                    "sdot z30.s, z20.b, z6.b[0]\n"
+                    "sdot z31.s, z20.b, z7.b[0]\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "sdot z24.s, z21.b, z0.b[1]\n"
+                    "sdot z25.s, z21.b, z1.b[1]\n"
+                    "sdot z26.s, z21.b, z2.b[1]\n"
+                    "sdot z27.s, z21.b, z3.b[1]\n"
+                    "sdot z28.s, z21.b, z4.b[1]\n"
+                    "sdot z29.s, z21.b, z5.b[1]\n"
+                    "sdot z30.s, z21.b, z6.b[1]\n"
+                    "sdot z31.s, z21.b, z7.b[1]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "sdot z24.s, z22.b, z0.b[2]\n"
+                    "sdot z25.s, z22.b, z1.b[2]\n"
+                    "sdot z26.s, z22.b, z2.b[2]\n"
+                    "sdot z27.s, z22.b, z3.b[2]\n"
+                    "sdot z28.s, z22.b, z4.b[2]\n"
+                    "sdot z29.s, z22.b, z5.b[2]\n"
+                    "sdot z30.s, z22.b, z6.b[2]\n"
+                    "sdot z31.s, z22.b, z7.b[2]\n"
+                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "sdot z24.s, z23.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
+                    "sdot z25.s, z23.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
+                    "sdot z26.s, z23.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
+                    "sdot z27.s, z23.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
+                    "sdot z28.s, z23.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
+                    "sdot z29.s, z23.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
+                    "sdot z30.s, z23.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
+                    "sdot z31.s, z23.b, z7.b[3]\n"
+                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
                     "sdot z28.s, z16.b, z4.b[0]\n"
-                    "st1w z30.s, p7, [c_ptr6]\n"
-                    "mov z30.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "sdot z29.s, z16.b, z5.b[0]\n"
-                    "st1w z31.s, p7, [c_ptr7]\n"
+                    "sdot z30.s, z16.b, z6.b[0]\n"
+                    "sdot z31.s, z16.b, z7.b[0]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
+                    "sdot z26.s, z17.b, z2.b[1]\n"
+                    "sdot z27.s, z17.b, z3.b[1]\n"
+                    "sdot z28.s, z17.b, z4.b[1]\n"
+                    "sdot z29.s, z17.b, z5.b[1]\n"
+                    "sdot z30.s, z17.b, z6.b[1]\n"
+                    "sdot z31.s, z17.b, z7.b[1]\n"
+                    "sdot z24.s, z18.b, z0.b[2]\n"
+                    "sdot z25.s, z18.b, z1.b[2]\n"
+                    "sdot z26.s, z18.b, z2.b[2]\n"
+                    "sdot z27.s, z18.b, z3.b[2]\n"
+                    "sdot z28.s, z18.b, z4.b[2]\n"
+                    "sdot z29.s, z18.b, z5.b[2]\n"
+                    "sdot z30.s, z18.b, z6.b[2]\n"
+                    "sdot z31.s, z18.b, z7.b[2]\n"
+                    "sdot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
+                    "sdot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
+                    "sdot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
+                    "sdot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
+                    "sdot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
+                    "sdot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
+                    "sdot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
+                    "sdot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
+                    "sdot z24.s, z20.b, z0.b[0]\n"
+                    "sdot z25.s, z20.b, z1.b[0]\n"
+                    "sdot z26.s, z20.b, z2.b[0]\n"
+                    "sdot z27.s, z20.b, z3.b[0]\n"
+                    "sdot z28.s, z20.b, z4.b[0]\n"
+                    "sdot z29.s, z20.b, z5.b[0]\n"
+                    "sdot z30.s, z20.b, z6.b[0]\n"
+                    "sdot z31.s, z20.b, z7.b[0]\n"
+                    "sdot z24.s, z21.b, z0.b[1]\n"
+                    "sdot z25.s, z21.b, z1.b[1]\n"
+                    "sdot z26.s, z21.b, z2.b[1]\n"
+                    "sdot z27.s, z21.b, z3.b[1]\n"
+                    "sdot z28.s, z21.b, z4.b[1]\n"
+                    "sdot z29.s, z21.b, z5.b[1]\n"
+                    "sdot z30.s, z21.b, z6.b[1]\n"
+                    "sdot z31.s, z21.b, z7.b[1]\n"
+                    "sdot z24.s, z22.b, z0.b[2]\n"
+                    "sdot z25.s, z22.b, z1.b[2]\n"
+                    "sdot z26.s, z22.b, z2.b[2]\n"
+                    "sdot z27.s, z22.b, z3.b[2]\n"
+                    "sdot z28.s, z22.b, z4.b[2]\n"
+                    "sdot z29.s, z22.b, z5.b[2]\n"
+                    "sdot z30.s, z22.b, z6.b[2]\n"
+                    "sdot z31.s, z22.b, z7.b[2]\n"
+                    "sdot z24.s, z23.b, z0.b[3]\n"
+                    "sdot z25.s, z23.b, z1.b[3]\n"
+                    "sdot z26.s, z23.b, z2.b[3]\n"
+                    "sdot z27.s, z23.b, z3.b[3]\n"
+                    "sdot z28.s, z23.b, z4.b[3]\n"
+                    "sdot z29.s, z23.b, z5.b[3]\n"
+                    "sdot z30.s, z23.b, z6.b[3]\n"
+                    "sdot z31.s, z23.b, z7.b[3]\n"
+                    "b 5f\n"
+                    "2:\n"
+                    "mov z24.s, #0\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "mov z25.s, #0\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "mov z26.s, #0\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "mov z29.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "mov z30.s, #0\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "mov z31.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "sdot z24.s, z16.b, z0.b[0]\n"
+                    "sdot z25.s, z16.b, z1.b[0]\n"
+                    "sdot z26.s, z16.b, z2.b[0]\n"
+                    "sdot z27.s, z16.b, z3.b[0]\n"
+                    "sdot z28.s, z16.b, z4.b[0]\n"
+                    "sdot z29.s, z16.b, z5.b[0]\n"
                     "sdot z30.s, z16.b, z6.b[0]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "sdot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "sdot z24.s, z17.b, z0.b[1]\n"
+                    "sdot z25.s, z17.b, z1.b[1]\n"
                     "sdot z26.s, z17.b, z2.b[1]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     "sdot z27.s, z17.b, z3.b[1]\n"
                     "sdot z28.s, z17.b, z4.b[1]\n"
                     "sdot z29.s, z17.b, z5.b[1]\n"
@@ -7458,23 +8933,16 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
                     "sdot z29.s, z23.b, z5.b[3]\n"
                     "sdot z30.s, z23.b, z6.b[3]\n"
                     "sdot z31.s, z23.b, z7.b[3]\n"
-                    "2:\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp
index 70a0b12130..60184be043 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp
@@ -31,9 +31,9 @@ namespace arm_gemm
 {
 
 // Actual kernel implementations
-void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
 
-class smallK_hybrid_u8u32_dot_1VLx8
+class cls_sve_smallK_hybrid_u8u32_dot_8x1VL
 {
 public:
     typedef uint8_t operand_type;
@@ -75,9 +75,9 @@ public:
     StdTransformsSVE<operand_type, result_type, 8, 1, 4> transforms = {};
 
     // Default to the generic kernel
-    kern_type kernel=sve_smallK_hybrid_u8u32_dot_1VLx8;
+    kern_type kernel=sve_smallK_hybrid_u8u32_dot_8x1VL;
 
-    smallK_hybrid_u8u32_dot_1VLx8(const CPUInfo *)
+    cls_sve_smallK_hybrid_u8u32_dot_8x1VL(const CPUInfo *)
     {
 
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp
index 1d0b84e788..b980d9b5c2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,7 +33,7 @@
 
 namespace arm_gemm {
 
-void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) {
+void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *bias, Activation act, bool) {
     const long loops_count = iceildiv(N, (int)get_vector_length<uint32_t>()) - 1;
     const long ldab = lda * sizeof(uint8_t);
     const long ldcb = ldc * sizeof(uint32_t);
@@ -112,55 +112,54 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #1\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7]\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "mov z25.s, #0\n"
+                    "mov z26.s, #0\n"
                     "mov z27.s, #0\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
                     "mov z29.s, #0\n"
-                    "ld1rqb z1.b, p6/z, [a_ptr1]\n"
                     "mov z30.s, #0\n"
-                    "ld1rqb z2.b, p6/z, [a_ptr2]\n"
                     "mov z31.s, #0\n"
-                    "ld1rqb z3.b, p6/z, [a_ptr3]\n"
                     "udot z24.s, z16.b, z0.b[0]\n"
-                    "ld1rqb z4.b, p6/z, [a_ptr4]\n"
                     "udot z25.s, z16.b, z1.b[0]\n"
-                    "ld1rqb z5.b, p6/z, [a_ptr5]\n"
                     "udot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p6/z, [a_ptr6]\n"
                     "udot z27.s, z16.b, z3.b[0]\n"
-                    "ld1rqb z7.b, p6/z, [a_ptr7]\n"
                     "udot z28.s, z16.b, z4.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #1\n"
-                    "cbz %[loops], 2f\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #1\n"
                     "b.eq 3f\n"
                     "4:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #1\n"
                     "udot z24.s, z16.b, z0.b[0]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
                     "mov z26.s, #0\n"
@@ -186,10 +185,9 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "mov z31.s, #0\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #1\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
@@ -201,6 +199,8 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #1\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
@@ -230,23 +230,34 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "addvl c_ptr7, c_ptr7, #1\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
+                    "b 5f\n"
                     "2:\n"
+                    "mov z24.s, #0\n"
+                    "mov z25.s, #0\n"
+                    "mov z26.s, #0\n"
+                    "mov z27.s, #0\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z31.s, z16.b, z7.b[0]\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -325,112 +336,112 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #2\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "mov z25.s, #0\n"
+                    "mov z26.s, #0\n"
+                    "mov z27.s, #0\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
                     "mov z29.s, #0\n"
-                    "ld1rqb z1.b, p6/z, [a_ptr1]\n"
                     "mov z30.s, #0\n"
-                    "ld1rqb z2.b, p6/z, [a_ptr2]\n"
                     "mov z31.s, #0\n"
-                    "ld1rqb z3.b, p6/z, [a_ptr3]\n"
                     "udot z24.s, z16.b, z0.b[0]\n"
-                    "ld1rqb z4.b, p6/z, [a_ptr4]\n"
                     "udot z25.s, z16.b, z1.b[0]\n"
-                    "ld1rqb z5.b, p6/z, [a_ptr5]\n"
                     "udot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p6/z, [a_ptr6]\n"
                     "udot z27.s, z16.b, z3.b[0]\n"
-                    "ld1rqb z7.b, p6/z, [a_ptr7]\n"
                     "udot z28.s, z16.b, z4.b[0]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "udot z24.s, z17.b, z0.b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #2\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
                     "udot z29.s, z17.b, z5.b[1]\n"
                     "udot z30.s, z17.b, z6.b[1]\n"
                     "udot z31.s, z17.b, z7.b[1]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
                     "b.eq 3f\n"
                     "4:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #2\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
-                    "mov z25.s, #0\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
+                    "mov z25.s, #0\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #2\n"
                     "udot z24.s, z16.b, z0.b[0]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
                     "mov z26.s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
                     "udot z25.s, z16.b, z1.b[0]\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
                     "mov z27.s, #0\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
                     "udot z26.s, z16.b, z2.b[0]\n"
                     "st1w z28.s, p7, [c_ptr4]\n"
                     "mov z28.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "udot z27.s, z16.b, z3.b[0]\n"
                     "st1w z29.s, p7, [c_ptr5]\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
                     "udot z28.s, z16.b, z4.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
-                    "udot z30.s, z16.b, z6.b[0]\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
-                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
-                    "udot z26.s, z17.b, z2.b[1]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
-                    "udot z27.s, z17.b, z3.b[1]\n"
+                    "udot z26.s, z17.b, z2.b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
-                    "udot z28.s, z17.b, z4.b[1]\n"
+                    "udot z27.s, z17.b, z3.b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
-                    "udot z29.s, z17.b, z5.b[1]\n"
+                    "udot z28.s, z17.b, z4.b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
-                    "udot z30.s, z17.b, z6.b[1]\n"
+                    "udot z29.s, z17.b, z5.b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+                    "udot z30.s, z17.b, z6.b[1]\n"
                     "udot z31.s, z17.b, z7.b[1]\n"
                     "b.ne 4b\n"
                     "3:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "addvl %[b_ptr0], %[b_ptr0], #2\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
@@ -470,23 +481,42 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z17.b, z5.b[1]\n"
                     "udot z30.s, z17.b, z6.b[1]\n"
                     "udot z31.s, z17.b, z7.b[1]\n"
+                    "b 5f\n"
                     "2:\n"
+                    "mov z24.s, #0\n"
+                    "mov z25.s, #0\n"
+                    "mov z26.s, #0\n"
+                    "mov z27.s, #0\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z31.s, z16.b, z7.b[0]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "udot z26.s, z17.b, z2.b[1]\n"
+                    "udot z27.s, z17.b, z3.b[1]\n"
+                    "udot z28.s, z17.b, z4.b[1]\n"
+                    "udot z29.s, z17.b, z5.b[1]\n"
+                    "udot z30.s, z17.b, z6.b[1]\n"
+                    "udot z31.s, z17.b, z7.b[1]\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -565,48 +595,50 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #3\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "mov z25.s, #0\n"
+                    "mov z26.s, #0\n"
+                    "mov z27.s, #0\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
                     "mov z29.s, #0\n"
-                    "ld1rqb z1.b, p6/z, [a_ptr1]\n"
                     "mov z30.s, #0\n"
-                    "ld1rqb z2.b, p6/z, [a_ptr2]\n"
                     "mov z31.s, #0\n"
-                    "ld1rqb z3.b, p6/z, [a_ptr3]\n"
                     "udot z24.s, z16.b, z0.b[0]\n"
-                    "ld1rqb z4.b, p6/z, [a_ptr4]\n"
                     "udot z25.s, z16.b, z1.b[0]\n"
-                    "ld1rqb z5.b, p6/z, [a_ptr5]\n"
                     "udot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p6/z, [a_ptr6]\n"
                     "udot z27.s, z16.b, z3.b[0]\n"
-                    "ld1rqb z7.b, p6/z, [a_ptr7]\n"
                     "udot z28.s, z16.b, z4.b[0]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "udot z24.s, z17.b, z0.b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #3\n"
                     "udot z29.s, z17.b, z5.b[1]\n"
                     "udot z30.s, z17.b, z6.b[1]\n"
                     "udot z31.s, z17.b, z7.b[1]\n"
@@ -618,49 +650,46 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z18.b, z5.b[2]\n"
                     "udot z30.s, z18.b, z6.b[2]\n"
                     "udot z31.s, z18.b, z7.b[2]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "b.eq 3f\n"
                     "4:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "mov z24.s, #0\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #3\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
-                    "mov z25.s, #0\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
+                    "mov z25.s, #0\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "udot z24.s, z16.b, z0.b[0]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
                     "mov z26.s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
                     "udot z25.s, z16.b, z1.b[0]\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
                     "mov z27.s, #0\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
                     "udot z26.s, z16.b, z2.b[0]\n"
                     "st1w z28.s, p7, [c_ptr4]\n"
                     "mov z28.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "udot z27.s, z16.b, z3.b[0]\n"
                     "st1w z29.s, p7, [c_ptr5]\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
                     "udot z28.s, z16.b, z4.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
-                    "udot z30.s, z16.b, z6.b[0]\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #3\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "udot z24.s, z17.b, z0.b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
@@ -676,7 +705,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z30.s, z17.b, z6.b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "udot z31.s, z17.b, z7.b[1]\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "udot z24.s, z18.b, z0.b[2]\n"
                     "udot z25.s, z18.b, z1.b[2]\n"
                     "udot z26.s, z18.b, z2.b[2]\n"
@@ -690,11 +718,12 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "mov z24.s, #0\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #3\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "udot z24.s, z16.b, z0.b[0]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
                     "mov z26.s, #0\n"
@@ -720,8 +749,9 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "mov z31.s, #0\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #3\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
@@ -737,23 +767,50 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z18.b, z5.b[2]\n"
                     "udot z30.s, z18.b, z6.b[2]\n"
                     "udot z31.s, z18.b, z7.b[2]\n"
+                    "b 5f\n"
                     "2:\n"
+                    "mov z24.s, #0\n"
+                    "mov z25.s, #0\n"
+                    "mov z26.s, #0\n"
+                    "mov z27.s, #0\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z31.s, z16.b, z7.b[0]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "udot z26.s, z17.b, z2.b[1]\n"
+                    "udot z27.s, z17.b, z3.b[1]\n"
+                    "udot z28.s, z17.b, z4.b[1]\n"
+                    "udot z29.s, z17.b, z5.b[1]\n"
+                    "udot z30.s, z17.b, z6.b[1]\n"
+                    "udot z31.s, z17.b, z7.b[1]\n"
+                    "udot z24.s, z18.b, z0.b[2]\n"
+                    "udot z25.s, z18.b, z1.b[2]\n"
+                    "udot z26.s, z18.b, z2.b[2]\n"
+                    "udot z27.s, z18.b, z3.b[2]\n"
+                    "udot z28.s, z18.b, z4.b[2]\n"
+                    "udot z29.s, z18.b, z5.b[2]\n"
+                    "udot z30.s, z18.b, z6.b[2]\n"
+                    "udot z31.s, z18.b, z7.b[2]\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -832,50 +889,52 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "mov z25.s, #0\n"
+                    "mov z26.s, #0\n"
+                    "mov z27.s, #0\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
                     "mov z29.s, #0\n"
-                    "ld1rqb z1.b, p6/z, [a_ptr1]\n"
                     "mov z30.s, #0\n"
-                    "ld1rqb z2.b, p6/z, [a_ptr2]\n"
                     "mov z31.s, #0\n"
-                    "ld1rqb z3.b, p6/z, [a_ptr3]\n"
                     "udot z24.s, z16.b, z0.b[0]\n"
-                    "ld1rqb z4.b, p6/z, [a_ptr4]\n"
                     "udot z25.s, z16.b, z1.b[0]\n"
-                    "ld1rqb z5.b, p6/z, [a_ptr5]\n"
                     "udot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p6/z, [a_ptr6]\n"
                     "udot z27.s, z16.b, z3.b[0]\n"
-                    "ld1rqb z7.b, p6/z, [a_ptr7]\n"
                     "udot z28.s, z16.b, z4.b[0]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "udot z24.s, z17.b, z0.b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "udot z29.s, z17.b, z5.b[1]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #4\n"
                     "udot z30.s, z17.b, z6.b[1]\n"
                     "udot z31.s, z17.b, z7.b[1]\n"
                     "udot z24.s, z18.b, z0.b[2]\n"
@@ -894,50 +953,47 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z19.b, z5.b[3]\n"
                     "udot z30.s, z19.b, z6.b[3]\n"
                     "udot z31.s, z19.b, z7.b[3]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "b.eq 3f\n"
                     "4:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "mov z24.s, #0\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #4\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
-                    "mov z25.s, #0\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
+                    "mov z25.s, #0\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "udot z24.s, z16.b, z0.b[0]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
                     "mov z26.s, #0\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
                     "udot z25.s, z16.b, z1.b[0]\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
                     "mov z27.s, #0\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
                     "udot z26.s, z16.b, z2.b[0]\n"
                     "st1w z28.s, p7, [c_ptr4]\n"
                     "mov z28.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "udot z27.s, z16.b, z3.b[0]\n"
                     "st1w z29.s, p7, [c_ptr5]\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
                     "udot z28.s, z16.b, z4.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
-                    "udot z30.s, z16.b, z6.b[0]\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #4\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "udot z24.s, z17.b, z0.b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
@@ -953,7 +1009,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z30.s, z17.b, z6.b[1]\n"
                     "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "udot z31.s, z17.b, z7.b[1]\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "udot z24.s, z18.b, z0.b[2]\n"
                     "udot z25.s, z18.b, z1.b[2]\n"
                     "udot z26.s, z18.b, z2.b[2]\n"
@@ -962,7 +1017,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z18.b, z5.b[2]\n"
                     "udot z30.s, z18.b, z6.b[2]\n"
                     "udot z31.s, z18.b, z7.b[2]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "udot z24.s, z19.b, z0.b[3]\n"
                     "udot z25.s, z19.b, z1.b[3]\n"
                     "udot z26.s, z19.b, z2.b[3]\n"
@@ -976,14 +1030,16 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "mov z24.s, #0\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "udot z24.s, z16.b, z0.b[0]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
                     "mov z26.s, #0\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "addvl c_ptr2, c_ptr2, #1\n"
                     "udot z25.s, z16.b, z1.b[0]\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
@@ -1006,8 +1062,9 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "mov z31.s, #0\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #4\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
@@ -1031,23 +1088,58 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z19.b, z5.b[3]\n"
                     "udot z30.s, z19.b, z6.b[3]\n"
                     "udot z31.s, z19.b, z7.b[3]\n"
+                    "b 5f\n"
                     "2:\n"
+                    "mov z24.s, #0\n"
+                    "mov z25.s, #0\n"
+                    "mov z26.s, #0\n"
+                    "mov z27.s, #0\n"
+                    "mov z28.s, #0\n"
+                    "mov z29.s, #0\n"
+                    "mov z30.s, #0\n"
+                    "mov z31.s, #0\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z31.s, z16.b, z7.b[0]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "udot z26.s, z17.b, z2.b[1]\n"
+                    "udot z27.s, z17.b, z3.b[1]\n"
+                    "udot z28.s, z17.b, z4.b[1]\n"
+                    "udot z29.s, z17.b, z5.b[1]\n"
+                    "udot z30.s, z17.b, z6.b[1]\n"
+                    "udot z31.s, z17.b, z7.b[1]\n"
+                    "udot z24.s, z18.b, z0.b[2]\n"
+                    "udot z25.s, z18.b, z1.b[2]\n"
+                    "udot z26.s, z18.b, z2.b[2]\n"
+                    "udot z27.s, z18.b, z3.b[2]\n"
+                    "udot z28.s, z18.b, z4.b[2]\n"
+                    "udot z29.s, z18.b, z5.b[2]\n"
+                    "udot z30.s, z18.b, z6.b[2]\n"
+                    "udot z31.s, z18.b, z7.b[2]\n"
+                    "udot z24.s, z19.b, z0.b[3]\n"
+                    "udot z25.s, z19.b, z1.b[3]\n"
+                    "udot z26.s, z19.b, z2.b[3]\n"
+                    "udot z27.s, z19.b, z3.b[3]\n"
+                    "udot z28.s, z19.b, z4.b[3]\n"
+                    "udot z29.s, z19.b, z5.b[3]\n"
+                    "udot z30.s, z19.b, z6.b[3]\n"
+                    "udot z31.s, z19.b, z7.b[3]\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -1126,46 +1218,48 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "mov z28.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "mov z29.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "mov z30.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "mov z31.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #5\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "mov z25.s, #0\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "mov z26.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
-                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "mov z27.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
-                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "mov z31.s, #0\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
                     "udot z28.s, z16.b, z4.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "udot z24.s, z17.b, z0.b[1]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #5\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
@@ -1205,84 +1299,79 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z20.b, z5.b[0]\n"
                     "udot z30.s, z20.b, z6.b[0]\n"
                     "udot z31.s, z20.b, z7.b[0]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "b.eq 3f\n"
                     "4:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "addvl c_ptr1, c_ptr1, #1\n"
+                    "mov z25.s, #0\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
+                    "mov z26.s, #0\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                     "addvl %[b_ptr0], %[b_ptr0], #5\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
-                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
-                    "mov z25.s, #0\n"
+                    "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "mov z27.s, #0\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "udot z24.s, z16.b, z0.b[0]\n"
-                    "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
+                    "st1w z28.s, p7, [c_ptr4]\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "udot z25.s, z16.b, z1.b[0]\n"
-                    "st1w z27.s, p7, [c_ptr3]\n"
-                    "mov z27.s, #0\n"
+                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "udot z26.s, z16.b, z2.b[0]\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
-                    "st1w z28.s, p7, [c_ptr4]\n"
-                    "mov z28.s, #0\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "udot z27.s, z16.b, z3.b[0]\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
-                    "udot z25.s, z17.b, z1.b[1]\n"
-                    "st1w z29.s, p7, [c_ptr5]\n"
-                    "mov z29.s, #0\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
-                    "udot z28.s, z16.b, z4.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
-                    "udot z30.s, z16.b, z6.b[0]\n"
                     "addvl c_ptr5, c_ptr5, #1\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "udot z29.s, z17.b, z5.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "udot z30.s, z17.b, z6.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "udot z31.s, z17.b, z7.b[1]\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "udot z24.s, z18.b, z0.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "udot z25.s, z18.b, z1.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "udot z26.s, z18.b, z2.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "udot z27.s, z18.b, z3.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "udot z28.s, z18.b, z4.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "udot z29.s, z18.b, z5.b[2]\n"
                     "udot z30.s, z18.b, z6.b[2]\n"
                     "udot z31.s, z18.b, z7.b[2]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "udot z24.s, z19.b, z0.b[3]\n"
                     "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
                     "udot z25.s, z19.b, z1.b[3]\n"
@@ -1300,7 +1389,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z31.s, z19.b, z7.b[3]\n"
                     "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
                     "udot z24.s, z20.b, z0.b[0]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "udot z25.s, z20.b, z1.b[0]\n"
                     "udot z26.s, z20.b, z2.b[0]\n"
                     "udot z27.s, z20.b, z3.b[0]\n"
@@ -1313,39 +1401,41 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "mov z24.s, #0\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #5\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
                     "mov z26.s, #0\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #5\n"
+                    "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "mov z27.s, #0\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "st1w z28.s, p7, [c_ptr4]\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "udot z25.s, z16.b, z1.b[0]\n"
-                    "st1w z27.s, p7, [c_ptr3]\n"
-                    "mov z27.s, #0\n"
+                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "udot z26.s, z16.b, z2.b[0]\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
-                    "st1w z28.s, p7, [c_ptr4]\n"
-                    "mov z28.s, #0\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "udot z27.s, z16.b, z3.b[0]\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
-                    "udot z25.s, z17.b, z1.b[1]\n"
-                    "st1w z29.s, p7, [c_ptr5]\n"
-                    "mov z29.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "udot z28.s, z16.b, z4.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
@@ -1354,6 +1444,8 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "addvl c_ptr6, c_ptr6, #1\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
@@ -1392,23 +1484,82 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z20.b, z5.b[0]\n"
                     "udot z30.s, z20.b, z6.b[0]\n"
                     "udot z31.s, z20.b, z7.b[0]\n"
+                    "b 5f\n"
                     "2:\n"
+                    "mov z24.s, #0\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "mov z25.s, #0\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "mov z26.s, #0\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "mov z29.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "mov z30.s, #0\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z31.s, z16.b, z7.b[0]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "udot z26.s, z17.b, z2.b[1]\n"
+                    "udot z27.s, z17.b, z3.b[1]\n"
+                    "udot z28.s, z17.b, z4.b[1]\n"
+                    "udot z29.s, z17.b, z5.b[1]\n"
+                    "udot z30.s, z17.b, z6.b[1]\n"
+                    "udot z31.s, z17.b, z7.b[1]\n"
+                    "udot z24.s, z18.b, z0.b[2]\n"
+                    "udot z25.s, z18.b, z1.b[2]\n"
+                    "udot z26.s, z18.b, z2.b[2]\n"
+                    "udot z27.s, z18.b, z3.b[2]\n"
+                    "udot z28.s, z18.b, z4.b[2]\n"
+                    "udot z29.s, z18.b, z5.b[2]\n"
+                    "udot z30.s, z18.b, z6.b[2]\n"
+                    "udot z31.s, z18.b, z7.b[2]\n"
+                    "udot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                    "udot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+                    "udot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+                    "udot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+                    "udot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
+                    "udot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
+                    "udot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
+                    "udot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
+                    "udot z24.s, z20.b, z0.b[0]\n"
+                    "udot z25.s, z20.b, z1.b[0]\n"
+                    "udot z26.s, z20.b, z2.b[0]\n"
+                    "udot z27.s, z20.b, z3.b[0]\n"
+                    "udot z28.s, z20.b, z4.b[0]\n"
+                    "udot z29.s, z20.b, z5.b[0]\n"
+                    "udot z30.s, z20.b, z6.b[0]\n"
+                    "udot z31.s, z20.b, z7.b[0]\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -1487,48 +1638,50 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "mov z28.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "mov z29.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "mov z30.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "mov z31.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #6\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "mov z25.s, #0\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
-                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "mov z26.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
-                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "mov z31.s, #0\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
                     "udot z28.s, z16.b, z4.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #6\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
@@ -1575,85 +1728,80 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z21.b, z5.b[1]\n"
                     "udot z30.s, z21.b, z6.b[1]\n"
                     "udot z31.s, z21.b, z7.b[1]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "b.eq 3f\n"
                     "4:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "mov z24.s, #0\n"
-                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #6\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
                     "mov z26.s, #0\n"
-                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
-                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #6\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
                     "mov z27.s, #0\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "udot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z27.s, z16.b, z3.b[0]\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
-                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
                     "st1w z29.s, p7, [c_ptr5]\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
-                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
-                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
                     "addvl c_ptr5, c_ptr5, #1\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "udot z29.s, z17.b, z5.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "udot z30.s, z17.b, z6.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "udot z31.s, z17.b, z7.b[1]\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "udot z24.s, z18.b, z0.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "udot z25.s, z18.b, z1.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "udot z26.s, z18.b, z2.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "udot z27.s, z18.b, z3.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "udot z28.s, z18.b, z4.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "udot z29.s, z18.b, z5.b[2]\n"
                     "udot z30.s, z18.b, z6.b[2]\n"
                     "udot z31.s, z18.b, z7.b[2]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "udot z24.s, z19.b, z0.b[3]\n"
                     "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
                     "udot z25.s, z19.b, z1.b[3]\n"
@@ -1671,7 +1819,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z31.s, z19.b, z7.b[3]\n"
                     "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
                     "udot z24.s, z20.b, z0.b[0]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "udot z25.s, z20.b, z1.b[0]\n"
                     "udot z26.s, z20.b, z2.b[0]\n"
                     "udot z27.s, z20.b, z3.b[0]\n"
@@ -1679,7 +1826,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z20.b, z5.b[0]\n"
                     "udot z30.s, z20.b, z6.b[0]\n"
                     "udot z31.s, z20.b, z7.b[0]\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "udot z24.s, z21.b, z0.b[1]\n"
                     "udot z25.s, z21.b, z1.b[1]\n"
                     "udot z26.s, z21.b, z2.b[1]\n"
@@ -1693,47 +1839,52 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "mov z24.s, #0\n"
-                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #6\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
-                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
                     "addvl c_ptr2, c_ptr2, #1\n"
-                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "mov z26.s, #0\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #6\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
                     "mov z27.s, #0\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "udot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z27.s, z16.b, z3.b[0]\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
-                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
                     "st1w z29.s, p7, [c_ptr5]\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
-                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
                     "addvl c_ptr5, c_ptr5, #1\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
                     "addvl c_ptr6, c_ptr6, #1\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
@@ -1780,23 +1931,90 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z21.b, z5.b[1]\n"
                     "udot z30.s, z21.b, z6.b[1]\n"
                     "udot z31.s, z21.b, z7.b[1]\n"
+                    "b 5f\n"
                     "2:\n"
+                    "mov z24.s, #0\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "mov z25.s, #0\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "mov z26.s, #0\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "mov z29.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "mov z30.s, #0\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z31.s, z16.b, z7.b[0]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "udot z26.s, z17.b, z2.b[1]\n"
+                    "udot z27.s, z17.b, z3.b[1]\n"
+                    "udot z28.s, z17.b, z4.b[1]\n"
+                    "udot z29.s, z17.b, z5.b[1]\n"
+                    "udot z30.s, z17.b, z6.b[1]\n"
+                    "udot z31.s, z17.b, z7.b[1]\n"
+                    "udot z24.s, z18.b, z0.b[2]\n"
+                    "udot z25.s, z18.b, z1.b[2]\n"
+                    "udot z26.s, z18.b, z2.b[2]\n"
+                    "udot z27.s, z18.b, z3.b[2]\n"
+                    "udot z28.s, z18.b, z4.b[2]\n"
+                    "udot z29.s, z18.b, z5.b[2]\n"
+                    "udot z30.s, z18.b, z6.b[2]\n"
+                    "udot z31.s, z18.b, z7.b[2]\n"
+                    "udot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                    "udot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+                    "udot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+                    "udot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+                    "udot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
+                    "udot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
+                    "udot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
+                    "udot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
+                    "udot z24.s, z20.b, z0.b[0]\n"
+                    "udot z25.s, z20.b, z1.b[0]\n"
+                    "udot z26.s, z20.b, z2.b[0]\n"
+                    "udot z27.s, z20.b, z3.b[0]\n"
+                    "udot z28.s, z20.b, z4.b[0]\n"
+                    "udot z29.s, z20.b, z5.b[0]\n"
+                    "udot z30.s, z20.b, z6.b[0]\n"
+                    "udot z31.s, z20.b, z7.b[0]\n"
+                    "udot z24.s, z21.b, z0.b[1]\n"
+                    "udot z25.s, z21.b, z1.b[1]\n"
+                    "udot z26.s, z21.b, z2.b[1]\n"
+                    "udot z27.s, z21.b, z3.b[1]\n"
+                    "udot z28.s, z21.b, z4.b[1]\n"
+                    "udot z29.s, z21.b, z5.b[1]\n"
+                    "udot z30.s, z21.b, z6.b[1]\n"
+                    "udot z31.s, z21.b, z7.b[1]\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -1875,48 +2093,50 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "mov z28.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "mov z29.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "mov z30.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "mov z31.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #7\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+                    "mov z25.s, #0\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
-                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "mov z26.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
-                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "mov z31.s, #0\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
                     "udot z28.s, z16.b, z4.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #7\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
@@ -1972,86 +2192,81 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z22.b, z5.b[2]\n"
                     "udot z30.s, z22.b, z6.b[2]\n"
                     "udot z31.s, z22.b, z7.b[2]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "b.eq 3f\n"
+                    "4:\n"
+                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "addvl c_ptr1, c_ptr1, #1\n"
+                    "mov z25.s, #0\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
+                    "mov z26.s, #0\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                    "b.eq 3f\n"
-                    "4:\n"
-                    "st1w z24.s, p7, [%[c_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "mov z24.s, #0\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "mov z27.s, #0\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                     "addvl %[b_ptr0], %[b_ptr0], #7\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
-                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
-                    "mov z25.s, #0\n"
+                    "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "udot z24.s, z16.b, z0.b[0]\n"
-                    "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
+                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
                     "udot z25.s, z16.b, z1.b[0]\n"
-                    "st1w z27.s, p7, [c_ptr3]\n"
-                    "mov z27.s, #0\n"
+                    "st1w z30.s, p7, [c_ptr6]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "udot z26.s, z16.b, z2.b[0]\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
-                    "st1w z28.s, p7, [c_ptr4]\n"
-                    "mov z28.s, #0\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "udot z27.s, z16.b, z3.b[0]\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
-                    "udot z25.s, z17.b, z1.b[1]\n"
-                    "st1w z29.s, p7, [c_ptr5]\n"
-                    "mov z29.s, #0\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
-                    "udot z28.s, z16.b, z4.b[0]\n"
-                    "st1w z30.s, p7, [c_ptr6]\n"
-                    "mov z30.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "udot z29.s, z16.b, z5.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "udot z29.s, z17.b, z5.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "udot z30.s, z17.b, z6.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "udot z31.s, z17.b, z7.b[1]\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "udot z24.s, z18.b, z0.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "udot z25.s, z18.b, z1.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "udot z26.s, z18.b, z2.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "udot z27.s, z18.b, z3.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "udot z28.s, z18.b, z4.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "udot z29.s, z18.b, z5.b[2]\n"
                     "udot z30.s, z18.b, z6.b[2]\n"
                     "udot z31.s, z18.b, z7.b[2]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "udot z24.s, z19.b, z0.b[3]\n"
                     "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
                     "udot z25.s, z19.b, z1.b[3]\n"
@@ -2069,7 +2284,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z31.s, z19.b, z7.b[3]\n"
                     "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
                     "udot z24.s, z20.b, z0.b[0]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "udot z25.s, z20.b, z1.b[0]\n"
                     "udot z26.s, z20.b, z2.b[0]\n"
                     "udot z27.s, z20.b, z3.b[0]\n"
@@ -2077,7 +2291,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z20.b, z5.b[0]\n"
                     "udot z30.s, z20.b, z6.b[0]\n"
                     "udot z31.s, z20.b, z7.b[0]\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "udot z24.s, z21.b, z0.b[1]\n"
                     "udot z25.s, z21.b, z1.b[1]\n"
                     "udot z26.s, z21.b, z2.b[1]\n"
@@ -2086,7 +2299,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z21.b, z5.b[1]\n"
                     "udot z30.s, z21.b, z6.b[1]\n"
                     "udot z31.s, z21.b, z7.b[1]\n"
-                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "udot z24.s, z22.b, z0.b[2]\n"
                     "udot z25.s, z22.b, z1.b[2]\n"
                     "udot z26.s, z22.b, z2.b[2]\n"
@@ -2100,47 +2312,135 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "mov z24.s, #0\n"
-                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #7\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
                     "mov z26.s, #0\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "mov z27.s, #0\n"
+                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #7\n"
+                    "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
                     "udot z25.s, z16.b, z1.b[0]\n"
-                    "st1w z27.s, p7, [c_ptr3]\n"
-                    "mov z27.s, #0\n"
+                    "st1w z30.s, p7, [c_ptr6]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "udot z26.s, z16.b, z2.b[0]\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
-                    "st1w z28.s, p7, [c_ptr4]\n"
-                    "mov z28.s, #0\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "udot z27.s, z16.b, z3.b[0]\n"
+                    "st1w z31.s, p7, [c_ptr7]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "udot z31.s, z16.b, z7.b[0]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
-                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "udot z26.s, z17.b, z2.b[1]\n"
+                    "udot z27.s, z17.b, z3.b[1]\n"
+                    "udot z28.s, z17.b, z4.b[1]\n"
+                    "udot z29.s, z17.b, z5.b[1]\n"
+                    "udot z30.s, z17.b, z6.b[1]\n"
+                    "udot z31.s, z17.b, z7.b[1]\n"
+                    "udot z24.s, z18.b, z0.b[2]\n"
+                    "udot z25.s, z18.b, z1.b[2]\n"
+                    "udot z26.s, z18.b, z2.b[2]\n"
+                    "udot z27.s, z18.b, z3.b[2]\n"
+                    "udot z28.s, z18.b, z4.b[2]\n"
+                    "udot z29.s, z18.b, z5.b[2]\n"
+                    "udot z30.s, z18.b, z6.b[2]\n"
+                    "udot z31.s, z18.b, z7.b[2]\n"
+                    "udot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                    "udot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+                    "udot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+                    "udot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+                    "udot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
+                    "udot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
+                    "udot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
+                    "udot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
+                    "udot z24.s, z20.b, z0.b[0]\n"
+                    "udot z25.s, z20.b, z1.b[0]\n"
+                    "udot z26.s, z20.b, z2.b[0]\n"
+                    "udot z27.s, z20.b, z3.b[0]\n"
+                    "udot z28.s, z20.b, z4.b[0]\n"
+                    "udot z29.s, z20.b, z5.b[0]\n"
+                    "udot z30.s, z20.b, z6.b[0]\n"
+                    "udot z31.s, z20.b, z7.b[0]\n"
+                    "udot z24.s, z21.b, z0.b[1]\n"
+                    "udot z25.s, z21.b, z1.b[1]\n"
+                    "udot z26.s, z21.b, z2.b[1]\n"
+                    "udot z27.s, z21.b, z3.b[1]\n"
+                    "udot z28.s, z21.b, z4.b[1]\n"
+                    "udot z29.s, z21.b, z5.b[1]\n"
+                    "udot z30.s, z21.b, z6.b[1]\n"
+                    "udot z31.s, z21.b, z7.b[1]\n"
+                    "udot z24.s, z22.b, z0.b[2]\n"
+                    "udot z25.s, z22.b, z1.b[2]\n"
+                    "udot z26.s, z22.b, z2.b[2]\n"
+                    "udot z27.s, z22.b, z3.b[2]\n"
+                    "udot z28.s, z22.b, z4.b[2]\n"
+                    "udot z29.s, z22.b, z5.b[2]\n"
+                    "udot z30.s, z22.b, z6.b[2]\n"
+                    "udot z31.s, z22.b, z7.b[2]\n"
+                    "b 5f\n"
+                    "2:\n"
+                    "mov z24.s, #0\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "mov z25.s, #0\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "mov z26.s, #0\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "udot z28.s, z16.b, z4.b[0]\n"
-                    "st1w z30.s, p7, [c_ptr6]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
-                    "udot z29.s, z16.b, z5.b[0]\n"
-                    "st1w z31.s, p7, [c_ptr7]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "mov z31.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
@@ -2195,23 +2495,16 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z22.b, z5.b[2]\n"
                     "udot z30.s, z22.b, z6.b[2]\n"
                     "udot z31.s, z22.b, z7.b[2]\n"
-                    "2:\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -2290,49 +2583,51 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "mov z28.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "mov z29.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "mov z30.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "mov z31.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "mov z25.s, #0\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
-                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "mov z26.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
-                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "mov z31.s, #0\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
                     "udot z28.s, z16.b, z4.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
@@ -2396,87 +2691,82 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z23.b, z5.b[3]\n"
                     "udot z30.s, z23.b, z6.b[3]\n"
                     "udot z31.s, z23.b, z7.b[3]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "b.eq 3f\n"
                     "4:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "mov z24.s, #0\n"
-                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
                     "mov z26.s, #0\n"
-                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
-                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
                     "mov z27.s, #0\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "udot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z27.s, z16.b, z3.b[0]\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
-                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
-                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "udot z29.s, z17.b, z5.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "udot z30.s, z17.b, z6.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "udot z31.s, z17.b, z7.b[1]\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "udot z24.s, z18.b, z0.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "udot z25.s, z18.b, z1.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "udot z26.s, z18.b, z2.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "udot z27.s, z18.b, z3.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "udot z28.s, z18.b, z4.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "udot z29.s, z18.b, z5.b[2]\n"
                     "udot z30.s, z18.b, z6.b[2]\n"
                     "udot z31.s, z18.b, z7.b[2]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "udot z24.s, z19.b, z0.b[3]\n"
                     "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
                     "udot z25.s, z19.b, z1.b[3]\n"
@@ -2494,7 +2784,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z31.s, z19.b, z7.b[3]\n"
                     "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
                     "udot z24.s, z20.b, z0.b[0]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "udot z25.s, z20.b, z1.b[0]\n"
                     "udot z26.s, z20.b, z2.b[0]\n"
                     "udot z27.s, z20.b, z3.b[0]\n"
@@ -2502,7 +2791,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z20.b, z5.b[0]\n"
                     "udot z30.s, z20.b, z6.b[0]\n"
                     "udot z31.s, z20.b, z7.b[0]\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "udot z24.s, z21.b, z0.b[1]\n"
                     "udot z25.s, z21.b, z1.b[1]\n"
                     "udot z26.s, z21.b, z2.b[1]\n"
@@ -2511,7 +2799,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z21.b, z5.b[1]\n"
                     "udot z30.s, z21.b, z6.b[1]\n"
                     "udot z31.s, z21.b, z7.b[1]\n"
-                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "udot z24.s, z22.b, z0.b[2]\n"
                     "udot z25.s, z22.b, z1.b[2]\n"
                     "udot z26.s, z22.b, z2.b[2]\n"
@@ -2520,7 +2807,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z22.b, z5.b[2]\n"
                     "udot z30.s, z22.b, z6.b[2]\n"
                     "udot z31.s, z22.b, z7.b[2]\n"
-                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "udot z24.s, z23.b, z0.b[3]\n"
                     "udot z25.s, z23.b, z1.b[3]\n"
                     "udot z26.s, z23.b, z2.b[3]\n"
@@ -2534,47 +2820,144 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "mov z24.s, #0\n"
-                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
                     "mov z26.s, #0\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "mov z27.s, #0\n"
+                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                    "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "st1w z30.s, p7, [c_ptr6]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "udot z25.s, z16.b, z1.b[0]\n"
-                    "st1w z27.s, p7, [c_ptr3]\n"
-                    "mov z27.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "udot z26.s, z16.b, z2.b[0]\n"
+                    "st1w z31.s, p7, [c_ptr7]\n"
+                    "mov z31.s, #0\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
-                    "st1w z28.s, p7, [c_ptr4]\n"
-                    "mov z28.s, #0\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "udot z27.s, z16.b, z3.b[0]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z31.s, z16.b, z7.b[0]\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
-                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "udot z26.s, z17.b, z2.b[1]\n"
+                    "udot z27.s, z17.b, z3.b[1]\n"
+                    "udot z28.s, z17.b, z4.b[1]\n"
+                    "udot z29.s, z17.b, z5.b[1]\n"
+                    "udot z30.s, z17.b, z6.b[1]\n"
+                    "udot z31.s, z17.b, z7.b[1]\n"
+                    "udot z24.s, z18.b, z0.b[2]\n"
+                    "udot z25.s, z18.b, z1.b[2]\n"
+                    "udot z26.s, z18.b, z2.b[2]\n"
+                    "udot z27.s, z18.b, z3.b[2]\n"
+                    "udot z28.s, z18.b, z4.b[2]\n"
+                    "udot z29.s, z18.b, z5.b[2]\n"
+                    "udot z30.s, z18.b, z6.b[2]\n"
+                    "udot z31.s, z18.b, z7.b[2]\n"
+                    "udot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                    "udot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+                    "udot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+                    "udot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+                    "udot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
+                    "udot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
+                    "udot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
+                    "udot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
+                    "udot z24.s, z20.b, z0.b[0]\n"
+                    "udot z25.s, z20.b, z1.b[0]\n"
+                    "udot z26.s, z20.b, z2.b[0]\n"
+                    "udot z27.s, z20.b, z3.b[0]\n"
+                    "udot z28.s, z20.b, z4.b[0]\n"
+                    "udot z29.s, z20.b, z5.b[0]\n"
+                    "udot z30.s, z20.b, z6.b[0]\n"
+                    "udot z31.s, z20.b, z7.b[0]\n"
+                    "udot z24.s, z21.b, z0.b[1]\n"
+                    "udot z25.s, z21.b, z1.b[1]\n"
+                    "udot z26.s, z21.b, z2.b[1]\n"
+                    "udot z27.s, z21.b, z3.b[1]\n"
+                    "udot z28.s, z21.b, z4.b[1]\n"
+                    "udot z29.s, z21.b, z5.b[1]\n"
+                    "udot z30.s, z21.b, z6.b[1]\n"
+                    "udot z31.s, z21.b, z7.b[1]\n"
+                    "udot z24.s, z22.b, z0.b[2]\n"
+                    "udot z25.s, z22.b, z1.b[2]\n"
+                    "udot z26.s, z22.b, z2.b[2]\n"
+                    "udot z27.s, z22.b, z3.b[2]\n"
+                    "udot z28.s, z22.b, z4.b[2]\n"
+                    "udot z29.s, z22.b, z5.b[2]\n"
+                    "udot z30.s, z22.b, z6.b[2]\n"
+                    "udot z31.s, z22.b, z7.b[2]\n"
+                    "udot z24.s, z23.b, z0.b[3]\n"
+                    "udot z25.s, z23.b, z1.b[3]\n"
+                    "udot z26.s, z23.b, z2.b[3]\n"
+                    "udot z27.s, z23.b, z3.b[3]\n"
+                    "udot z28.s, z23.b, z4.b[3]\n"
+                    "udot z29.s, z23.b, z5.b[3]\n"
+                    "udot z30.s, z23.b, z6.b[3]\n"
+                    "udot z31.s, z23.b, z7.b[3]\n"
+                    "b 5f\n"
+                    "2:\n"
+                    "mov z24.s, #0\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "mov z25.s, #0\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "mov z26.s, #0\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "udot z28.s, z16.b, z4.b[0]\n"
-                    "st1w z30.s, p7, [c_ptr6]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
-                    "udot z29.s, z16.b, z5.b[0]\n"
-                    "st1w z31.s, p7, [c_ptr7]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "mov z31.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
@@ -2637,23 +3020,16 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z23.b, z5.b[3]\n"
                     "udot z30.s, z23.b, z6.b[3]\n"
                     "udot z31.s, z23.b, z7.b[3]\n"
-                    "2:\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -2732,54 +3108,56 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "mov z28.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "mov z29.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "mov z30.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "mov z31.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "mov z25.s, #0\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
-                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "mov z26.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
-                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "mov z31.s, #0\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
                     "udot z28.s, z16.b, z4.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #1\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #1\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
                     "udot z29.s, z17.b, z5.b[1]\n"
                     "udot z30.s, z17.b, z6.b[1]\n"
@@ -2856,88 +3234,84 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z16.b, z5.b[0]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "b.eq 3f\n"
+                    "4:\n"
+                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "addvl c_ptr1, c_ptr1, #1\n"
+                    "mov z25.s, #0\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
+                    "mov z26.s, #0\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "mov z27.s, #0\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "b.eq 3f\n"
-                    "4:\n"
-                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "mov z24.s, #0\n"
+                    "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
-                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
-                    "mov z25.s, #0\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                     "udot z24.s, z16.b, z0.b[0]\n"
-                    "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
+                    "st1w z30.s, p7, [c_ptr6]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "udot z25.s, z16.b, z1.b[0]\n"
-                    "st1w z27.s, p7, [c_ptr3]\n"
-                    "mov z27.s, #0\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "udot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
-                    "st1w z28.s, p7, [c_ptr4]\n"
-                    "mov z28.s, #0\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "st1w z31.s, p7, [c_ptr7]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
                     "udot z27.s, z16.b, z3.b[0]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
-                    "udot z25.s, z17.b, z1.b[1]\n"
-                    "st1w z29.s, p7, [c_ptr5]\n"
-                    "mov z29.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "udot z28.s, z16.b, z4.b[0]\n"
-                    "st1w z30.s, p7, [c_ptr6]\n"
-                    "mov z30.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
-                    "st1w z31.s, p7, [c_ptr7]\n"
-                    "mov z31.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #1\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #1\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "udot z29.s, z17.b, z5.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "udot z30.s, z17.b, z6.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "udot z31.s, z17.b, z7.b[1]\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "udot z24.s, z18.b, z0.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "udot z25.s, z18.b, z1.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "udot z26.s, z18.b, z2.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "udot z27.s, z18.b, z3.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "udot z28.s, z18.b, z4.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "udot z29.s, z18.b, z5.b[2]\n"
                     "udot z30.s, z18.b, z6.b[2]\n"
                     "udot z31.s, z18.b, z7.b[2]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "udot z24.s, z19.b, z0.b[3]\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
                     "udot z25.s, z19.b, z1.b[3]\n"
@@ -2955,7 +3329,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z31.s, z19.b, z7.b[3]\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
                     "udot z24.s, z20.b, z0.b[0]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "udot z25.s, z20.b, z1.b[0]\n"
                     "udot z26.s, z20.b, z2.b[0]\n"
                     "udot z27.s, z20.b, z3.b[0]\n"
@@ -2963,7 +3336,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z20.b, z5.b[0]\n"
                     "udot z30.s, z20.b, z6.b[0]\n"
                     "udot z31.s, z20.b, z7.b[0]\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "udot z24.s, z21.b, z0.b[1]\n"
                     "udot z25.s, z21.b, z1.b[1]\n"
                     "udot z26.s, z21.b, z2.b[1]\n"
@@ -2972,7 +3344,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z21.b, z5.b[1]\n"
                     "udot z30.s, z21.b, z6.b[1]\n"
                     "udot z31.s, z21.b, z7.b[1]\n"
-                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "udot z24.s, z22.b, z0.b[2]\n"
                     "udot z25.s, z22.b, z1.b[2]\n"
                     "udot z26.s, z22.b, z2.b[2]\n"
@@ -2981,7 +3352,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z22.b, z5.b[2]\n"
                     "udot z30.s, z22.b, z6.b[2]\n"
                     "udot z31.s, z22.b, z7.b[2]\n"
-                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "udot z24.s, z23.b, z0.b[3]\n"
                     "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
                     "udot z25.s, z23.b, z1.b[3]\n"
@@ -2999,7 +3369,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z31.s, z23.b, z7.b[3]\n"
                     "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
                     "udot z24.s, z16.b, z0.b[0]\n"
-                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
                     "udot z25.s, z16.b, z1.b[0]\n"
                     "udot z26.s, z16.b, z2.b[0]\n"
                     "udot z27.s, z16.b, z3.b[0]\n"
@@ -3007,55 +3376,62 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z16.b, z5.b[0]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "b.ne 4b\n"
                     "3:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "mov z24.s, #0\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
-                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "addvl c_ptr2, c_ptr2, #1\n"
-                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "mov z26.s, #0\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
                     "mov z27.s, #0\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "udot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
-                    "udot z27.s, z16.b, z3.b[0]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
-                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
-                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
                     "addvl c_ptr6, c_ptr6, #1\n"
-                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "udot z26.s, z17.b, z2.b[1]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
                     "addvl %[b_ptr0], %[b_ptr0], #1\n"
+                    "udot z26.s, z17.b, z2.b[1]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
                     "udot z29.s, z17.b, z5.b[1]\n"
@@ -3133,23 +3509,124 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z16.b, z5.b[0]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
+                    "b 5f\n"
                     "2:\n"
+                    "mov z24.s, #0\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "mov z25.s, #0\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "mov z26.s, #0\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "mov z29.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "mov z30.s, #0\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #1\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "udot z26.s, z17.b, z2.b[1]\n"
+                    "udot z27.s, z17.b, z3.b[1]\n"
+                    "udot z28.s, z17.b, z4.b[1]\n"
+                    "udot z29.s, z17.b, z5.b[1]\n"
+                    "udot z30.s, z17.b, z6.b[1]\n"
+                    "udot z31.s, z17.b, z7.b[1]\n"
+                    "udot z24.s, z18.b, z0.b[2]\n"
+                    "udot z25.s, z18.b, z1.b[2]\n"
+                    "udot z26.s, z18.b, z2.b[2]\n"
+                    "udot z27.s, z18.b, z3.b[2]\n"
+                    "udot z28.s, z18.b, z4.b[2]\n"
+                    "udot z29.s, z18.b, z5.b[2]\n"
+                    "udot z30.s, z18.b, z6.b[2]\n"
+                    "udot z31.s, z18.b, z7.b[2]\n"
+                    "udot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+                    "udot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+                    "udot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+                    "udot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+                    "udot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+                    "udot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+                    "udot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+                    "udot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+                    "udot z24.s, z20.b, z0.b[0]\n"
+                    "udot z25.s, z20.b, z1.b[0]\n"
+                    "udot z26.s, z20.b, z2.b[0]\n"
+                    "udot z27.s, z20.b, z3.b[0]\n"
+                    "udot z28.s, z20.b, z4.b[0]\n"
+                    "udot z29.s, z20.b, z5.b[0]\n"
+                    "udot z30.s, z20.b, z6.b[0]\n"
+                    "udot z31.s, z20.b, z7.b[0]\n"
+                    "udot z24.s, z21.b, z0.b[1]\n"
+                    "udot z25.s, z21.b, z1.b[1]\n"
+                    "udot z26.s, z21.b, z2.b[1]\n"
+                    "udot z27.s, z21.b, z3.b[1]\n"
+                    "udot z28.s, z21.b, z4.b[1]\n"
+                    "udot z29.s, z21.b, z5.b[1]\n"
+                    "udot z30.s, z21.b, z6.b[1]\n"
+                    "udot z31.s, z21.b, z7.b[1]\n"
+                    "udot z24.s, z22.b, z0.b[2]\n"
+                    "udot z25.s, z22.b, z1.b[2]\n"
+                    "udot z26.s, z22.b, z2.b[2]\n"
+                    "udot z27.s, z22.b, z3.b[2]\n"
+                    "udot z28.s, z22.b, z4.b[2]\n"
+                    "udot z29.s, z22.b, z5.b[2]\n"
+                    "udot z30.s, z22.b, z6.b[2]\n"
+                    "udot z31.s, z22.b, z7.b[2]\n"
+                    "udot z24.s, z23.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
+                    "udot z25.s, z23.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
+                    "udot z26.s, z23.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
+                    "udot z27.s, z23.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
+                    "udot z28.s, z23.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
+                    "udot z29.s, z23.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
+                    "udot z30.s, z23.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
+                    "udot z31.s, z23.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z31.s, z16.b, z7.b[0]\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -3228,52 +3705,54 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "mov z28.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "mov z29.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "mov z30.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "mov z31.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "mov z25.s, #0\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
-                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "mov z26.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
-                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "mov z31.s, #0\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
                     "udot z28.s, z16.b, z4.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
                     "udot z29.s, z17.b, z5.b[1]\n"
@@ -3361,88 +3840,85 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z17.b, z5.b[1]\n"
                     "udot z30.s, z17.b, z6.b[1]\n"
                     "udot z31.s, z17.b, z7.b[1]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "b.eq 3f\n"
+                    "4:\n"
+                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "addvl c_ptr1, c_ptr1, #1\n"
+                    "mov z25.s, #0\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
+                    "mov z26.s, #0\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "mov z27.s, #0\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "b.eq 3f\n"
-                    "4:\n"
-                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "mov z24.s, #0\n"
+                    "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
-                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
-                    "mov z25.s, #0\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                     "udot z24.s, z16.b, z0.b[0]\n"
-                    "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
+                    "st1w z30.s, p7, [c_ptr6]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "udot z25.s, z16.b, z1.b[0]\n"
-                    "st1w z27.s, p7, [c_ptr3]\n"
-                    "mov z27.s, #0\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "udot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
-                    "st1w z28.s, p7, [c_ptr4]\n"
-                    "mov z28.s, #0\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "st1w z31.s, p7, [c_ptr7]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
                     "udot z27.s, z16.b, z3.b[0]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
-                    "udot z25.s, z17.b, z1.b[1]\n"
-                    "st1w z29.s, p7, [c_ptr5]\n"
-                    "mov z29.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "udot z28.s, z16.b, z4.b[0]\n"
-                    "st1w z30.s, p7, [c_ptr6]\n"
-                    "mov z30.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
-                    "st1w z31.s, p7, [c_ptr7]\n"
-                    "mov z31.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "udot z29.s, z17.b, z5.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "udot z30.s, z17.b, z6.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "udot z31.s, z17.b, z7.b[1]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "udot z24.s, z18.b, z0.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
-                    "udot z25.s, z18.b, z1.b[2]\n"
                     "addvl %[b_ptr0], %[b_ptr0], #2\n"
+                    "udot z25.s, z18.b, z1.b[2]\n"
                     "udot z26.s, z18.b, z2.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "udot z27.s, z18.b, z3.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "udot z28.s, z18.b, z4.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "udot z29.s, z18.b, z5.b[2]\n"
                     "udot z30.s, z18.b, z6.b[2]\n"
                     "udot z31.s, z18.b, z7.b[2]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "udot z24.s, z19.b, z0.b[3]\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
                     "udot z25.s, z19.b, z1.b[3]\n"
@@ -3460,7 +3936,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z31.s, z19.b, z7.b[3]\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
                     "udot z24.s, z20.b, z0.b[0]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "udot z25.s, z20.b, z1.b[0]\n"
                     "udot z26.s, z20.b, z2.b[0]\n"
                     "udot z27.s, z20.b, z3.b[0]\n"
@@ -3468,7 +3943,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z20.b, z5.b[0]\n"
                     "udot z30.s, z20.b, z6.b[0]\n"
                     "udot z31.s, z20.b, z7.b[0]\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "udot z24.s, z21.b, z0.b[1]\n"
                     "udot z25.s, z21.b, z1.b[1]\n"
                     "udot z26.s, z21.b, z2.b[1]\n"
@@ -3477,7 +3951,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z21.b, z5.b[1]\n"
                     "udot z30.s, z21.b, z6.b[1]\n"
                     "udot z31.s, z21.b, z7.b[1]\n"
-                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "udot z24.s, z22.b, z0.b[2]\n"
                     "udot z25.s, z22.b, z1.b[2]\n"
                     "udot z26.s, z22.b, z2.b[2]\n"
@@ -3486,7 +3959,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z22.b, z5.b[2]\n"
                     "udot z30.s, z22.b, z6.b[2]\n"
                     "udot z31.s, z22.b, z7.b[2]\n"
-                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "udot z24.s, z23.b, z0.b[3]\n"
                     "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
                     "udot z25.s, z23.b, z1.b[3]\n"
@@ -3504,7 +3976,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z31.s, z23.b, z7.b[3]\n"
                     "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
                     "udot z24.s, z16.b, z0.b[0]\n"
-                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
                     "udot z25.s, z16.b, z1.b[0]\n"
                     "udot z26.s, z16.b, z2.b[0]\n"
                     "udot z27.s, z16.b, z3.b[0]\n"
@@ -3512,7 +3983,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z16.b, z5.b[0]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "udot z24.s, z17.b, z0.b[1]\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
@@ -3521,53 +3991,60 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z17.b, z5.b[1]\n"
                     "udot z30.s, z17.b, z6.b[1]\n"
                     "udot z31.s, z17.b, z7.b[1]\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "b.ne 4b\n"
                     "3:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "mov z24.s, #0\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
-                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "addvl c_ptr2, c_ptr2, #1\n"
-                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "mov z26.s, #0\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
                     "mov z27.s, #0\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "udot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
-                    "udot z27.s, z16.b, z3.b[0]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
-                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
-                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
                     "addvl c_ptr6, c_ptr6, #1\n"
-                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
@@ -3656,23 +4133,133 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z17.b, z5.b[1]\n"
                     "udot z30.s, z17.b, z6.b[1]\n"
                     "udot z31.s, z17.b, z7.b[1]\n"
+                    "b 5f\n"
                     "2:\n"
+                    "mov z24.s, #0\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "mov z25.s, #0\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "mov z26.s, #0\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "mov z29.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "mov z30.s, #0\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "udot z26.s, z17.b, z2.b[1]\n"
+                    "udot z27.s, z17.b, z3.b[1]\n"
+                    "udot z28.s, z17.b, z4.b[1]\n"
+                    "udot z29.s, z17.b, z5.b[1]\n"
+                    "udot z30.s, z17.b, z6.b[1]\n"
+                    "udot z31.s, z17.b, z7.b[1]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "udot z24.s, z18.b, z0.b[2]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #2\n"
+                    "udot z25.s, z18.b, z1.b[2]\n"
+                    "udot z26.s, z18.b, z2.b[2]\n"
+                    "udot z27.s, z18.b, z3.b[2]\n"
+                    "udot z28.s, z18.b, z4.b[2]\n"
+                    "udot z29.s, z18.b, z5.b[2]\n"
+                    "udot z30.s, z18.b, z6.b[2]\n"
+                    "udot z31.s, z18.b, z7.b[2]\n"
+                    "udot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+                    "udot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+                    "udot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+                    "udot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+                    "udot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+                    "udot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+                    "udot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+                    "udot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+                    "udot z24.s, z20.b, z0.b[0]\n"
+                    "udot z25.s, z20.b, z1.b[0]\n"
+                    "udot z26.s, z20.b, z2.b[0]\n"
+                    "udot z27.s, z20.b, z3.b[0]\n"
+                    "udot z28.s, z20.b, z4.b[0]\n"
+                    "udot z29.s, z20.b, z5.b[0]\n"
+                    "udot z30.s, z20.b, z6.b[0]\n"
+                    "udot z31.s, z20.b, z7.b[0]\n"
+                    "udot z24.s, z21.b, z0.b[1]\n"
+                    "udot z25.s, z21.b, z1.b[1]\n"
+                    "udot z26.s, z21.b, z2.b[1]\n"
+                    "udot z27.s, z21.b, z3.b[1]\n"
+                    "udot z28.s, z21.b, z4.b[1]\n"
+                    "udot z29.s, z21.b, z5.b[1]\n"
+                    "udot z30.s, z21.b, z6.b[1]\n"
+                    "udot z31.s, z21.b, z7.b[1]\n"
+                    "udot z24.s, z22.b, z0.b[2]\n"
+                    "udot z25.s, z22.b, z1.b[2]\n"
+                    "udot z26.s, z22.b, z2.b[2]\n"
+                    "udot z27.s, z22.b, z3.b[2]\n"
+                    "udot z28.s, z22.b, z4.b[2]\n"
+                    "udot z29.s, z22.b, z5.b[2]\n"
+                    "udot z30.s, z22.b, z6.b[2]\n"
+                    "udot z31.s, z22.b, z7.b[2]\n"
+                    "udot z24.s, z23.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
+                    "udot z25.s, z23.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
+                    "udot z26.s, z23.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
+                    "udot z27.s, z23.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
+                    "udot z28.s, z23.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
+                    "udot z29.s, z23.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
+                    "udot z30.s, z23.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
+                    "udot z31.s, z23.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z31.s, z16.b, z7.b[0]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "udot z26.s, z17.b, z2.b[1]\n"
+                    "udot z27.s, z17.b, z3.b[1]\n"
+                    "udot z28.s, z17.b, z4.b[1]\n"
+                    "udot z29.s, z17.b, z5.b[1]\n"
+                    "udot z30.s, z17.b, z6.b[1]\n"
+                    "udot z31.s, z17.b, z7.b[1]\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -3751,52 +4338,54 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "mov z28.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "mov z29.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "mov z30.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "mov z31.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "mov z25.s, #0\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
-                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "mov z26.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
-                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "mov z31.s, #0\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
                     "udot z28.s, z16.b, z4.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
                     "udot z29.s, z17.b, z5.b[1]\n"
@@ -3893,82 +4482,80 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z18.b, z5.b[2]\n"
                     "udot z30.s, z18.b, z6.b[2]\n"
                     "udot z31.s, z18.b, z7.b[2]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "b.eq 3f\n"
+                    "4:\n"
+                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "addvl c_ptr1, c_ptr1, #1\n"
+                    "mov z25.s, #0\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
+                    "mov z26.s, #0\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "mov z27.s, #0\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "b.eq 3f\n"
-                    "4:\n"
-                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "mov z24.s, #0\n"
+                    "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
-                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
-                    "mov z25.s, #0\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                     "udot z24.s, z16.b, z0.b[0]\n"
-                    "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
+                    "st1w z30.s, p7, [c_ptr6]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "udot z25.s, z16.b, z1.b[0]\n"
-                    "st1w z27.s, p7, [c_ptr3]\n"
-                    "mov z27.s, #0\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "udot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
-                    "st1w z28.s, p7, [c_ptr4]\n"
-                    "mov z28.s, #0\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "st1w z31.s, p7, [c_ptr7]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
                     "udot z27.s, z16.b, z3.b[0]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
-                    "udot z25.s, z17.b, z1.b[1]\n"
-                    "st1w z29.s, p7, [c_ptr5]\n"
-                    "mov z29.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "udot z28.s, z16.b, z4.b[0]\n"
-                    "st1w z30.s, p7, [c_ptr6]\n"
-                    "mov z30.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
-                    "st1w z31.s, p7, [c_ptr7]\n"
-                    "mov z31.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "udot z29.s, z17.b, z5.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "udot z30.s, z17.b, z6.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "udot z31.s, z17.b, z7.b[1]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "udot z24.s, z18.b, z0.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "udot z25.s, z18.b, z1.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "udot z26.s, z18.b, z2.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "udot z27.s, z18.b, z3.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "udot z28.s, z18.b, z4.b[2]\n"
                     "udot z29.s, z18.b, z5.b[2]\n"
                     "udot z30.s, z18.b, z6.b[2]\n"
@@ -3995,12 +4582,10 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z25.s, z20.b, z1.b[0]\n"
                     "udot z26.s, z20.b, z2.b[0]\n"
                     "udot z27.s, z20.b, z3.b[0]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "udot z28.s, z20.b, z4.b[0]\n"
                     "udot z29.s, z20.b, z5.b[0]\n"
                     "udot z30.s, z20.b, z6.b[0]\n"
                     "udot z31.s, z20.b, z7.b[0]\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "udot z24.s, z21.b, z0.b[1]\n"
                     "udot z25.s, z21.b, z1.b[1]\n"
                     "udot z26.s, z21.b, z2.b[1]\n"
@@ -4009,7 +4594,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z21.b, z5.b[1]\n"
                     "udot z30.s, z21.b, z6.b[1]\n"
                     "udot z31.s, z21.b, z7.b[1]\n"
-                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "udot z24.s, z22.b, z0.b[2]\n"
                     "udot z25.s, z22.b, z1.b[2]\n"
                     "udot z26.s, z22.b, z2.b[2]\n"
@@ -4018,7 +4602,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z22.b, z5.b[2]\n"
                     "udot z30.s, z22.b, z6.b[2]\n"
                     "udot z31.s, z22.b, z7.b[2]\n"
-                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "udot z24.s, z23.b, z0.b[3]\n"
                     "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
                     "udot z25.s, z23.b, z1.b[3]\n"
@@ -4036,7 +4619,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z31.s, z23.b, z7.b[3]\n"
                     "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
                     "udot z24.s, z16.b, z0.b[0]\n"
-                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
                     "udot z25.s, z16.b, z1.b[0]\n"
                     "udot z26.s, z16.b, z2.b[0]\n"
                     "udot z27.s, z16.b, z3.b[0]\n"
@@ -4044,7 +4626,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z16.b, z5.b[0]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "udot z24.s, z17.b, z0.b[1]\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
@@ -4053,7 +4634,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z17.b, z5.b[1]\n"
                     "udot z30.s, z17.b, z6.b[1]\n"
                     "udot z31.s, z17.b, z7.b[1]\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "udot z24.s, z18.b, z0.b[2]\n"
                     "udot z25.s, z18.b, z1.b[2]\n"
                     "udot z26.s, z18.b, z2.b[2]\n"
@@ -4062,53 +4642,60 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z18.b, z5.b[2]\n"
                     "udot z30.s, z18.b, z6.b[2]\n"
                     "udot z31.s, z18.b, z7.b[2]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "b.ne 4b\n"
                     "3:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "mov z24.s, #0\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
-                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "addvl c_ptr2, c_ptr2, #1\n"
-                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "mov z26.s, #0\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
                     "mov z27.s, #0\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "udot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
-                    "udot z27.s, z16.b, z3.b[0]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
-                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
-                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
                     "addvl c_ptr6, c_ptr6, #1\n"
-                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
@@ -4206,23 +4793,142 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z18.b, z5.b[2]\n"
                     "udot z30.s, z18.b, z6.b[2]\n"
                     "udot z31.s, z18.b, z7.b[2]\n"
+                    "b 5f\n"
                     "2:\n"
+                    "mov z24.s, #0\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "mov z25.s, #0\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "mov z26.s, #0\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "mov z29.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "mov z30.s, #0\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "udot z26.s, z17.b, z2.b[1]\n"
+                    "udot z27.s, z17.b, z3.b[1]\n"
+                    "udot z28.s, z17.b, z4.b[1]\n"
+                    "udot z29.s, z17.b, z5.b[1]\n"
+                    "udot z30.s, z17.b, z6.b[1]\n"
+                    "udot z31.s, z17.b, z7.b[1]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "udot z24.s, z18.b, z0.b[2]\n"
+                    "udot z25.s, z18.b, z1.b[2]\n"
+                    "udot z26.s, z18.b, z2.b[2]\n"
+                    "udot z27.s, z18.b, z3.b[2]\n"
+                    "udot z28.s, z18.b, z4.b[2]\n"
+                    "udot z29.s, z18.b, z5.b[2]\n"
+                    "udot z30.s, z18.b, z6.b[2]\n"
+                    "udot z31.s, z18.b, z7.b[2]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "udot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+                    "udot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+                    "udot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+                    "udot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+                    "udot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+                    "udot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+                    "udot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+                    "udot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+                    "udot z24.s, z20.b, z0.b[0]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #3\n"
+                    "udot z25.s, z20.b, z1.b[0]\n"
+                    "udot z26.s, z20.b, z2.b[0]\n"
+                    "udot z27.s, z20.b, z3.b[0]\n"
+                    "udot z28.s, z20.b, z4.b[0]\n"
+                    "udot z29.s, z20.b, z5.b[0]\n"
+                    "udot z30.s, z20.b, z6.b[0]\n"
+                    "udot z31.s, z20.b, z7.b[0]\n"
+                    "udot z24.s, z21.b, z0.b[1]\n"
+                    "udot z25.s, z21.b, z1.b[1]\n"
+                    "udot z26.s, z21.b, z2.b[1]\n"
+                    "udot z27.s, z21.b, z3.b[1]\n"
+                    "udot z28.s, z21.b, z4.b[1]\n"
+                    "udot z29.s, z21.b, z5.b[1]\n"
+                    "udot z30.s, z21.b, z6.b[1]\n"
+                    "udot z31.s, z21.b, z7.b[1]\n"
+                    "udot z24.s, z22.b, z0.b[2]\n"
+                    "udot z25.s, z22.b, z1.b[2]\n"
+                    "udot z26.s, z22.b, z2.b[2]\n"
+                    "udot z27.s, z22.b, z3.b[2]\n"
+                    "udot z28.s, z22.b, z4.b[2]\n"
+                    "udot z29.s, z22.b, z5.b[2]\n"
+                    "udot z30.s, z22.b, z6.b[2]\n"
+                    "udot z31.s, z22.b, z7.b[2]\n"
+                    "udot z24.s, z23.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
+                    "udot z25.s, z23.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
+                    "udot z26.s, z23.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
+                    "udot z27.s, z23.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
+                    "udot z28.s, z23.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
+                    "udot z29.s, z23.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
+                    "udot z30.s, z23.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
+                    "udot z31.s, z23.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z31.s, z16.b, z7.b[0]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "udot z26.s, z17.b, z2.b[1]\n"
+                    "udot z27.s, z17.b, z3.b[1]\n"
+                    "udot z28.s, z17.b, z4.b[1]\n"
+                    "udot z29.s, z17.b, z5.b[1]\n"
+                    "udot z30.s, z17.b, z6.b[1]\n"
+                    "udot z31.s, z17.b, z7.b[1]\n"
+                    "udot z24.s, z18.b, z0.b[2]\n"
+                    "udot z25.s, z18.b, z1.b[2]\n"
+                    "udot z26.s, z18.b, z2.b[2]\n"
+                    "udot z27.s, z18.b, z3.b[2]\n"
+                    "udot z28.s, z18.b, z4.b[2]\n"
+                    "udot z29.s, z18.b, z5.b[2]\n"
+                    "udot z30.s, z18.b, z6.b[2]\n"
+                    "udot z31.s, z18.b, z7.b[2]\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -4301,52 +5007,54 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "mov z28.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "mov z29.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "mov z30.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "mov z31.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "mov z25.s, #0\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
-                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "mov z26.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
-                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "mov z31.s, #0\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
                     "udot z28.s, z16.b, z4.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
                     "udot z29.s, z17.b, z5.b[1]\n"
@@ -4452,82 +5160,80 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z19.b, z5.b[3]\n"
                     "udot z30.s, z19.b, z6.b[3]\n"
                     "udot z31.s, z19.b, z7.b[3]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "b.eq 3f\n"
+                    "4:\n"
+                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "addvl c_ptr1, c_ptr1, #1\n"
+                    "mov z25.s, #0\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
+                    "mov z26.s, #0\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "mov z27.s, #0\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "b.eq 3f\n"
-                    "4:\n"
-                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "mov z24.s, #0\n"
+                    "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
-                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
-                    "mov z25.s, #0\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                     "udot z24.s, z16.b, z0.b[0]\n"
-                    "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
+                    "st1w z30.s, p7, [c_ptr6]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "udot z25.s, z16.b, z1.b[0]\n"
-                    "st1w z27.s, p7, [c_ptr3]\n"
-                    "mov z27.s, #0\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "udot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
-                    "st1w z28.s, p7, [c_ptr4]\n"
-                    "mov z28.s, #0\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "st1w z31.s, p7, [c_ptr7]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
                     "udot z27.s, z16.b, z3.b[0]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
-                    "udot z25.s, z17.b, z1.b[1]\n"
-                    "st1w z29.s, p7, [c_ptr5]\n"
-                    "mov z29.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "udot z28.s, z16.b, z4.b[0]\n"
-                    "st1w z30.s, p7, [c_ptr6]\n"
-                    "mov z30.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
-                    "st1w z31.s, p7, [c_ptr7]\n"
-                    "mov z31.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "udot z29.s, z17.b, z5.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "udot z30.s, z17.b, z6.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "udot z31.s, z17.b, z7.b[1]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "udot z24.s, z18.b, z0.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "udot z25.s, z18.b, z1.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "udot z26.s, z18.b, z2.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "udot z27.s, z18.b, z3.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "udot z28.s, z18.b, z4.b[2]\n"
                     "udot z29.s, z18.b, z5.b[2]\n"
                     "udot z30.s, z18.b, z6.b[2]\n"
@@ -4559,7 +5265,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z20.b, z5.b[0]\n"
                     "udot z30.s, z20.b, z6.b[0]\n"
                     "udot z31.s, z20.b, z7.b[0]\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "udot z24.s, z21.b, z0.b[1]\n"
                     "udot z25.s, z21.b, z1.b[1]\n"
                     "udot z26.s, z21.b, z2.b[1]\n"
@@ -4568,7 +5273,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z21.b, z5.b[1]\n"
                     "udot z30.s, z21.b, z6.b[1]\n"
                     "udot z31.s, z21.b, z7.b[1]\n"
-                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "udot z24.s, z22.b, z0.b[2]\n"
                     "udot z25.s, z22.b, z1.b[2]\n"
                     "udot z26.s, z22.b, z2.b[2]\n"
@@ -4577,7 +5281,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z22.b, z5.b[2]\n"
                     "udot z30.s, z22.b, z6.b[2]\n"
                     "udot z31.s, z22.b, z7.b[2]\n"
-                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "udot z24.s, z23.b, z0.b[3]\n"
                     "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
                     "udot z25.s, z23.b, z1.b[3]\n"
@@ -4595,7 +5298,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z31.s, z23.b, z7.b[3]\n"
                     "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
                     "udot z24.s, z16.b, z0.b[0]\n"
-                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
                     "udot z25.s, z16.b, z1.b[0]\n"
                     "udot z26.s, z16.b, z2.b[0]\n"
                     "udot z27.s, z16.b, z3.b[0]\n"
@@ -4603,7 +5305,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z16.b, z5.b[0]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "udot z24.s, z17.b, z0.b[1]\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
@@ -4612,7 +5313,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z17.b, z5.b[1]\n"
                     "udot z30.s, z17.b, z6.b[1]\n"
                     "udot z31.s, z17.b, z7.b[1]\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "udot z24.s, z18.b, z0.b[2]\n"
                     "udot z25.s, z18.b, z1.b[2]\n"
                     "udot z26.s, z18.b, z2.b[2]\n"
@@ -4621,7 +5321,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z18.b, z5.b[2]\n"
                     "udot z30.s, z18.b, z6.b[2]\n"
                     "udot z31.s, z18.b, z7.b[2]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "udot z24.s, z19.b, z0.b[3]\n"
                     "udot z25.s, z19.b, z1.b[3]\n"
                     "udot z26.s, z19.b, z2.b[3]\n"
@@ -4630,53 +5329,60 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z19.b, z5.b[3]\n"
                     "udot z30.s, z19.b, z6.b[3]\n"
                     "udot z31.s, z19.b, z7.b[3]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "b.ne 4b\n"
                     "3:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "mov z24.s, #0\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
-                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "addvl c_ptr2, c_ptr2, #1\n"
-                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "mov z26.s, #0\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
                     "mov z27.s, #0\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "udot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
-                    "udot z27.s, z16.b, z3.b[0]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
-                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
-                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
                     "addvl c_ptr6, c_ptr6, #1\n"
-                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
@@ -4783,23 +5489,151 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z19.b, z5.b[3]\n"
                     "udot z30.s, z19.b, z6.b[3]\n"
                     "udot z31.s, z19.b, z7.b[3]\n"
+                    "b 5f\n"
                     "2:\n"
+                    "mov z24.s, #0\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "mov z25.s, #0\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "mov z26.s, #0\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "mov z29.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "mov z30.s, #0\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "udot z26.s, z17.b, z2.b[1]\n"
+                    "udot z27.s, z17.b, z3.b[1]\n"
+                    "udot z28.s, z17.b, z4.b[1]\n"
+                    "udot z29.s, z17.b, z5.b[1]\n"
+                    "udot z30.s, z17.b, z6.b[1]\n"
+                    "udot z31.s, z17.b, z7.b[1]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "udot z24.s, z18.b, z0.b[2]\n"
+                    "udot z25.s, z18.b, z1.b[2]\n"
+                    "udot z26.s, z18.b, z2.b[2]\n"
+                    "udot z27.s, z18.b, z3.b[2]\n"
+                    "udot z28.s, z18.b, z4.b[2]\n"
+                    "udot z29.s, z18.b, z5.b[2]\n"
+                    "udot z30.s, z18.b, z6.b[2]\n"
+                    "udot z31.s, z18.b, z7.b[2]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "udot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+                    "udot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+                    "udot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+                    "udot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+                    "udot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+                    "udot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+                    "udot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+                    "udot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+                    "udot z24.s, z20.b, z0.b[0]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "udot z25.s, z20.b, z1.b[0]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                    "udot z26.s, z20.b, z2.b[0]\n"
+                    "udot z27.s, z20.b, z3.b[0]\n"
+                    "udot z28.s, z20.b, z4.b[0]\n"
+                    "udot z29.s, z20.b, z5.b[0]\n"
+                    "udot z30.s, z20.b, z6.b[0]\n"
+                    "udot z31.s, z20.b, z7.b[0]\n"
+                    "udot z24.s, z21.b, z0.b[1]\n"
+                    "udot z25.s, z21.b, z1.b[1]\n"
+                    "udot z26.s, z21.b, z2.b[1]\n"
+                    "udot z27.s, z21.b, z3.b[1]\n"
+                    "udot z28.s, z21.b, z4.b[1]\n"
+                    "udot z29.s, z21.b, z5.b[1]\n"
+                    "udot z30.s, z21.b, z6.b[1]\n"
+                    "udot z31.s, z21.b, z7.b[1]\n"
+                    "udot z24.s, z22.b, z0.b[2]\n"
+                    "udot z25.s, z22.b, z1.b[2]\n"
+                    "udot z26.s, z22.b, z2.b[2]\n"
+                    "udot z27.s, z22.b, z3.b[2]\n"
+                    "udot z28.s, z22.b, z4.b[2]\n"
+                    "udot z29.s, z22.b, z5.b[2]\n"
+                    "udot z30.s, z22.b, z6.b[2]\n"
+                    "udot z31.s, z22.b, z7.b[2]\n"
+                    "udot z24.s, z23.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
+                    "udot z25.s, z23.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
+                    "udot z26.s, z23.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
+                    "udot z27.s, z23.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
+                    "udot z28.s, z23.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
+                    "udot z29.s, z23.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
+                    "udot z30.s, z23.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
+                    "udot z31.s, z23.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z31.s, z16.b, z7.b[0]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "udot z26.s, z17.b, z2.b[1]\n"
+                    "udot z27.s, z17.b, z3.b[1]\n"
+                    "udot z28.s, z17.b, z4.b[1]\n"
+                    "udot z29.s, z17.b, z5.b[1]\n"
+                    "udot z30.s, z17.b, z6.b[1]\n"
+                    "udot z31.s, z17.b, z7.b[1]\n"
+                    "udot z24.s, z18.b, z0.b[2]\n"
+                    "udot z25.s, z18.b, z1.b[2]\n"
+                    "udot z26.s, z18.b, z2.b[2]\n"
+                    "udot z27.s, z18.b, z3.b[2]\n"
+                    "udot z28.s, z18.b, z4.b[2]\n"
+                    "udot z29.s, z18.b, z5.b[2]\n"
+                    "udot z30.s, z18.b, z6.b[2]\n"
+                    "udot z31.s, z18.b, z7.b[2]\n"
+                    "udot z24.s, z19.b, z0.b[3]\n"
+                    "udot z25.s, z19.b, z1.b[3]\n"
+                    "udot z26.s, z19.b, z2.b[3]\n"
+                    "udot z27.s, z19.b, z3.b[3]\n"
+                    "udot z28.s, z19.b, z4.b[3]\n"
+                    "udot z29.s, z19.b, z5.b[3]\n"
+                    "udot z30.s, z19.b, z6.b[3]\n"
+                    "udot z31.s, z19.b, z7.b[3]\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -4878,52 +5712,54 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "mov z28.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "mov z29.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "mov z30.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "mov z31.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "mov z25.s, #0\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
-                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "mov z26.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
-                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "mov z31.s, #0\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
                     "udot z28.s, z16.b, z4.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
                     "udot z29.s, z17.b, z5.b[1]\n"
@@ -5046,82 +5882,80 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z20.b, z5.b[0]\n"
                     "udot z30.s, z20.b, z6.b[0]\n"
                     "udot z31.s, z20.b, z7.b[0]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "b.eq 3f\n"
+                    "4:\n"
+                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "addvl c_ptr1, c_ptr1, #1\n"
+                    "mov z25.s, #0\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
+                    "mov z26.s, #0\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "mov z27.s, #0\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "b.eq 3f\n"
-                    "4:\n"
-                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "mov z24.s, #0\n"
+                    "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
-                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
-                    "mov z25.s, #0\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                     "udot z24.s, z16.b, z0.b[0]\n"
-                    "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
+                    "st1w z30.s, p7, [c_ptr6]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "udot z25.s, z16.b, z1.b[0]\n"
-                    "st1w z27.s, p7, [c_ptr3]\n"
-                    "mov z27.s, #0\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "udot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
-                    "st1w z28.s, p7, [c_ptr4]\n"
-                    "mov z28.s, #0\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "st1w z31.s, p7, [c_ptr7]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
                     "udot z27.s, z16.b, z3.b[0]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
-                    "udot z25.s, z17.b, z1.b[1]\n"
-                    "st1w z29.s, p7, [c_ptr5]\n"
-                    "mov z29.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "udot z28.s, z16.b, z4.b[0]\n"
-                    "st1w z30.s, p7, [c_ptr6]\n"
-                    "mov z30.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
-                    "st1w z31.s, p7, [c_ptr7]\n"
-                    "mov z31.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "udot z29.s, z17.b, z5.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "udot z30.s, z17.b, z6.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "udot z31.s, z17.b, z7.b[1]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "udot z24.s, z18.b, z0.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "udot z25.s, z18.b, z1.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "udot z26.s, z18.b, z2.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "udot z27.s, z18.b, z3.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "udot z28.s, z18.b, z4.b[2]\n"
                     "udot z29.s, z18.b, z5.b[2]\n"
                     "udot z30.s, z18.b, z6.b[2]\n"
@@ -5162,7 +5996,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z21.b, z5.b[1]\n"
                     "udot z30.s, z21.b, z6.b[1]\n"
                     "udot z31.s, z21.b, z7.b[1]\n"
-                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "udot z24.s, z22.b, z0.b[2]\n"
                     "udot z25.s, z22.b, z1.b[2]\n"
                     "udot z26.s, z22.b, z2.b[2]\n"
@@ -5171,7 +6004,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z22.b, z5.b[2]\n"
                     "udot z30.s, z22.b, z6.b[2]\n"
                     "udot z31.s, z22.b, z7.b[2]\n"
-                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "udot z24.s, z23.b, z0.b[3]\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
                     "udot z25.s, z23.b, z1.b[3]\n"
@@ -5189,7 +6021,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z31.s, z23.b, z7.b[3]\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
                     "udot z24.s, z16.b, z0.b[0]\n"
-                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
                     "udot z25.s, z16.b, z1.b[0]\n"
                     "udot z26.s, z16.b, z2.b[0]\n"
                     "udot z27.s, z16.b, z3.b[0]\n"
@@ -5197,7 +6028,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z16.b, z5.b[0]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "udot z24.s, z17.b, z0.b[1]\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
@@ -5206,7 +6036,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z17.b, z5.b[1]\n"
                     "udot z30.s, z17.b, z6.b[1]\n"
                     "udot z31.s, z17.b, z7.b[1]\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "udot z24.s, z18.b, z0.b[2]\n"
                     "udot z25.s, z18.b, z1.b[2]\n"
                     "udot z26.s, z18.b, z2.b[2]\n"
@@ -5215,7 +6044,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z18.b, z5.b[2]\n"
                     "udot z30.s, z18.b, z6.b[2]\n"
                     "udot z31.s, z18.b, z7.b[2]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "udot z24.s, z19.b, z0.b[3]\n"
                     "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
                     "udot z25.s, z19.b, z1.b[3]\n"
@@ -5233,7 +6061,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z31.s, z19.b, z7.b[3]\n"
                     "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
                     "udot z24.s, z20.b, z0.b[0]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "udot z25.s, z20.b, z1.b[0]\n"
                     "udot z26.s, z20.b, z2.b[0]\n"
                     "udot z27.s, z20.b, z3.b[0]\n"
@@ -5241,53 +6068,60 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z20.b, z5.b[0]\n"
                     "udot z30.s, z20.b, z6.b[0]\n"
                     "udot z31.s, z20.b, z7.b[0]\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "b.ne 4b\n"
                     "3:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "mov z24.s, #0\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
-                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "addvl c_ptr2, c_ptr2, #1\n"
-                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "mov z26.s, #0\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
                     "mov z27.s, #0\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "udot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
-                    "udot z27.s, z16.b, z3.b[0]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
-                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
-                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
                     "addvl c_ptr6, c_ptr6, #1\n"
-                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
@@ -5411,23 +6245,168 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z20.b, z5.b[0]\n"
                     "udot z30.s, z20.b, z6.b[0]\n"
                     "udot z31.s, z20.b, z7.b[0]\n"
+                    "b 5f\n"
                     "2:\n"
+                    "mov z24.s, #0\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "mov z25.s, #0\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "mov z26.s, #0\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "mov z29.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "mov z30.s, #0\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "udot z26.s, z17.b, z2.b[1]\n"
+                    "udot z27.s, z17.b, z3.b[1]\n"
+                    "udot z28.s, z17.b, z4.b[1]\n"
+                    "udot z29.s, z17.b, z5.b[1]\n"
+                    "udot z30.s, z17.b, z6.b[1]\n"
+                    "udot z31.s, z17.b, z7.b[1]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "udot z24.s, z18.b, z0.b[2]\n"
+                    "udot z25.s, z18.b, z1.b[2]\n"
+                    "udot z26.s, z18.b, z2.b[2]\n"
+                    "udot z27.s, z18.b, z3.b[2]\n"
+                    "udot z28.s, z18.b, z4.b[2]\n"
+                    "udot z29.s, z18.b, z5.b[2]\n"
+                    "udot z30.s, z18.b, z6.b[2]\n"
+                    "udot z31.s, z18.b, z7.b[2]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "udot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+                    "udot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+                    "udot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+                    "udot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+                    "udot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+                    "udot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+                    "udot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+                    "udot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+                    "udot z24.s, z20.b, z0.b[0]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "udot z25.s, z20.b, z1.b[0]\n"
+                    "udot z26.s, z20.b, z2.b[0]\n"
+                    "udot z27.s, z20.b, z3.b[0]\n"
+                    "udot z28.s, z20.b, z4.b[0]\n"
+                    "udot z29.s, z20.b, z5.b[0]\n"
+                    "udot z30.s, z20.b, z6.b[0]\n"
+                    "udot z31.s, z20.b, z7.b[0]\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "udot z24.s, z21.b, z0.b[1]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #5\n"
+                    "udot z25.s, z21.b, z1.b[1]\n"
+                    "udot z26.s, z21.b, z2.b[1]\n"
+                    "udot z27.s, z21.b, z3.b[1]\n"
+                    "udot z28.s, z21.b, z4.b[1]\n"
+                    "udot z29.s, z21.b, z5.b[1]\n"
+                    "udot z30.s, z21.b, z6.b[1]\n"
+                    "udot z31.s, z21.b, z7.b[1]\n"
+                    "udot z24.s, z22.b, z0.b[2]\n"
+                    "udot z25.s, z22.b, z1.b[2]\n"
+                    "udot z26.s, z22.b, z2.b[2]\n"
+                    "udot z27.s, z22.b, z3.b[2]\n"
+                    "udot z28.s, z22.b, z4.b[2]\n"
+                    "udot z29.s, z22.b, z5.b[2]\n"
+                    "udot z30.s, z22.b, z6.b[2]\n"
+                    "udot z31.s, z22.b, z7.b[2]\n"
+                    "udot z24.s, z23.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
+                    "udot z25.s, z23.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
+                    "udot z26.s, z23.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
+                    "udot z27.s, z23.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
+                    "udot z28.s, z23.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
+                    "udot z29.s, z23.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
+                    "udot z30.s, z23.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
+                    "udot z31.s, z23.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z31.s, z16.b, z7.b[0]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "udot z26.s, z17.b, z2.b[1]\n"
+                    "udot z27.s, z17.b, z3.b[1]\n"
+                    "udot z28.s, z17.b, z4.b[1]\n"
+                    "udot z29.s, z17.b, z5.b[1]\n"
+                    "udot z30.s, z17.b, z6.b[1]\n"
+                    "udot z31.s, z17.b, z7.b[1]\n"
+                    "udot z24.s, z18.b, z0.b[2]\n"
+                    "udot z25.s, z18.b, z1.b[2]\n"
+                    "udot z26.s, z18.b, z2.b[2]\n"
+                    "udot z27.s, z18.b, z3.b[2]\n"
+                    "udot z28.s, z18.b, z4.b[2]\n"
+                    "udot z29.s, z18.b, z5.b[2]\n"
+                    "udot z30.s, z18.b, z6.b[2]\n"
+                    "udot z31.s, z18.b, z7.b[2]\n"
+                    "udot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
+                    "udot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
+                    "udot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
+                    "udot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
+                    "udot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
+                    "udot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
+                    "udot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
+                    "udot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
+                    "udot z24.s, z20.b, z0.b[0]\n"
+                    "udot z25.s, z20.b, z1.b[0]\n"
+                    "udot z26.s, z20.b, z2.b[0]\n"
+                    "udot z27.s, z20.b, z3.b[0]\n"
+                    "udot z28.s, z20.b, z4.b[0]\n"
+                    "udot z29.s, z20.b, z5.b[0]\n"
+                    "udot z30.s, z20.b, z6.b[0]\n"
+                    "udot z31.s, z20.b, z7.b[0]\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -5506,52 +6485,54 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "mov z28.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "mov z29.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "mov z30.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "mov z31.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "mov z25.s, #0\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
-                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "mov z26.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
-                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "mov z31.s, #0\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
                     "udot z28.s, z16.b, z4.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
                     "udot z29.s, z17.b, z5.b[1]\n"
@@ -5683,82 +6664,80 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z21.b, z5.b[1]\n"
                     "udot z30.s, z21.b, z6.b[1]\n"
                     "udot z31.s, z21.b, z7.b[1]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "b.eq 3f\n"
+                    "4:\n"
+                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "addvl c_ptr1, c_ptr1, #1\n"
+                    "mov z25.s, #0\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
+                    "mov z26.s, #0\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "mov z27.s, #0\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "b.eq 3f\n"
-                    "4:\n"
-                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "mov z24.s, #0\n"
+                    "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
-                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
-                    "mov z25.s, #0\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                     "udot z24.s, z16.b, z0.b[0]\n"
-                    "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
+                    "st1w z30.s, p7, [c_ptr6]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "udot z25.s, z16.b, z1.b[0]\n"
-                    "st1w z27.s, p7, [c_ptr3]\n"
-                    "mov z27.s, #0\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "udot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
-                    "st1w z28.s, p7, [c_ptr4]\n"
-                    "mov z28.s, #0\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "st1w z31.s, p7, [c_ptr7]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
                     "udot z27.s, z16.b, z3.b[0]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
-                    "udot z25.s, z17.b, z1.b[1]\n"
-                    "st1w z29.s, p7, [c_ptr5]\n"
-                    "mov z29.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "udot z28.s, z16.b, z4.b[0]\n"
-                    "st1w z30.s, p7, [c_ptr6]\n"
-                    "mov z30.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
-                    "st1w z31.s, p7, [c_ptr7]\n"
-                    "mov z31.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "udot z29.s, z17.b, z5.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "udot z30.s, z17.b, z6.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "udot z31.s, z17.b, z7.b[1]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "udot z24.s, z18.b, z0.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "udot z25.s, z18.b, z1.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "udot z26.s, z18.b, z2.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "udot z27.s, z18.b, z3.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "udot z28.s, z18.b, z4.b[2]\n"
                     "udot z29.s, z18.b, z5.b[2]\n"
                     "udot z30.s, z18.b, z6.b[2]\n"
@@ -5808,7 +6787,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z22.b, z5.b[2]\n"
                     "udot z30.s, z22.b, z6.b[2]\n"
                     "udot z31.s, z22.b, z7.b[2]\n"
-                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "udot z24.s, z23.b, z0.b[3]\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
                     "udot z25.s, z23.b, z1.b[3]\n"
@@ -5826,7 +6804,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z31.s, z23.b, z7.b[3]\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
                     "udot z24.s, z16.b, z0.b[0]\n"
-                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
                     "udot z25.s, z16.b, z1.b[0]\n"
                     "udot z26.s, z16.b, z2.b[0]\n"
                     "udot z27.s, z16.b, z3.b[0]\n"
@@ -5834,7 +6811,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z16.b, z5.b[0]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "udot z24.s, z17.b, z0.b[1]\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
@@ -5843,7 +6819,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z17.b, z5.b[1]\n"
                     "udot z30.s, z17.b, z6.b[1]\n"
                     "udot z31.s, z17.b, z7.b[1]\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "udot z24.s, z18.b, z0.b[2]\n"
                     "udot z25.s, z18.b, z1.b[2]\n"
                     "udot z26.s, z18.b, z2.b[2]\n"
@@ -5852,7 +6827,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z18.b, z5.b[2]\n"
                     "udot z30.s, z18.b, z6.b[2]\n"
                     "udot z31.s, z18.b, z7.b[2]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "udot z24.s, z19.b, z0.b[3]\n"
                     "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
                     "udot z25.s, z19.b, z1.b[3]\n"
@@ -5870,7 +6844,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z31.s, z19.b, z7.b[3]\n"
                     "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
                     "udot z24.s, z20.b, z0.b[0]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "udot z25.s, z20.b, z1.b[0]\n"
                     "udot z26.s, z20.b, z2.b[0]\n"
                     "udot z27.s, z20.b, z3.b[0]\n"
@@ -5878,7 +6851,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z20.b, z5.b[0]\n"
                     "udot z30.s, z20.b, z6.b[0]\n"
                     "udot z31.s, z20.b, z7.b[0]\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "udot z24.s, z21.b, z0.b[1]\n"
                     "udot z25.s, z21.b, z1.b[1]\n"
                     "udot z26.s, z21.b, z2.b[1]\n"
@@ -5887,53 +6859,60 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z21.b, z5.b[1]\n"
                     "udot z30.s, z21.b, z6.b[1]\n"
                     "udot z31.s, z21.b, z7.b[1]\n"
-                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "b.ne 4b\n"
                     "3:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "mov z24.s, #0\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
-                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "addvl c_ptr2, c_ptr2, #1\n"
-                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "mov z26.s, #0\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
                     "mov z27.s, #0\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "udot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
-                    "udot z27.s, z16.b, z3.b[0]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
-                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
-                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
                     "addvl c_ptr6, c_ptr6, #1\n"
-                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
@@ -6066,23 +7045,177 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z21.b, z5.b[1]\n"
                     "udot z30.s, z21.b, z6.b[1]\n"
                     "udot z31.s, z21.b, z7.b[1]\n"
+                    "b 5f\n"
                     "2:\n"
+                    "mov z24.s, #0\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "mov z25.s, #0\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "mov z26.s, #0\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "mov z29.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "mov z30.s, #0\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "udot z26.s, z17.b, z2.b[1]\n"
+                    "udot z27.s, z17.b, z3.b[1]\n"
+                    "udot z28.s, z17.b, z4.b[1]\n"
+                    "udot z29.s, z17.b, z5.b[1]\n"
+                    "udot z30.s, z17.b, z6.b[1]\n"
+                    "udot z31.s, z17.b, z7.b[1]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "udot z24.s, z18.b, z0.b[2]\n"
+                    "udot z25.s, z18.b, z1.b[2]\n"
+                    "udot z26.s, z18.b, z2.b[2]\n"
+                    "udot z27.s, z18.b, z3.b[2]\n"
+                    "udot z28.s, z18.b, z4.b[2]\n"
+                    "udot z29.s, z18.b, z5.b[2]\n"
+                    "udot z30.s, z18.b, z6.b[2]\n"
+                    "udot z31.s, z18.b, z7.b[2]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "udot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+                    "udot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+                    "udot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+                    "udot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+                    "udot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+                    "udot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+                    "udot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+                    "udot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+                    "udot z24.s, z20.b, z0.b[0]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "udot z25.s, z20.b, z1.b[0]\n"
+                    "udot z26.s, z20.b, z2.b[0]\n"
+                    "udot z27.s, z20.b, z3.b[0]\n"
+                    "udot z28.s, z20.b, z4.b[0]\n"
+                    "udot z29.s, z20.b, z5.b[0]\n"
+                    "udot z30.s, z20.b, z6.b[0]\n"
+                    "udot z31.s, z20.b, z7.b[0]\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "udot z24.s, z21.b, z0.b[1]\n"
+                    "udot z25.s, z21.b, z1.b[1]\n"
+                    "udot z26.s, z21.b, z2.b[1]\n"
+                    "udot z27.s, z21.b, z3.b[1]\n"
+                    "udot z28.s, z21.b, z4.b[1]\n"
+                    "udot z29.s, z21.b, z5.b[1]\n"
+                    "udot z30.s, z21.b, z6.b[1]\n"
+                    "udot z31.s, z21.b, z7.b[1]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "udot z24.s, z22.b, z0.b[2]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #6\n"
+                    "udot z25.s, z22.b, z1.b[2]\n"
+                    "udot z26.s, z22.b, z2.b[2]\n"
+                    "udot z27.s, z22.b, z3.b[2]\n"
+                    "udot z28.s, z22.b, z4.b[2]\n"
+                    "udot z29.s, z22.b, z5.b[2]\n"
+                    "udot z30.s, z22.b, z6.b[2]\n"
+                    "udot z31.s, z22.b, z7.b[2]\n"
+                    "udot z24.s, z23.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
+                    "udot z25.s, z23.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
+                    "udot z26.s, z23.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
+                    "udot z27.s, z23.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
+                    "udot z28.s, z23.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
+                    "udot z29.s, z23.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
+                    "udot z30.s, z23.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
+                    "udot z31.s, z23.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z31.s, z16.b, z7.b[0]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "udot z26.s, z17.b, z2.b[1]\n"
+                    "udot z27.s, z17.b, z3.b[1]\n"
+                    "udot z28.s, z17.b, z4.b[1]\n"
+                    "udot z29.s, z17.b, z5.b[1]\n"
+                    "udot z30.s, z17.b, z6.b[1]\n"
+                    "udot z31.s, z17.b, z7.b[1]\n"
+                    "udot z24.s, z18.b, z0.b[2]\n"
+                    "udot z25.s, z18.b, z1.b[2]\n"
+                    "udot z26.s, z18.b, z2.b[2]\n"
+                    "udot z27.s, z18.b, z3.b[2]\n"
+                    "udot z28.s, z18.b, z4.b[2]\n"
+                    "udot z29.s, z18.b, z5.b[2]\n"
+                    "udot z30.s, z18.b, z6.b[2]\n"
+                    "udot z31.s, z18.b, z7.b[2]\n"
+                    "udot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
+                    "udot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
+                    "udot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
+                    "udot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
+                    "udot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
+                    "udot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
+                    "udot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
+                    "udot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
+                    "udot z24.s, z20.b, z0.b[0]\n"
+                    "udot z25.s, z20.b, z1.b[0]\n"
+                    "udot z26.s, z20.b, z2.b[0]\n"
+                    "udot z27.s, z20.b, z3.b[0]\n"
+                    "udot z28.s, z20.b, z4.b[0]\n"
+                    "udot z29.s, z20.b, z5.b[0]\n"
+                    "udot z30.s, z20.b, z6.b[0]\n"
+                    "udot z31.s, z20.b, z7.b[0]\n"
+                    "udot z24.s, z21.b, z0.b[1]\n"
+                    "udot z25.s, z21.b, z1.b[1]\n"
+                    "udot z26.s, z21.b, z2.b[1]\n"
+                    "udot z27.s, z21.b, z3.b[1]\n"
+                    "udot z28.s, z21.b, z4.b[1]\n"
+                    "udot z29.s, z21.b, z5.b[1]\n"
+                    "udot z30.s, z21.b, z6.b[1]\n"
+                    "udot z31.s, z21.b, z7.b[1]\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -6161,52 +7294,54 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "mov z28.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "mov z29.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "mov z30.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "mov z31.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "mov z25.s, #0\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
-                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "mov z26.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
-                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "mov z31.s, #0\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
                     "udot z28.s, z16.b, z4.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
                     "udot z29.s, z17.b, z5.b[1]\n"
@@ -6347,82 +7482,80 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z22.b, z5.b[2]\n"
                     "udot z30.s, z22.b, z6.b[2]\n"
                     "udot z31.s, z22.b, z7.b[2]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "b.eq 3f\n"
+                    "4:\n"
+                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "addvl c_ptr1, c_ptr1, #1\n"
+                    "mov z25.s, #0\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
+                    "mov z26.s, #0\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "mov z27.s, #0\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "b.eq 3f\n"
-                    "4:\n"
-                    "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "mov z24.s, #0\n"
+                    "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
-                    "addvl %[c_ptr0], %[c_ptr0], #1\n"
-                    "mov z25.s, #0\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                     "udot z24.s, z16.b, z0.b[0]\n"
-                    "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
+                    "st1w z30.s, p7, [c_ptr6]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "udot z25.s, z16.b, z1.b[0]\n"
-                    "st1w z27.s, p7, [c_ptr3]\n"
-                    "mov z27.s, #0\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "udot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
-                    "st1w z28.s, p7, [c_ptr4]\n"
-                    "mov z28.s, #0\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "st1w z31.s, p7, [c_ptr7]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
                     "udot z27.s, z16.b, z3.b[0]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
-                    "udot z25.s, z17.b, z1.b[1]\n"
-                    "st1w z29.s, p7, [c_ptr5]\n"
-                    "mov z29.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "udot z28.s, z16.b, z4.b[0]\n"
-                    "st1w z30.s, p7, [c_ptr6]\n"
-                    "mov z30.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
-                    "st1w z31.s, p7, [c_ptr7]\n"
-                    "mov z31.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "udot z29.s, z17.b, z5.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "udot z30.s, z17.b, z6.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "udot z31.s, z17.b, z7.b[1]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "udot z24.s, z18.b, z0.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "udot z25.s, z18.b, z1.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "udot z26.s, z18.b, z2.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "udot z27.s, z18.b, z3.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "udot z28.s, z18.b, z4.b[2]\n"
                     "udot z29.s, z18.b, z5.b[2]\n"
                     "udot z30.s, z18.b, z6.b[2]\n"
@@ -6493,12 +7626,10 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z25.s, z16.b, z1.b[0]\n"
                     "udot z26.s, z16.b, z2.b[0]\n"
                     "udot z27.s, z16.b, z3.b[0]\n"
-                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
                     "udot z28.s, z16.b, z4.b[0]\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "udot z24.s, z17.b, z0.b[1]\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
@@ -6507,7 +7638,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z17.b, z5.b[1]\n"
                     "udot z30.s, z17.b, z6.b[1]\n"
                     "udot z31.s, z17.b, z7.b[1]\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "udot z24.s, z18.b, z0.b[2]\n"
                     "udot z25.s, z18.b, z1.b[2]\n"
                     "udot z26.s, z18.b, z2.b[2]\n"
@@ -6516,7 +7646,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z18.b, z5.b[2]\n"
                     "udot z30.s, z18.b, z6.b[2]\n"
                     "udot z31.s, z18.b, z7.b[2]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "udot z24.s, z19.b, z0.b[3]\n"
                     "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
                     "udot z25.s, z19.b, z1.b[3]\n"
@@ -6534,7 +7663,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z31.s, z19.b, z7.b[3]\n"
                     "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
                     "udot z24.s, z20.b, z0.b[0]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "udot z25.s, z20.b, z1.b[0]\n"
                     "udot z26.s, z20.b, z2.b[0]\n"
                     "udot z27.s, z20.b, z3.b[0]\n"
@@ -6542,7 +7670,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z20.b, z5.b[0]\n"
                     "udot z30.s, z20.b, z6.b[0]\n"
                     "udot z31.s, z20.b, z7.b[0]\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "udot z24.s, z21.b, z0.b[1]\n"
                     "udot z25.s, z21.b, z1.b[1]\n"
                     "udot z26.s, z21.b, z2.b[1]\n"
@@ -6551,7 +7678,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z21.b, z5.b[1]\n"
                     "udot z30.s, z21.b, z6.b[1]\n"
                     "udot z31.s, z21.b, z7.b[1]\n"
-                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "udot z24.s, z22.b, z0.b[2]\n"
                     "udot z25.s, z22.b, z1.b[2]\n"
                     "udot z26.s, z22.b, z2.b[2]\n"
@@ -6560,53 +7686,60 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z22.b, z5.b[2]\n"
                     "udot z30.s, z22.b, z6.b[2]\n"
                     "udot z31.s, z22.b, z7.b[2]\n"
-                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "b.ne 4b\n"
                     "3:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "mov z24.s, #0\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "mov z24.s, #0\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
-                    "mov z26.s, #0\n"
-                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "addvl c_ptr2, c_ptr2, #1\n"
-                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "mov z26.s, #0\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
                     "mov z27.s, #0\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "udot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
-                    "udot z27.s, z16.b, z3.b[0]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                     "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
-                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
-                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
                     "addvl c_ptr6, c_ptr6, #1\n"
-                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
                     "addvl c_ptr7, c_ptr7, #1\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
@@ -6748,23 +7881,186 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z22.b, z5.b[2]\n"
                     "udot z30.s, z22.b, z6.b[2]\n"
                     "udot z31.s, z22.b, z7.b[2]\n"
+                    "b 5f\n"
                     "2:\n"
+                    "mov z24.s, #0\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "mov z25.s, #0\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "mov z26.s, #0\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "mov z29.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "mov z30.s, #0\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "mov z31.s, #0\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "udot z26.s, z17.b, z2.b[1]\n"
+                    "udot z27.s, z17.b, z3.b[1]\n"
+                    "udot z28.s, z17.b, z4.b[1]\n"
+                    "udot z29.s, z17.b, z5.b[1]\n"
+                    "udot z30.s, z17.b, z6.b[1]\n"
+                    "udot z31.s, z17.b, z7.b[1]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "udot z24.s, z18.b, z0.b[2]\n"
+                    "udot z25.s, z18.b, z1.b[2]\n"
+                    "udot z26.s, z18.b, z2.b[2]\n"
+                    "udot z27.s, z18.b, z3.b[2]\n"
+                    "udot z28.s, z18.b, z4.b[2]\n"
+                    "udot z29.s, z18.b, z5.b[2]\n"
+                    "udot z30.s, z18.b, z6.b[2]\n"
+                    "udot z31.s, z18.b, z7.b[2]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "udot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+                    "udot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+                    "udot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+                    "udot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+                    "udot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+                    "udot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+                    "udot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+                    "udot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+                    "udot z24.s, z20.b, z0.b[0]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "udot z25.s, z20.b, z1.b[0]\n"
+                    "udot z26.s, z20.b, z2.b[0]\n"
+                    "udot z27.s, z20.b, z3.b[0]\n"
+                    "udot z28.s, z20.b, z4.b[0]\n"
+                    "udot z29.s, z20.b, z5.b[0]\n"
+                    "udot z30.s, z20.b, z6.b[0]\n"
+                    "udot z31.s, z20.b, z7.b[0]\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "udot z24.s, z21.b, z0.b[1]\n"
+                    "udot z25.s, z21.b, z1.b[1]\n"
+                    "udot z26.s, z21.b, z2.b[1]\n"
+                    "udot z27.s, z21.b, z3.b[1]\n"
+                    "udot z28.s, z21.b, z4.b[1]\n"
+                    "udot z29.s, z21.b, z5.b[1]\n"
+                    "udot z30.s, z21.b, z6.b[1]\n"
+                    "udot z31.s, z21.b, z7.b[1]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "udot z24.s, z22.b, z0.b[2]\n"
+                    "udot z25.s, z22.b, z1.b[2]\n"
+                    "udot z26.s, z22.b, z2.b[2]\n"
+                    "udot z27.s, z22.b, z3.b[2]\n"
+                    "udot z28.s, z22.b, z4.b[2]\n"
+                    "udot z29.s, z22.b, z5.b[2]\n"
+                    "udot z30.s, z22.b, z6.b[2]\n"
+                    "udot z31.s, z22.b, z7.b[2]\n"
+                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "udot z24.s, z23.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
+                    "udot z25.s, z23.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
+                    "udot z26.s, z23.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
+                    "udot z27.s, z23.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
+                    "udot z28.s, z23.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
+                    "udot z29.s, z23.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
+                    "udot z30.s, z23.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
+                    "udot z31.s, z23.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #7\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z31.s, z16.b, z7.b[0]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "udot z26.s, z17.b, z2.b[1]\n"
+                    "udot z27.s, z17.b, z3.b[1]\n"
+                    "udot z28.s, z17.b, z4.b[1]\n"
+                    "udot z29.s, z17.b, z5.b[1]\n"
+                    "udot z30.s, z17.b, z6.b[1]\n"
+                    "udot z31.s, z17.b, z7.b[1]\n"
+                    "udot z24.s, z18.b, z0.b[2]\n"
+                    "udot z25.s, z18.b, z1.b[2]\n"
+                    "udot z26.s, z18.b, z2.b[2]\n"
+                    "udot z27.s, z18.b, z3.b[2]\n"
+                    "udot z28.s, z18.b, z4.b[2]\n"
+                    "udot z29.s, z18.b, z5.b[2]\n"
+                    "udot z30.s, z18.b, z6.b[2]\n"
+                    "udot z31.s, z18.b, z7.b[2]\n"
+                    "udot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
+                    "udot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
+                    "udot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
+                    "udot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
+                    "udot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
+                    "udot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
+                    "udot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
+                    "udot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
+                    "udot z24.s, z20.b, z0.b[0]\n"
+                    "udot z25.s, z20.b, z1.b[0]\n"
+                    "udot z26.s, z20.b, z2.b[0]\n"
+                    "udot z27.s, z20.b, z3.b[0]\n"
+                    "udot z28.s, z20.b, z4.b[0]\n"
+                    "udot z29.s, z20.b, z5.b[0]\n"
+                    "udot z30.s, z20.b, z6.b[0]\n"
+                    "udot z31.s, z20.b, z7.b[0]\n"
+                    "udot z24.s, z21.b, z0.b[1]\n"
+                    "udot z25.s, z21.b, z1.b[1]\n"
+                    "udot z26.s, z21.b, z2.b[1]\n"
+                    "udot z27.s, z21.b, z3.b[1]\n"
+                    "udot z28.s, z21.b, z4.b[1]\n"
+                    "udot z29.s, z21.b, z5.b[1]\n"
+                    "udot z30.s, z21.b, z6.b[1]\n"
+                    "udot z31.s, z21.b, z7.b[1]\n"
+                    "udot z24.s, z22.b, z0.b[2]\n"
+                    "udot z25.s, z22.b, z1.b[2]\n"
+                    "udot z26.s, z22.b, z2.b[2]\n"
+                    "udot z27.s, z22.b, z3.b[2]\n"
+                    "udot z28.s, z22.b, z4.b[2]\n"
+                    "udot z29.s, z22.b, z5.b[2]\n"
+                    "udot z30.s, z22.b, z6.b[2]\n"
+                    "udot z31.s, z22.b, z7.b[2]\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
@@ -6844,52 +8140,54 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "add c_ptr1, %[c_ptr0], #0x0\n"
                     "add a_ptr1, %[a_ptr0], #0x0\n"
                     "1:\n"
-                    "mov z24.s, #0\n"
                     "ptrue p7.b\n"
-                    "mov z25.s, #0\n"
                     "whilelt p6.b, %[temp], %[odd_depth]\n"
-                    "mov z26.s, #0\n"
+                    "whilelt p0.s, %[temp], %[last_width]\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "mov z27.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "mov z28.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                     "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "mov z29.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                     "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "mov z30.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "mov z31.s, #0\n"
+                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                    "whilelt p0.s, %[temp], %[last_width]\n"
                     "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                    "cbz %[loops], 2f\n"
+                    "mov z24.s, #0\n"
                     "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+                    "mov z25.s, #0\n"
                     "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
-                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "mov z26.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
-                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "mov z28.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "mov z31.s, #0\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
                     "udot z28.s, z16.b, z4.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
-                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
                     "udot z29.s, z17.b, z5.b[1]\n"
@@ -7039,83 +8337,81 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z23.b, z5.b[3]\n"
                     "udot z30.s, z23.b, z6.b[3]\n"
                     "udot z31.s, z23.b, z7.b[3]\n"
-                    "cbz %[loops], 2f\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
-                    "subs %[loops], %[loops], #0x1\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "b.eq 3f\n"
                     "4:\n"
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "subs %[loops], %[loops], #0x1\n"
                     "mov z24.s, #0\n"
-                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
+                    "st1w z25.s, p7, [c_ptr1]\n"
+                    "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
                     "mov z26.s, #0\n"
-                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
-                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
                     "mov z27.s, #0\n"
-                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
-                    "udot z26.s, z16.b, z2.b[0]\n"
-                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
                     "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
                     "mov z28.s, #0\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
-                    "udot z27.s, z16.b, z3.b[0]\n"
-                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
-                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
                     "mov z29.s, #0\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
-                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
                     "st1w z30.s, p7, [c_ptr6]\n"
                     "mov z30.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
-                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
                     "st1w z31.s, p7, [c_ptr7]\n"
                     "mov z31.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "udot z29.s, z17.b, z5.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                     "udot z30.s, z17.b, z6.b[1]\n"
-                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                     "udot z31.s, z17.b, z7.b[1]\n"
                     "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "udot z24.s, z18.b, z0.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                     "udot z25.s, z18.b, z1.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                     "udot z26.s, z18.b, z2.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                     "udot z27.s, z18.b, z3.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                     "udot z28.s, z18.b, z4.b[2]\n"
-                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                     "udot z29.s, z18.b, z5.b[2]\n"
                     "udot z30.s, z18.b, z6.b[2]\n"
                     "udot z31.s, z18.b, z7.b[2]\n"
@@ -7190,7 +8486,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z16.b, z5.b[0]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
-                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "udot z24.s, z17.b, z0.b[1]\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
@@ -7199,7 +8494,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z17.b, z5.b[1]\n"
                     "udot z30.s, z17.b, z6.b[1]\n"
                     "udot z31.s, z17.b, z7.b[1]\n"
-                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "udot z24.s, z18.b, z0.b[2]\n"
                     "udot z25.s, z18.b, z1.b[2]\n"
                     "udot z26.s, z18.b, z2.b[2]\n"
@@ -7208,7 +8502,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z18.b, z5.b[2]\n"
                     "udot z30.s, z18.b, z6.b[2]\n"
                     "udot z31.s, z18.b, z7.b[2]\n"
-                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
                     "udot z24.s, z19.b, z0.b[3]\n"
                     "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
                     "udot z25.s, z19.b, z1.b[3]\n"
@@ -7226,7 +8519,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z31.s, z19.b, z7.b[3]\n"
                     "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
                     "udot z24.s, z20.b, z0.b[0]\n"
-                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "udot z25.s, z20.b, z1.b[0]\n"
                     "udot z26.s, z20.b, z2.b[0]\n"
                     "udot z27.s, z20.b, z3.b[0]\n"
@@ -7234,7 +8526,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z20.b, z5.b[0]\n"
                     "udot z30.s, z20.b, z6.b[0]\n"
                     "udot z31.s, z20.b, z7.b[0]\n"
-                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
                     "udot z24.s, z21.b, z0.b[1]\n"
                     "udot z25.s, z21.b, z1.b[1]\n"
                     "udot z26.s, z21.b, z2.b[1]\n"
@@ -7243,7 +8534,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z21.b, z5.b[1]\n"
                     "udot z30.s, z21.b, z6.b[1]\n"
                     "udot z31.s, z21.b, z7.b[1]\n"
-                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
                     "udot z24.s, z22.b, z0.b[2]\n"
                     "udot z25.s, z22.b, z1.b[2]\n"
                     "udot z26.s, z22.b, z2.b[2]\n"
@@ -7252,7 +8542,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z22.b, z5.b[2]\n"
                     "udot z30.s, z22.b, z6.b[2]\n"
                     "udot z31.s, z22.b, z7.b[2]\n"
-                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
                     "udot z24.s, z23.b, z0.b[3]\n"
                     "udot z25.s, z23.b, z1.b[3]\n"
                     "udot z26.s, z23.b, z2.b[3]\n"
@@ -7266,49 +8555,235 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "st1w z24.s, p7, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "mov z24.s, #0\n"
-                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
                     "st1w z25.s, p7, [c_ptr1]\n"
                     "addvl c_ptr1, c_ptr1, #1\n"
                     "mov z25.s, #0\n"
-                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
                     "st1w z26.s, p7, [c_ptr2]\n"
+                    "addvl c_ptr2, c_ptr2, #1\n"
                     "mov z26.s, #0\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "st1w z27.s, p7, [c_ptr3]\n"
+                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "mov z27.s, #0\n"
+                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                    "st1w z28.s, p7, [c_ptr4]\n"
+                    "addvl c_ptr4, c_ptr4, #1\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "st1w z29.s, p7, [c_ptr5]\n"
+                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "mov z29.s, #0\n"
                     "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "st1w z30.s, p7, [c_ptr6]\n"
+                    "mov z30.s, #0\n"
                     "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "udot z25.s, z16.b, z1.b[0]\n"
-                    "st1w z27.s, p7, [c_ptr3]\n"
-                    "mov z27.s, #0\n"
                     "ld1rqb z4.b, p7/z, [a_ptr4]\n"
                     "udot z26.s, z16.b, z2.b[0]\n"
+                    "st1w z31.s, p7, [c_ptr7]\n"
+                    "mov z31.s, #0\n"
                     "ld1rqb z5.b, p7/z, [a_ptr5]\n"
-                    "udot z24.s, z17.b, z0.b[1]\n"
-                    "st1w z28.s, p7, [c_ptr4]\n"
-                    "mov z28.s, #0\n"
-                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "udot z27.s, z16.b, z3.b[0]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
                     "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "addvl c_ptr6, c_ptr6, #1\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
+                    "addvl c_ptr7, c_ptr7, #1\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z31.s, z16.b, z7.b[0]\n"
+                    "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
                     "udot z25.s, z17.b, z1.b[1]\n"
-                    "st1w z29.s, p7, [c_ptr5]\n"
-                    "mov z29.s, #0\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
+                    "udot z26.s, z17.b, z2.b[1]\n"
+                    "udot z27.s, z17.b, z3.b[1]\n"
+                    "udot z28.s, z17.b, z4.b[1]\n"
+                    "udot z29.s, z17.b, z5.b[1]\n"
+                    "udot z30.s, z17.b, z6.b[1]\n"
+                    "udot z31.s, z17.b, z7.b[1]\n"
+                    "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                    "udot z24.s, z18.b, z0.b[2]\n"
+                    "udot z25.s, z18.b, z1.b[2]\n"
+                    "udot z26.s, z18.b, z2.b[2]\n"
+                    "udot z27.s, z18.b, z3.b[2]\n"
+                    "udot z28.s, z18.b, z4.b[2]\n"
+                    "udot z29.s, z18.b, z5.b[2]\n"
+                    "udot z30.s, z18.b, z6.b[2]\n"
+                    "udot z31.s, z18.b, z7.b[2]\n"
+                    "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                    "udot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+                    "udot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+                    "udot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+                    "udot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+                    "udot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+                    "udot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+                    "udot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+                    "udot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+                    "udot z24.s, z20.b, z0.b[0]\n"
+                    "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                    "udot z25.s, z20.b, z1.b[0]\n"
+                    "udot z26.s, z20.b, z2.b[0]\n"
+                    "udot z27.s, z20.b, z3.b[0]\n"
+                    "udot z28.s, z20.b, z4.b[0]\n"
+                    "udot z29.s, z20.b, z5.b[0]\n"
+                    "udot z30.s, z20.b, z6.b[0]\n"
+                    "udot z31.s, z20.b, z7.b[0]\n"
+                    "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                    "udot z24.s, z21.b, z0.b[1]\n"
+                    "udot z25.s, z21.b, z1.b[1]\n"
+                    "udot z26.s, z21.b, z2.b[1]\n"
+                    "udot z27.s, z21.b, z3.b[1]\n"
+                    "udot z28.s, z21.b, z4.b[1]\n"
+                    "udot z29.s, z21.b, z5.b[1]\n"
+                    "udot z30.s, z21.b, z6.b[1]\n"
+                    "udot z31.s, z21.b, z7.b[1]\n"
+                    "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                    "udot z24.s, z22.b, z0.b[2]\n"
+                    "udot z25.s, z22.b, z1.b[2]\n"
+                    "udot z26.s, z22.b, z2.b[2]\n"
+                    "udot z27.s, z22.b, z3.b[2]\n"
+                    "udot z28.s, z22.b, z4.b[2]\n"
+                    "udot z29.s, z22.b, z5.b[2]\n"
+                    "udot z30.s, z22.b, z6.b[2]\n"
+                    "udot z31.s, z22.b, z7.b[2]\n"
+                    "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                    "udot z24.s, z23.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
+                    "udot z25.s, z23.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
+                    "udot z26.s, z23.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
+                    "udot z27.s, z23.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
+                    "udot z28.s, z23.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
+                    "udot z29.s, z23.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
+                    "udot z30.s, z23.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
+                    "udot z31.s, z23.b, z7.b[3]\n"
+                    "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
                     "udot z28.s, z16.b, z4.b[0]\n"
-                    "st1w z30.s, p7, [c_ptr6]\n"
-                    "mov z30.s, #0\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "udot z29.s, z16.b, z5.b[0]\n"
-                    "st1w z31.s, p7, [c_ptr7]\n"
+                    "udot z30.s, z16.b, z6.b[0]\n"
+                    "udot z31.s, z16.b, z7.b[0]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
+                    "udot z26.s, z17.b, z2.b[1]\n"
+                    "udot z27.s, z17.b, z3.b[1]\n"
+                    "udot z28.s, z17.b, z4.b[1]\n"
+                    "udot z29.s, z17.b, z5.b[1]\n"
+                    "udot z30.s, z17.b, z6.b[1]\n"
+                    "udot z31.s, z17.b, z7.b[1]\n"
+                    "udot z24.s, z18.b, z0.b[2]\n"
+                    "udot z25.s, z18.b, z1.b[2]\n"
+                    "udot z26.s, z18.b, z2.b[2]\n"
+                    "udot z27.s, z18.b, z3.b[2]\n"
+                    "udot z28.s, z18.b, z4.b[2]\n"
+                    "udot z29.s, z18.b, z5.b[2]\n"
+                    "udot z30.s, z18.b, z6.b[2]\n"
+                    "udot z31.s, z18.b, z7.b[2]\n"
+                    "udot z24.s, z19.b, z0.b[3]\n"
+                    "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
+                    "udot z25.s, z19.b, z1.b[3]\n"
+                    "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
+                    "udot z26.s, z19.b, z2.b[3]\n"
+                    "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
+                    "udot z27.s, z19.b, z3.b[3]\n"
+                    "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
+                    "udot z28.s, z19.b, z4.b[3]\n"
+                    "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
+                    "udot z29.s, z19.b, z5.b[3]\n"
+                    "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
+                    "udot z30.s, z19.b, z6.b[3]\n"
+                    "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
+                    "udot z31.s, z19.b, z7.b[3]\n"
+                    "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
+                    "udot z24.s, z20.b, z0.b[0]\n"
+                    "udot z25.s, z20.b, z1.b[0]\n"
+                    "udot z26.s, z20.b, z2.b[0]\n"
+                    "udot z27.s, z20.b, z3.b[0]\n"
+                    "udot z28.s, z20.b, z4.b[0]\n"
+                    "udot z29.s, z20.b, z5.b[0]\n"
+                    "udot z30.s, z20.b, z6.b[0]\n"
+                    "udot z31.s, z20.b, z7.b[0]\n"
+                    "udot z24.s, z21.b, z0.b[1]\n"
+                    "udot z25.s, z21.b, z1.b[1]\n"
+                    "udot z26.s, z21.b, z2.b[1]\n"
+                    "udot z27.s, z21.b, z3.b[1]\n"
+                    "udot z28.s, z21.b, z4.b[1]\n"
+                    "udot z29.s, z21.b, z5.b[1]\n"
+                    "udot z30.s, z21.b, z6.b[1]\n"
+                    "udot z31.s, z21.b, z7.b[1]\n"
+                    "udot z24.s, z22.b, z0.b[2]\n"
+                    "udot z25.s, z22.b, z1.b[2]\n"
+                    "udot z26.s, z22.b, z2.b[2]\n"
+                    "udot z27.s, z22.b, z3.b[2]\n"
+                    "udot z28.s, z22.b, z4.b[2]\n"
+                    "udot z29.s, z22.b, z5.b[2]\n"
+                    "udot z30.s, z22.b, z6.b[2]\n"
+                    "udot z31.s, z22.b, z7.b[2]\n"
+                    "udot z24.s, z23.b, z0.b[3]\n"
+                    "udot z25.s, z23.b, z1.b[3]\n"
+                    "udot z26.s, z23.b, z2.b[3]\n"
+                    "udot z27.s, z23.b, z3.b[3]\n"
+                    "udot z28.s, z23.b, z4.b[3]\n"
+                    "udot z29.s, z23.b, z5.b[3]\n"
+                    "udot z30.s, z23.b, z6.b[3]\n"
+                    "udot z31.s, z23.b, z7.b[3]\n"
+                    "b 5f\n"
+                    "2:\n"
+                    "mov z24.s, #0\n"
+                    "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                    "mov z25.s, #0\n"
+                    "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                    "mov z26.s, #0\n"
+                    "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                    "mov z27.s, #0\n"
+                    "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                    "mov z28.s, #0\n"
+                    "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+                    "mov z29.s, #0\n"
+                    "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+                    "mov z30.s, #0\n"
+                    "ld1rqb z6.b, p7/z, [a_ptr6]\n"
                     "mov z31.s, #0\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
+                    "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+                    "udot z24.s, z16.b, z0.b[0]\n"
+                    "udot z25.s, z16.b, z1.b[0]\n"
+                    "udot z26.s, z16.b, z2.b[0]\n"
+                    "udot z27.s, z16.b, z3.b[0]\n"
+                    "udot z28.s, z16.b, z4.b[0]\n"
+                    "udot z29.s, z16.b, z5.b[0]\n"
                     "udot z30.s, z16.b, z6.b[0]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "udot z31.s, z16.b, z7.b[0]\n"
                     "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+                    "udot z24.s, z17.b, z0.b[1]\n"
+                    "udot z25.s, z17.b, z1.b[1]\n"
                     "udot z26.s, z17.b, z2.b[1]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     "udot z27.s, z17.b, z3.b[1]\n"
                     "udot z28.s, z17.b, z4.b[1]\n"
                     "udot z29.s, z17.b, z5.b[1]\n"
@@ -7458,23 +8933,16 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
                     "udot z29.s, z23.b, z5.b[3]\n"
                     "udot z30.s, z23.b, z6.b[3]\n"
                     "udot z31.s, z23.b, z7.b[3]\n"
-                    "2:\n"
+                    "5:\n"
                     "st1w z24.s, p0, [%[c_ptr0]]\n"
                     "addvl %[c_ptr0], %[c_ptr0], #1\n"
                     "st1w z25.s, p0, [c_ptr1]\n"
-                    "addvl c_ptr1, c_ptr1, #1\n"
                     "st1w z26.s, p0, [c_ptr2]\n"
-                    "addvl c_ptr2, c_ptr2, #1\n"
                     "st1w z27.s, p0, [c_ptr3]\n"
-                    "addvl c_ptr3, c_ptr3, #1\n"
                     "st1w z28.s, p0, [c_ptr4]\n"
-                    "addvl c_ptr4, c_ptr4, #1\n"
                     "st1w z29.s, p0, [c_ptr5]\n"
-                    "addvl c_ptr5, c_ptr5, #1\n"
                     "st1w z30.s, p0, [c_ptr6]\n"
-                    "addvl c_ptr6, c_ptr6, #1\n"
                     "st1w z31.s, p0, [c_ptr7]\n"
-                    "addvl c_ptr7, c_ptr7, #1\n"
                     ".unreq a_ptr1\n"
                     ".unreq a_ptr2\n"
                     ".unreq a_ptr3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
index eec842d09f..fdb4f584d8 100644
--- a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
+++ b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
@@ -110,7 +110,7 @@ public:
     QuantizeWrapper operator=(const QuantizeWrapper &) = delete;
 
     QuantizeWrapper(const GemmArgs &args, const Requantize32 &qp) : _params(qp), _args(args), _barrier(args._maxthreads) {
-        GemmArgs newargs = GemmArgs(args._ci, args._Msize, args._Nsize, args._Ksize, args._nbatches, args._nmulti, Activation(), args._maxthreads, nullptr);
+        GemmArgs newargs = GemmArgs(args._ci, args._Msize, args._Nsize, args._Ksize, args._Ksections, args._nbatches, args._nmulti, args._indirect_input, Activation(), args._maxthreads);
         _subgemm = gemm<To, Tgemm>(newargs);
 
         if (_subgemm == nullptr) {
diff --git a/src/core/NEON/kernels/arm_gemm/quantized.cpp b/src/core/NEON/kernels/arm_gemm/quantized.cpp
index cac02cf28e..111d01ed3a 100644
--- a/src/core/NEON/kernels/arm_gemm/quantized.cpp
+++ b/src/core/NEON/kernels/arm_gemm/quantized.cpp
@@ -301,6 +301,179 @@ void requantize_block_32_int(const Requantize32 &qp, unsigned int width, unsigne
             out_ptr1 += 16;
         }
 
+        // We are often quantizing one block of interleaved kernel output at a time - these are three registers
+        // wide.  Special case that here.
+        if (regs==3) {
+            regs -= 3;
+
+            int32x4_t v_mul0;
+            int32x4_t v_mul1;
+            int32x4_t v_mul2;
+
+            int32x4_t v_shf0;
+            int32x4_t v_shf1;
+            int32x4_t v_shf2;
+
+            int32x4_t v_shf0l;
+            int32x4_t v_shf1l;
+            int32x4_t v_shf2l;
+
+            if (per_channel) {
+                v_mul0 = vld1q_s32(perch_mul_ptr);
+                v_mul1 = vld1q_s32(perch_mul_ptr + 4);
+                v_mul2 = vld1q_s32(perch_mul_ptr + 8);
+                perch_mul_ptr += 12;
+
+                v_shf0 = vld1q_s32(perch_shift_ptr);
+                v_shf1 = vld1q_s32(perch_shift_ptr + 4);
+                v_shf2 = vld1q_s32(perch_shift_ptr + 8);
+                perch_shift_ptr += 12;
+
+                if (do_left_shift) {
+                    v_shf0l = vld1q_s32(perch_shiftl_ptr);
+                    v_shf1l = vld1q_s32(perch_shiftl_ptr + 4);
+                    v_shf2l = vld1q_s32(perch_shiftl_ptr + 8);
+                    perch_shiftl_ptr += 12;
+                }
+            } else {
+                v_mul0=v_mul1=v_mul2=v_mul;
+                v_shf0=v_shf1=v_shf2=v_right_shift;
+                v_shf0l=v_shf1l=v_shf2l=v_left_shift;
+            }
+
+            // Load column bias values
+            int32x4_t v_col0 = vld1q_s32(colptr);
+            int32x4_t v_col1 = vld1q_s32(colptr + 4);
+            int32x4_t v_col2 = vld1q_s32(colptr + 8);
+            colptr += 12;
+
+            // Load input data (row 0)
+            int32x4_t v_in00 = vld1q_s32(in_ptr);
+            int32x4_t v_in01 = vld1q_s32(in_ptr + 4);
+            int32x4_t v_in02 = vld1q_s32(in_ptr + 8);
+            in_ptr += 12;
+
+            // Load input data (row 1)
+            int32x4_t v_in10 = vld1q_s32(in_ptr1);
+            int32x4_t v_in11 = vld1q_s32(in_ptr1 + 4);
+            int32x4_t v_in12 = vld1q_s32(in_ptr1 + 8);
+            in_ptr1 += 12;
+
+            // Add on row bias and column bias
+            v_in00 = vaddq_s32(v_in00, v_row_sum);
+            v_in01 = vaddq_s32(v_in01, v_row_sum);
+            v_in02 = vaddq_s32(v_in02, v_row_sum);
+
+            v_in10 = vaddq_s32(v_in10, v_row_sum1);
+            v_in11 = vaddq_s32(v_in11, v_row_sum1);
+            v_in12 = vaddq_s32(v_in12, v_row_sum1);
+
+            v_in00 = vaddq_s32(v_in00, v_col0);
+            v_in01 = vaddq_s32(v_in01, v_col1);
+            v_in02 = vaddq_s32(v_in02, v_col2);
+
+            v_in10 = vaddq_s32(v_in10, v_col0);
+            v_in11 = vaddq_s32(v_in11, v_col1);
+            v_in12 = vaddq_s32(v_in12, v_col2);
+
+            // Quantize
+
+            // If a left shift is needed it needs to happen first.
+            if (do_left_shift) {
+                v_in00 = vrshlq_s32(v_in00, v_shf0l);
+                v_in01 = vrshlq_s32(v_in01, v_shf1l);
+                v_in02 = vrshlq_s32(v_in02, v_shf2l);
+
+                v_in10 = vrshlq_s32(v_in10, v_shf0l);
+                v_in11 = vrshlq_s32(v_in11, v_shf1l);
+                v_in12 = vrshlq_s32(v_in12, v_shf2l);
+            }
+
+            // Multiply
+            v_in00 = vqrdmulhq_s32(v_in00, v_mul0);
+            v_in01 = vqrdmulhq_s32(v_in01, v_mul1);
+            v_in02 = vqrdmulhq_s32(v_in02, v_mul2);
+
+            v_in10 = vqrdmulhq_s32(v_in10, v_mul0);
+            v_in11 = vqrdmulhq_s32(v_in11, v_mul1);
+            v_in12 = vqrdmulhq_s32(v_in12, v_mul2);
+
+            // Compute and add on corrective offset
+            if (do_shift_correction) {
+                int32x4_t v_temp00 = vandq_s32(v_in00, v_shf0);
+                int32x4_t v_temp01 = vandq_s32(v_in01, v_shf1);
+                int32x4_t v_temp02 = vandq_s32(v_in02, v_shf2);
+
+                int32x4_t v_temp10 = vandq_s32(v_in10, v_shf0);
+                int32x4_t v_temp11 = vandq_s32(v_in11, v_shf1);
+                int32x4_t v_temp12 = vandq_s32(v_in12, v_shf2);
+
+                v_temp00 = vshrq_n_s32(v_temp00, 31);
+                v_temp01 = vshrq_n_s32(v_temp01, 31);
+                v_temp02 = vshrq_n_s32(v_temp02, 31);
+
+                v_temp10 = vshrq_n_s32(v_temp10, 31);
+                v_temp11 = vshrq_n_s32(v_temp11, 31);
+                v_temp12 = vshrq_n_s32(v_temp12, 31);
+
+                v_in00 = vqaddq_s32(v_in00, v_temp00);
+                v_in01 = vqaddq_s32(v_in01, v_temp01);
+                v_in02 = vqaddq_s32(v_in02, v_temp02);
+
+                v_in10 = vqaddq_s32(v_in10, v_temp10);
+                v_in11 = vqaddq_s32(v_in11, v_temp11);
+                v_in12 = vqaddq_s32(v_in12, v_temp12);
+            }
+
+            v_in00 = vrshlq_s32(v_in00, v_shf0);
+            v_in01 = vrshlq_s32(v_in01, v_shf1);
+            v_in02 = vrshlq_s32(v_in02, v_shf2);
+
+            v_in10 = vrshlq_s32(v_in10, v_shf0);
+            v_in11 = vrshlq_s32(v_in11, v_shf1);
+            v_in12 = vrshlq_s32(v_in12, v_shf2);
+
+            v_in00 = vaddq_s32(v_in00, v_c_offset);
+            v_in01 = vaddq_s32(v_in01, v_c_offset);
+            v_in02 = vaddq_s32(v_in02, v_c_offset);
+
+            v_in10 = vaddq_s32(v_in10, v_c_offset);
+            v_in11 = vaddq_s32(v_in11, v_c_offset);
+            v_in12 = vaddq_s32(v_in12, v_c_offset);
+
+            v_in00 = vmaxq_s32(v_in00, v_minval);
+            v_in01 = vmaxq_s32(v_in01, v_minval);
+            v_in02 = vmaxq_s32(v_in02, v_minval);
+
+            v_in10 = vmaxq_s32(v_in10, v_minval);
+            v_in11 = vmaxq_s32(v_in11, v_minval);
+            v_in12 = vmaxq_s32(v_in12, v_minval);
+
+            v_in00 = vminq_s32(v_in00, v_maxval);
+            v_in01 = vminq_s32(v_in01, v_maxval);
+            v_in02 = vminq_s32(v_in02, v_maxval);
+
+            v_in10 = vminq_s32(v_in10, v_maxval);
+            v_in11 = vminq_s32(v_in11, v_maxval);
+            v_in12 = vminq_s32(v_in12, v_maxval);
+
+            int16x8_t v_uz00 = vuzp1q_s16(vreinterpretq_s16_s32(v_in00), vreinterpretq_s16_s32(v_in01));
+            int16x8_t v_uz01 = vuzp1q_s16(vreinterpretq_s16_s32(v_in02), vreinterpretq_s16_s32(v_in02));
+
+            int16x8_t v_uz10 = vuzp1q_s16(vreinterpretq_s16_s32(v_in10), vreinterpretq_s16_s32(v_in11));
+            int16x8_t v_uz11 = vuzp1q_s16(vreinterpretq_s16_s32(v_in12), vreinterpretq_s16_s32(v_in12));
+
+            int8x16_t v_uz0 = vuzp1q_s8(vreinterpretq_s8_s16(v_uz00), vreinterpretq_s8_s16(v_uz01));
+            int8x16_t v_uz1 = vuzp1q_s8(vreinterpretq_s8_s16(v_uz10), vreinterpretq_s8_s16(v_uz11));
+
+            vst1q_lane_s64(reinterpret_cast<int64_t *>(out_ptr), vreinterpretq_s64_s8(v_uz0), 0);
+            vst1q_lane_s32(reinterpret_cast<int32_t *>(out_ptr + 8), vreinterpretq_s32_s8(v_uz0), 2);
+            out_ptr += 12;
+            vst1q_lane_s64(reinterpret_cast<int64_t *>(out_ptr1), vreinterpretq_s64_s8(v_uz1), 0);
+            vst1q_lane_s32(reinterpret_cast<int32_t *>(out_ptr1 + 8), vreinterpretq_s32_s8(v_uz1), 2);
+            out_ptr1 += 12;
+        }
+
         while (regs--) {
             int32x4_t v_mul0;
             int32x4_t v_shf0;
diff --git a/src/core/NEON/kernels/arm_gemm/quantized.hpp b/src/core/NEON/kernels/arm_gemm/quantized.hpp
index b0e0c3b580..3f3443025c 100644
--- a/src/core/NEON/kernels/arm_gemm/quantized.hpp
+++ b/src/core/NEON/kernels/arm_gemm/quantized.hpp
@@ -23,6 +23,8 @@
  */
 #pragma once
 
+#include "utils.hpp" // IndirectInputArg
+
 namespace arm_gemm {
 
 template<typename Tin, typename Tout>
@@ -39,4 +41,8 @@ void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int h
                       const T *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth,
                       unsigned int multi, unsigned int first_col);
 
+template<typename T>
+void row_sums_indirect(unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<T> A_arg,
+                       size_t M, int32_t *output_ptr, const Requantize32 *qp);
+
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp
new file mode 100644
index 0000000000..5433676558
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp
@@ -0,0 +1,1160 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "quantized.hpp"
+#include "utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+template<>
+void row_sums_indirect(
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+    size_t M, int32_t *out_ptr, const Requantize32 *qp
+)
+{
+    struct KernelArgs {
+        unsigned int num_strings;
+        const unsigned int *string_lengths;
+        unsigned int input_initial_col;
+    } ka;
+
+    unsigned long flags=0;
+    void *input_ptr;
+    size_t input_offset;
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        input_offset=A_arg.direct.stride;
+    }
+
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+
+    __asm__ __volatile__(
+      "add x19, %x[qp], %[b_offset]\n"
+      "ld1r { v2.4s }, [x19]\n"
+      "neg v2.4s, v2.4s\n"
+      "1:"  // Row loop
+      "cmp %x[M], #0x6\n"
+      "bge 86f\n"
+      "cmp %x[M], #0x4\n"
+      "bgt 69f\n"
+      "beq 52f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 35f\n"
+      "beq 18f\n"
+      "movi v1.8h, #0x0\n"
+      "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "movi v0.4s, #0x0\n"
+      "mov x9, #0x0\n"
+      "mov x28, #0x0\n"
+      "2:"  // Height 1: String loop
+      "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr w27, [x19, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 3f\n"
+      "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x19, x19, %x[input_offset], LSL #3\n"
+      "ldr x26, [x19, #0x0]\n"
+      "cbnz x28, 4f\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "b 4f\n"
+      "3:"  // Height 1: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "4:"  // Height 1: input setup done
+      "cmp x27, #0x10\n"
+      "blt 8f\n"
+      "cmp x27, #0x20\n"
+      "blt 7f\n"
+      "5:"  // Height 1: Multiply loop: Main loop head
+      "ldr q31, [x26, #0x0]\n"
+      "cmp x9, #0x7e\n"
+      "add x26, x26, #0x10\n"
+      "blt 6f\n"
+      "sadalp v0.4s, v1.8h\n"
+      "movi v1.8h, #0x0\n"
+      "mov x9, #0x0\n"
+      "6:"  // Height 1: Multiply loop: unique 1: no collapse
+      "sadalp v1.8h, v31.16b\n"
+      "add x9, x9, #0x1\n"
+      "sub x27, x27, #0x10\n"
+      "cmp x27, #0x20\n"
+      "bge 5b\n"
+      "7:"  // Height 1: Multiply loop: Single iteration only
+      "sub x27, x27, #0x10\n"
+      "ldr q31, [x26, #0x0]\n"
+      "add x26, x26, #0x10\n"
+      "sadalp v1.8h, v31.16b\n"
+      "8:"  // Height 1: Multiply loop: Main loop skip
+      "cbz x27, 17f\n"
+      "tbz x27, #3, 12f\n"
+      "ldr d31, [x26], #0x8\n"
+      "tbz x27, #2, 10f\n"
+      "ld1 { v31.s }[2], [x26], #0x4\n"
+      "tbz x27, #1, 9f\n"
+      "ld1 { v31.h }[6], [x26], #0x2\n"
+      "tbz x27, #0, 16f\n"
+      "ld1 { v31.b }[14], [x26]\n"
+      "b 16f\n"
+      "9:"  // Height 1: Multiply loop: Ragged operand read: partial_1_12
+      "tbz x27, #0, 16f\n"
+      "ld1 { v31.b }[12], [x26]\n"
+      "b 16f\n"
+      "10:"  // Height 1: Multiply loop: Ragged operand read: partial_2_8
+      "tbz x27, #1, 11f\n"
+      "ld1 { v31.h }[4], [x26], #0x2\n"
+      "tbz x27, #0, 16f\n"
+      "ld1 { v31.b }[10], [x26]\n"
+      "b 16f\n"
+      "11:"  // Height 1: Multiply loop: Ragged operand read: partial_1_8
+      "tbz x27, #0, 16f\n"
+      "ld1 { v31.b }[8], [x26]\n"
+      "b 16f\n"
+      "12:"  // Height 1: Multiply loop: Ragged operand read: partial_4_0
+      "tbz x27, #2, 14f\n"
+      "ldr s31, [x26], #0x4\n"
+      "tbz x27, #1, 13f\n"
+      "ld1 { v31.h }[2], [x26], #0x2\n"
+      "tbz x27, #0, 16f\n"
+      "ld1 { v31.b }[6], [x26]\n"
+      "b 16f\n"
+      "13:"  // Height 1: Multiply loop: Ragged operand read: partial_1_4
+      "tbz x27, #0, 16f\n"
+      "ld1 { v31.b }[4], [x26]\n"
+      "b 16f\n"
+      "14:"  // Height 1: Multiply loop: Ragged operand read: partial_2_0
+      "tbz x27, #1, 15f\n"
+      "ldr h31, [x26], #0x2\n"
+      "tbz x27, #0, 16f\n"
+      "ld1 { v31.b }[2], [x26]\n"
+      "b 16f\n"
+      "15:"  // Height 1: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b31, [x26, #0x0]\n"
+      "16:"  // Height 1: Multiply loop: Ragged operand read: Done
+      "sadalp v1.8h, v31.16b\n"
+      "17:"  // Height 1: Multiply loop: No odd multiplies
+      "add x28, x28, #0x1\n"
+      "cmp x28, x20\n"
+      "bne 2b\n"
+      "sadalp v0.4s, v1.8h\n"
+      "addp v0.4s, v0.4s, v0.4s\n"
+      "addp v0.4s, v0.4s, v0.4s\n"
+      "mul v0.4s, v0.4s, v2.4s\n"
+      "str s0, [%x[out_ptr]], #0x4\n"
+      "b 104f\n"
+      "18:"  // Height 2
+      "movi v1.8h, #0x0\n"
+      "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "mov x9, #0x0\n"
+      "movi v0.4s, #0x0\n"
+      "mov x28, #0x0\n"
+      "movi v30.8h, #0x0\n"
+      "movi v29.4s, #0x0\n"
+      "19:"  // Height 2: String loop
+      "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr w27, [x19, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 20f\n"
+      "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x19, x19, %x[input_offset], LSL #3\n"
+      "ldr x26, [x19, #0x0]\n"
+      "ldr x25, [x19, #0x8]\n"
+      "cbnz x28, 21f\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "add x25, x25, x19\n"
+      "b 21f\n"
+      "20:"  // Height 2: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "add x25, x26, %x[input_offset]\n"
+      "21:"  // Height 2: input setup done
+      "cmp x27, #0x10\n"
+      "blt 25f\n"
+      "cmp x27, #0x20\n"
+      "blt 24f\n"
+      "22:"  // Height 2: Multiply loop: Main loop head
+      "ldr q31, [x26, #0x0]\n"
+      "ldr q28, [x25, #0x0]\n"
+      "cmp x9, #0x7e\n"
+      "add x26, x26, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "blt 23f\n"
+      "sadalp v0.4s, v1.8h\n"
+      "movi v1.8h, #0x0\n"
+      "sadalp v29.4s, v30.8h\n"
+      "movi v30.8h, #0x0\n"
+      "mov x9, #0x0\n"
+      "23:"  // Height 2: Multiply loop: unique 2: no collapse
+      "sadalp v1.8h, v31.16b\n"
+      "sadalp v30.8h, v28.16b\n"
+      "add x9, x9, #0x1\n"
+      "sub x27, x27, #0x10\n"
+      "cmp x27, #0x20\n"
+      "bge 22b\n"
+      "24:"  // Height 2: Multiply loop: Single iteration only
+      "sub x27, x27, #0x10\n"
+      "ldr q31, [x26, #0x0]\n"
+      "ldr q28, [x25, #0x0]\n"
+      "add x26, x26, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "sadalp v1.8h, v31.16b\n"
+      "sadalp v30.8h, v28.16b\n"
+      "25:"  // Height 2: Multiply loop: Main loop skip
+      "cbz x27, 34f\n"
+      "tbz x27, #3, 29f\n"
+      "ldr d31, [x26], #0x8\n"
+      "ldr d28, [x25], #0x8\n"
+      "tbz x27, #2, 27f\n"
+      "ld1 { v31.s }[2], [x26], #0x4\n"
+      "ld1 { v28.s }[2], [x25], #0x4\n"
+      "tbz x27, #1, 26f\n"
+      "ld1 { v31.h }[6], [x26], #0x2\n"
+      "ld1 { v28.h }[6], [x25], #0x2\n"
+      "tbz x27, #0, 33f\n"
+      "ld1 { v31.b }[14], [x26]\n"
+      "ld1 { v28.b }[14], [x25]\n"
+      "b 33f\n"
+      "26:"  // Height 2: Multiply loop: Ragged operand read: partial_1_12
+      "tbz x27, #0, 33f\n"
+      "ld1 { v31.b }[12], [x26]\n"
+      "ld1 { v28.b }[12], [x25]\n"
+      "b 33f\n"
+      "27:"  // Height 2: Multiply loop: Ragged operand read: partial_2_8
+      "tbz x27, #1, 28f\n"
+      "ld1 { v31.h }[4], [x26], #0x2\n"
+      "ld1 { v28.h }[4], [x25], #0x2\n"
+      "tbz x27, #0, 33f\n"
+      "ld1 { v31.b }[10], [x26]\n"
+      "ld1 { v28.b }[10], [x25]\n"
+      "b 33f\n"
+      "28:"  // Height 2: Multiply loop: Ragged operand read: partial_1_8
+      "tbz x27, #0, 33f\n"
+      "ld1 { v31.b }[8], [x26]\n"
+      "ld1 { v28.b }[8], [x25]\n"
+      "b 33f\n"
+      "29:"  // Height 2: Multiply loop: Ragged operand read: partial_4_0
+      "tbz x27, #2, 31f\n"
+      "ldr s31, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
+      "tbz x27, #1, 30f\n"
+      "ld1 { v31.h }[2], [x26], #0x2\n"
+      "ld1 { v28.h }[2], [x25], #0x2\n"
+      "tbz x27, #0, 33f\n"
+      "ld1 { v31.b }[6], [x26]\n"
+      "ld1 { v28.b }[6], [x25]\n"
+      "b 33f\n"
+      "30:"  // Height 2: Multiply loop: Ragged operand read: partial_1_4
+      "tbz x27, #0, 33f\n"
+      "ld1 { v31.b }[4], [x26]\n"
+      "ld1 { v28.b }[4], [x25]\n"
+      "b 33f\n"
+      "31:"  // Height 2: Multiply loop: Ragged operand read: partial_2_0
+      "tbz x27, #1, 32f\n"
+      "ldr h31, [x26], #0x2\n"
+      "ldr h28, [x25], #0x2\n"
+      "tbz x27, #0, 33f\n"
+      "ld1 { v31.b }[2], [x26]\n"
+      "ld1 { v28.b }[2], [x25]\n"
+      "b 33f\n"
+      "32:"  // Height 2: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b31, [x26, #0x0]\n"
+      "ldr b28, [x25, #0x0]\n"
+      "33:"  // Height 2: Multiply loop: Ragged operand read: Done
+      "sadalp v1.8h, v31.16b\n"
+      "sadalp v30.8h, v28.16b\n"
+      "34:"  // Height 2: Multiply loop: No odd multiplies
+      "add x28, x28, #0x1\n"
+      "cmp x28, x20\n"
+      "bne 19b\n"
+      "sadalp v0.4s, v1.8h\n"
+      "sadalp v29.4s, v30.8h\n"
+      "addp v0.4s, v0.4s, v29.4s\n"
+      "addp v0.4s, v0.4s, v0.4s\n"
+      "mul v0.4s, v0.4s, v2.4s\n"
+      "str d0, [%x[out_ptr]], #0x8\n"
+      "b 104f\n"
+      "35:"  // Height 3
+      "movi v1.8h, #0x0\n"
+      "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "mov x9, #0x0\n"
+      "movi v0.4s, #0x0\n"
+      "mov x28, #0x0\n"
+      "movi v30.8h, #0x0\n"
+      "movi v29.4s, #0x0\n"
+      "movi v27.8h, #0x0\n"
+      "movi v26.4s, #0x0\n"
+      "36:"  // Height 3: String loop
+      "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr w27, [x19, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 37f\n"
+      "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x19, x19, %x[input_offset], LSL #3\n"
+      "ldr x26, [x19, #0x0]\n"
+      "ldr x25, [x19, #0x8]\n"
+      "ldr x24, [x19, #0x10]\n"
+      "cbnz x28, 38f\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "add x25, x25, x19\n"
+      "add x24, x24, x19\n"
+      "b 38f\n"
+      "37:"  // Height 3: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "add x25, x26, %x[input_offset]\n"
+      "add x24, x25, %x[input_offset]\n"
+      "38:"  // Height 3: input setup done
+      "cmp x27, #0x10\n"
+      "blt 42f\n"
+      "cmp x27, #0x20\n"
+      "blt 41f\n"
+      "39:"  // Height 3: Multiply loop: Main loop head
+      "ldr q31, [x26, #0x0]\n"
+      "ldr q28, [x25, #0x0]\n"
+      "ldr q25, [x24, #0x0]\n"
+      "cmp x9, #0x7e\n"
+      "add x26, x26, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "add x24, x24, #0x10\n"
+      "blt 40f\n"
+      "sadalp v0.4s, v1.8h\n"
+      "movi v1.8h, #0x0\n"
+      "sadalp v29.4s, v30.8h\n"
+      "movi v30.8h, #0x0\n"
+      "sadalp v26.4s, v27.8h\n"
+      "movi v27.8h, #0x0\n"
+      "mov x9, #0x0\n"
+      "40:"  // Height 3: Multiply loop: unique 3: no collapse
+      "sadalp v1.8h, v31.16b\n"
+      "sadalp v30.8h, v28.16b\n"
+      "sadalp v27.8h, v25.16b\n"
+      "add x9, x9, #0x1\n"
+      "sub x27, x27, #0x10\n"
+      "cmp x27, #0x20\n"
+      "bge 39b\n"
+      "41:"  // Height 3: Multiply loop: Single iteration only
+      "sub x27, x27, #0x10\n"
+      "ldr q31, [x26, #0x0]\n"
+      "ldr q28, [x25, #0x0]\n"
+      "ldr q25, [x24, #0x0]\n"
+      "add x26, x26, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "sadalp v1.8h, v31.16b\n"
+      "sadalp v30.8h, v28.16b\n"
+      "sadalp v27.8h, v25.16b\n"
+      "add x24, x24, #0x10\n"
+      "42:"  // Height 3: Multiply loop: Main loop skip
+      "cbz x27, 51f\n"
+      "tbz x27, #3, 46f\n"
+      "ldr d31, [x26], #0x8\n"
+      "ldr d28, [x25], #0x8\n"
+      "ldr d25, [x24], #0x8\n"
+      "tbz x27, #2, 44f\n"
+      "ld1 { v31.s }[2], [x26], #0x4\n"
+      "ld1 { v28.s }[2], [x25], #0x4\n"
+      "ld1 { v25.s }[2], [x24], #0x4\n"
+      "tbz x27, #1, 43f\n"
+      "ld1 { v31.h }[6], [x26], #0x2\n"
+      "ld1 { v28.h }[6], [x25], #0x2\n"
+      "ld1 { v25.h }[6], [x24], #0x2\n"
+      "tbz x27, #0, 50f\n"
+      "ld1 { v31.b }[14], [x26]\n"
+      "ld1 { v28.b }[14], [x25]\n"
+      "ld1 { v25.b }[14], [x24]\n"
+      "b 50f\n"
+      "43:"  // Height 3: Multiply loop: Ragged operand read: partial_1_12
+      "tbz x27, #0, 50f\n"
+      "ld1 { v31.b }[12], [x26]\n"
+      "ld1 { v28.b }[12], [x25]\n"
+      "ld1 { v25.b }[12], [x24]\n"
+      "b 50f\n"
+      "44:"  // Height 3: Multiply loop: Ragged operand read: partial_2_8
+      "tbz x27, #1, 45f\n"
+      "ld1 { v31.h }[4], [x26], #0x2\n"
+      "ld1 { v28.h }[4], [x25], #0x2\n"
+      "ld1 { v25.h }[4], [x24], #0x2\n"
+      "tbz x27, #0, 50f\n"
+      "ld1 { v31.b }[10], [x26]\n"
+      "ld1 { v28.b }[10], [x25]\n"
+      "ld1 { v25.b }[10], [x24]\n"
+      "b 50f\n"
+      "45:"  // Height 3: Multiply loop: Ragged operand read: partial_1_8
+      "tbz x27, #0, 50f\n"
+      "ld1 { v31.b }[8], [x26]\n"
+      "ld1 { v28.b }[8], [x25]\n"
+      "ld1 { v25.b }[8], [x24]\n"
+      "b 50f\n"
+      "46:"  // Height 3: Multiply loop: Ragged operand read: partial_4_0
+      "tbz x27, #2, 48f\n"
+      "ldr s31, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
+      "ldr s25, [x24], #0x4\n"
+      "tbz x27, #1, 47f\n"
+      "ld1 { v31.h }[2], [x26], #0x2\n"
+      "ld1 { v28.h }[2], [x25], #0x2\n"
+      "ld1 { v25.h }[2], [x24], #0x2\n"
+      "tbz x27, #0, 50f\n"
+      "ld1 { v31.b }[6], [x26]\n"
+      "ld1 { v28.b }[6], [x25]\n"
+      "ld1 { v25.b }[6], [x24]\n"
+      "b 50f\n"
+      "47:"  // Height 3: Multiply loop: Ragged operand read: partial_1_4
+      "tbz x27, #0, 50f\n"
+      "ld1 { v31.b }[4], [x26]\n"
+      "ld1 { v28.b }[4], [x25]\n"
+      "ld1 { v25.b }[4], [x24]\n"
+      "b 50f\n"
+      "48:"  // Height 3: Multiply loop: Ragged operand read: partial_2_0
+      "tbz x27, #1, 49f\n"
+      "ldr h31, [x26], #0x2\n"
+      "ldr h28, [x25], #0x2\n"
+      "ldr h25, [x24], #0x2\n"
+      "tbz x27, #0, 50f\n"
+      "ld1 { v31.b }[2], [x26]\n"
+      "ld1 { v28.b }[2], [x25]\n"
+      "ld1 { v25.b }[2], [x24]\n"
+      "b 50f\n"
+      "49:"  // Height 3: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b31, [x26, #0x0]\n"
+      "ldr b28, [x25, #0x0]\n"
+      "ldr b25, [x24, #0x0]\n"
+      "50:"  // Height 3: Multiply loop: Ragged operand read: Done
+      "sadalp v1.8h, v31.16b\n"
+      "sadalp v30.8h, v28.16b\n"
+      "sadalp v27.8h, v25.16b\n"
+      "51:"  // Height 3: Multiply loop: No odd multiplies
+      "add x28, x28, #0x1\n"
+      "cmp x28, x20\n"
+      "bne 36b\n"
+      "sadalp v0.4s, v1.8h\n"
+      "sadalp v29.4s, v30.8h\n"
+      "addp v0.4s, v0.4s, v29.4s\n"
+      "sadalp v26.4s, v27.8h\n"
+      "addp v0.4s, v0.4s, v0.4s\n"
+      "addp v26.4s, v26.4s, v26.4s\n"
+      "mul v0.4s, v0.4s, v2.4s\n"
+      "str d0, [%x[out_ptr]], #0x8\n"
+      "addp v26.4s, v26.4s, v26.4s\n"
+      "mul v26.4s, v26.4s, v2.4s\n"
+      "str s26, [%x[out_ptr]], #0x4\n"
+      "b 104f\n"
+      "52:"  // Height 4
+      "movi v1.8h, #0x0\n"
+      "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "mov x9, #0x0\n"
+      "movi v0.4s, #0x0\n"
+      "mov x28, #0x0\n"
+      "movi v30.8h, #0x0\n"
+      "movi v29.4s, #0x0\n"
+      "movi v27.8h, #0x0\n"
+      "movi v26.4s, #0x0\n"
+      "movi v24.8h, #0x0\n"
+      "movi v23.4s, #0x0\n"
+      "53:"  // Height 4: String loop
+      "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr w27, [x19, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 54f\n"
+      "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x19, x19, %x[input_offset], LSL #3\n"
+      "ldr x26, [x19, #0x0]\n"
+      "ldr x25, [x19, #0x8]\n"
+      "ldr x24, [x19, #0x10]\n"
+      "ldr x23, [x19, #0x18]\n"
+      "cbnz x28, 55f\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "add x25, x25, x19\n"
+      "add x24, x24, x19\n"
+      "add x23, x23, x19\n"
+      "b 55f\n"
+      "54:"  // Height 4: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "add x25, x26, %x[input_offset]\n"
+      "add x24, x25, %x[input_offset]\n"
+      "add x23, x24, %x[input_offset]\n"
+      "55:"  // Height 4: input setup done
+      "cmp x27, #0x10\n"
+      "blt 59f\n"
+      "cmp x27, #0x20\n"
+      "blt 58f\n"
+      "56:"  // Height 4: Multiply loop: Main loop head
+      "ldr q31, [x26, #0x0]\n"
+      "ldr q28, [x25, #0x0]\n"
+      "ldr q25, [x24, #0x0]\n"
+      "ldr q22, [x23, #0x0]\n"
+      "cmp x9, #0x7e\n"
+      "add x26, x26, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "add x24, x24, #0x10\n"
+      "add x23, x23, #0x10\n"
+      "blt 57f\n"
+      "sadalp v0.4s, v1.8h\n"
+      "movi v1.8h, #0x0\n"
+      "sadalp v29.4s, v30.8h\n"
+      "movi v30.8h, #0x0\n"
+      "sadalp v26.4s, v27.8h\n"
+      "movi v27.8h, #0x0\n"
+      "sadalp v23.4s, v24.8h\n"
+      "movi v24.8h, #0x0\n"
+      "mov x9, #0x0\n"
+      "57:"  // Height 4: Multiply loop: unique 4: no collapse
+      "sadalp v1.8h, v31.16b\n"
+      "sadalp v30.8h, v28.16b\n"
+      "sadalp v27.8h, v25.16b\n"
+      "sadalp v24.8h, v22.16b\n"
+      "add x9, x9, #0x1\n"
+      "sub x27, x27, #0x10\n"
+      "cmp x27, #0x20\n"
+      "bge 56b\n"
+      "58:"  // Height 4: Multiply loop: Single iteration only
+      "sub x27, x27, #0x10\n"
+      "ldr q31, [x26, #0x0]\n"
+      "ldr q28, [x25, #0x0]\n"
+      "ldr q25, [x24, #0x0]\n"
+      "ldr q22, [x23, #0x0]\n"
+      "add x26, x26, #0x10\n"
+      "sadalp v1.8h, v31.16b\n"
+      "sadalp v30.8h, v28.16b\n"
+      "sadalp v27.8h, v25.16b\n"
+      "sadalp v24.8h, v22.16b\n"
+      "add x25, x25, #0x10\n"
+      "add x24, x24, #0x10\n"
+      "add x23, x23, #0x10\n"
+      "59:"  // Height 4: Multiply loop: Main loop skip
+      "cbz x27, 68f\n"
+      "tbz x27, #3, 63f\n"
+      "ldr d31, [x26], #0x8\n"
+      "ldr d28, [x25], #0x8\n"
+      "ldr d25, [x24], #0x8\n"
+      "ldr d22, [x23], #0x8\n"
+      "tbz x27, #2, 61f\n"
+      "ld1 { v31.s }[2], [x26], #0x4\n"
+      "ld1 { v28.s }[2], [x25], #0x4\n"
+      "ld1 { v25.s }[2], [x24], #0x4\n"
+      "ld1 { v22.s }[2], [x23], #0x4\n"
+      "tbz x27, #1, 60f\n"
+      "ld1 { v31.h }[6], [x26], #0x2\n"
+      "ld1 { v28.h }[6], [x25], #0x2\n"
+      "ld1 { v25.h }[6], [x24], #0x2\n"
+      "ld1 { v22.h }[6], [x23], #0x2\n"
+      "tbz x27, #0, 67f\n"
+      "ld1 { v31.b }[14], [x26]\n"
+      "ld1 { v28.b }[14], [x25]\n"
+      "ld1 { v25.b }[14], [x24]\n"
+      "ld1 { v22.b }[14], [x23]\n"
+      "b 67f\n"
+      "60:"  // Height 4: Multiply loop: Ragged operand read: partial_1_12
+      "tbz x27, #0, 67f\n"
+      "ld1 { v31.b }[12], [x26]\n"
+      "ld1 { v28.b }[12], [x25]\n"
+      "ld1 { v25.b }[12], [x24]\n"
+      "ld1 { v22.b }[12], [x23]\n"
+      "b 67f\n"
+      "61:"  // Height 4: Multiply loop: Ragged operand read: partial_2_8
+      "tbz x27, #1, 62f\n"
+      "ld1 { v31.h }[4], [x26], #0x2\n"
+      "ld1 { v28.h }[4], [x25], #0x2\n"
+      "ld1 { v25.h }[4], [x24], #0x2\n"
+      "ld1 { v22.h }[4], [x23], #0x2\n"
+      "tbz x27, #0, 67f\n"
+      "ld1 { v31.b }[10], [x26]\n"
+      "ld1 { v28.b }[10], [x25]\n"
+      "ld1 { v25.b }[10], [x24]\n"
+      "ld1 { v22.b }[10], [x23]\n"
+      "b 67f\n"
+      "62:"  // Height 4: Multiply loop: Ragged operand read: partial_1_8
+      "tbz x27, #0, 67f\n"
+      "ld1 { v31.b }[8], [x26]\n"
+      "ld1 { v28.b }[8], [x25]\n"
+      "ld1 { v25.b }[8], [x24]\n"
+      "ld1 { v22.b }[8], [x23]\n"
+      "b 67f\n"
+      "63:"  // Height 4: Multiply loop: Ragged operand read: partial_4_0
+      "tbz x27, #2, 65f\n"
+      "ldr s31, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
+      "ldr s25, [x24], #0x4\n"
+      "ldr s22, [x23], #0x4\n"
+      "tbz x27, #1, 64f\n"
+      "ld1 { v31.h }[2], [x26], #0x2\n"
+      "ld1 { v28.h }[2], [x25], #0x2\n"
+      "ld1 { v25.h }[2], [x24], #0x2\n"
+      "ld1 { v22.h }[2], [x23], #0x2\n"
+      "tbz x27, #0, 67f\n"
+      "ld1 { v31.b }[6], [x26]\n"
+      "ld1 { v28.b }[6], [x25]\n"
+      "ld1 { v25.b }[6], [x24]\n"
+      "ld1 { v22.b }[6], [x23]\n"
+      "b 67f\n"
+      "64:"  // Height 4: Multiply loop: Ragged operand read: partial_1_4
+      "tbz x27, #0, 67f\n"
+      "ld1 { v31.b }[4], [x26]\n"
+      "ld1 { v28.b }[4], [x25]\n"
+      "ld1 { v25.b }[4], [x24]\n"
+      "ld1 { v22.b }[4], [x23]\n"
+      "b 67f\n"
+      "65:"  // Height 4: Multiply loop: Ragged operand read: partial_2_0
+      "tbz x27, #1, 66f\n"
+      "ldr h31, [x26], #0x2\n"
+      "ldr h28, [x25], #0x2\n"
+      "ldr h25, [x24], #0x2\n"
+      "ldr h22, [x23], #0x2\n"
+      "tbz x27, #0, 67f\n"
+      "ld1 { v31.b }[2], [x26]\n"
+      "ld1 { v28.b }[2], [x25]\n"
+      "ld1 { v25.b }[2], [x24]\n"
+      "ld1 { v22.b }[2], [x23]\n"
+      "b 67f\n"
+      "66:"  // Height 4: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b31, [x26, #0x0]\n"
+      "ldr b28, [x25, #0x0]\n"
+      "ldr b25, [x24, #0x0]\n"
+      "ldr b22, [x23, #0x0]\n"
+      "67:"  // Height 4: Multiply loop: Ragged operand read: Done
+      "sadalp v1.8h, v31.16b\n"
+      "sadalp v30.8h, v28.16b\n"
+      "sadalp v27.8h, v25.16b\n"
+      "sadalp v24.8h, v22.16b\n"
+      "68:"  // Height 4: Multiply loop: No odd multiplies
+      "add x28, x28, #0x1\n"
+      "cmp x28, x20\n"
+      "bne 53b\n"
+      "sadalp v0.4s, v1.8h\n"
+      "sadalp v29.4s, v30.8h\n"
+      "addp v0.4s, v0.4s, v29.4s\n"
+      "sadalp v26.4s, v27.8h\n"
+      "sadalp v23.4s, v24.8h\n"
+      "addp v29.4s, v26.4s, v23.4s\n"
+      "addp v0.4s, v0.4s, v29.4s\n"
+      "mul v0.4s, v0.4s, v2.4s\n"
+      "st1 { v0.4s }, [%x[out_ptr]], #0x10\n"
+      "b 104f\n"
+      "69:"  // Height 5
+      "movi v1.8h, #0x0\n"
+      "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "mov x9, #0x0\n"
+      "movi v0.4s, #0x0\n"
+      "mov x28, #0x0\n"
+      "movi v30.8h, #0x0\n"
+      "movi v29.4s, #0x0\n"
+      "movi v27.8h, #0x0\n"
+      "movi v26.4s, #0x0\n"
+      "movi v24.8h, #0x0\n"
+      "movi v23.4s, #0x0\n"
+      "movi v21.8h, #0x0\n"
+      "movi v20.4s, #0x0\n"
+      "70:"  // Height 5: String loop
+      "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr w27, [x19, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 71f\n"
+      "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x19, x19, %x[input_offset], LSL #3\n"
+      "ldr x26, [x19, #0x0]\n"
+      "ldr x25, [x19, #0x8]\n"
+      "ldr x24, [x19, #0x10]\n"
+      "ldr x23, [x19, #0x18]\n"
+      "ldr x22, [x19, #0x20]\n"
+      "cbnz x28, 72f\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "add x25, x25, x19\n"
+      "add x24, x24, x19\n"
+      "add x23, x23, x19\n"
+      "add x22, x22, x19\n"
+      "b 72f\n"
+      "71:"  // Height 5: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "add x25, x26, %x[input_offset]\n"
+      "add x24, x25, %x[input_offset]\n"
+      "add x23, x24, %x[input_offset]\n"
+      "add x22, x23, %x[input_offset]\n"
+      "72:"  // Height 5: input setup done
+      "cmp x27, #0x10\n"
+      "blt 76f\n"
+      "cmp x27, #0x20\n"
+      "blt 75f\n"
+      "73:"  // Height 5: Multiply loop: Main loop head
+      "ldr q31, [x26, #0x0]\n"
+      "ldr q28, [x25, #0x0]\n"
+      "ldr q25, [x24, #0x0]\n"
+      "ldr q22, [x23, #0x0]\n"
+      "ldr q19, [x22, #0x0]\n"
+      "cmp x9, #0x7e\n"
+      "add x26, x26, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "add x24, x24, #0x10\n"
+      "add x23, x23, #0x10\n"
+      "add x22, x22, #0x10\n"
+      "blt 74f\n"
+      "sadalp v0.4s, v1.8h\n"
+      "movi v1.8h, #0x0\n"
+      "sadalp v29.4s, v30.8h\n"
+      "movi v30.8h, #0x0\n"
+      "sadalp v26.4s, v27.8h\n"
+      "movi v27.8h, #0x0\n"
+      "sadalp v23.4s, v24.8h\n"
+      "movi v24.8h, #0x0\n"
+      "sadalp v20.4s, v21.8h\n"
+      "movi v21.8h, #0x0\n"
+      "mov x9, #0x0\n"
+      "74:"  // Height 5: Multiply loop: unique 5: no collapse
+      "sadalp v1.8h, v31.16b\n"
+      "sadalp v30.8h, v28.16b\n"
+      "sadalp v27.8h, v25.16b\n"
+      "sadalp v24.8h, v22.16b\n"
+      "sadalp v21.8h, v19.16b\n"
+      "add x9, x9, #0x1\n"
+      "sub x27, x27, #0x10\n"
+      "cmp x27, #0x20\n"
+      "bge 73b\n"
+      "75:"  // Height 5: Multiply loop: Single iteration only
+      "sub x27, x27, #0x10\n"
+      "ldr q31, [x26, #0x0]\n"
+      "ldr q28, [x25, #0x0]\n"
+      "ldr q25, [x24, #0x0]\n"
+      "ldr q22, [x23, #0x0]\n"
+      "ldr q19, [x22, #0x0]\n"
+      "sadalp v1.8h, v31.16b\n"
+      "sadalp v30.8h, v28.16b\n"
+      "sadalp v27.8h, v25.16b\n"
+      "sadalp v24.8h, v22.16b\n"
+      "sadalp v21.8h, v19.16b\n"
+      "add x26, x26, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "add x24, x24, #0x10\n"
+      "add x23, x23, #0x10\n"
+      "add x22, x22, #0x10\n"
+      "76:"  // Height 5: Multiply loop: Main loop skip
+      "cbz x27, 85f\n"
+      "tbz x27, #3, 80f\n"
+      "ldr d31, [x26], #0x8\n"
+      "ldr d28, [x25], #0x8\n"
+      "ldr d25, [x24], #0x8\n"
+      "ldr d22, [x23], #0x8\n"
+      "ldr d19, [x22], #0x8\n"
+      "tbz x27, #2, 78f\n"
+      "ld1 { v31.s }[2], [x26], #0x4\n"
+      "ld1 { v28.s }[2], [x25], #0x4\n"
+      "ld1 { v25.s }[2], [x24], #0x4\n"
+      "ld1 { v22.s }[2], [x23], #0x4\n"
+      "ld1 { v19.s }[2], [x22], #0x4\n"
+      "tbz x27, #1, 77f\n"
+      "ld1 { v31.h }[6], [x26], #0x2\n"
+      "ld1 { v28.h }[6], [x25], #0x2\n"
+      "ld1 { v25.h }[6], [x24], #0x2\n"
+      "ld1 { v22.h }[6], [x23], #0x2\n"
+      "ld1 { v19.h }[6], [x22], #0x2\n"
+      "tbz x27, #0, 84f\n"
+      "ld1 { v31.b }[14], [x26]\n"
+      "ld1 { v28.b }[14], [x25]\n"
+      "ld1 { v25.b }[14], [x24]\n"
+      "ld1 { v22.b }[14], [x23]\n"
+      "ld1 { v19.b }[14], [x22]\n"
+      "b 84f\n"
+      "77:"  // Height 5: Multiply loop: Ragged operand read: partial_1_12
+      "tbz x27, #0, 84f\n"
+      "ld1 { v31.b }[12], [x26]\n"
+      "ld1 { v28.b }[12], [x25]\n"
+      "ld1 { v25.b }[12], [x24]\n"
+      "ld1 { v22.b }[12], [x23]\n"
+      "ld1 { v19.b }[12], [x22]\n"
+      "b 84f\n"
+      "78:"  // Height 5: Multiply loop: Ragged operand read: partial_2_8
+      "tbz x27, #1, 79f\n"
+      "ld1 { v31.h }[4], [x26], #0x2\n"
+      "ld1 { v28.h }[4], [x25], #0x2\n"
+      "ld1 { v25.h }[4], [x24], #0x2\n"
+      "ld1 { v22.h }[4], [x23], #0x2\n"
+      "ld1 { v19.h }[4], [x22], #0x2\n"
+      "tbz x27, #0, 84f\n"
+      "ld1 { v31.b }[10], [x26]\n"
+      "ld1 { v28.b }[10], [x25]\n"
+      "ld1 { v25.b }[10], [x24]\n"
+      "ld1 { v22.b }[10], [x23]\n"
+      "ld1 { v19.b }[10], [x22]\n"
+      "b 84f\n"
+      "79:"  // Height 5: Multiply loop: Ragged operand read: partial_1_8
+      "tbz x27, #0, 84f\n"
+      "ld1 { v31.b }[8], [x26]\n"
+      "ld1 { v28.b }[8], [x25]\n"
+      "ld1 { v25.b }[8], [x24]\n"
+      "ld1 { v22.b }[8], [x23]\n"
+      "ld1 { v19.b }[8], [x22]\n"
+      "b 84f\n"
+      "80:"  // Height 5: Multiply loop: Ragged operand read: partial_4_0
+      "tbz x27, #2, 82f\n"
+      "ldr s31, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
+      "ldr s25, [x24], #0x4\n"
+      "ldr s22, [x23], #0x4\n"
+      "ldr s19, [x22], #0x4\n"
+      "tbz x27, #1, 81f\n"
+      "ld1 { v31.h }[2], [x26], #0x2\n"
+      "ld1 { v28.h }[2], [x25], #0x2\n"
+      "ld1 { v25.h }[2], [x24], #0x2\n"
+      "ld1 { v22.h }[2], [x23], #0x2\n"
+      "ld1 { v19.h }[2], [x22], #0x2\n"
+      "tbz x27, #0, 84f\n"
+      "ld1 { v31.b }[6], [x26]\n"
+      "ld1 { v28.b }[6], [x25]\n"
+      "ld1 { v25.b }[6], [x24]\n"
+      "ld1 { v22.b }[6], [x23]\n"
+      "ld1 { v19.b }[6], [x22]\n"
+      "b 84f\n"
+      "81:"  // Height 5: Multiply loop: Ragged operand read: partial_1_4
+      "tbz x27, #0, 84f\n"
+      "ld1 { v31.b }[4], [x26]\n"
+      "ld1 { v28.b }[4], [x25]\n"
+      "ld1 { v25.b }[4], [x24]\n"
+      "ld1 { v22.b }[4], [x23]\n"
+      "ld1 { v19.b }[4], [x22]\n"
+      "b 84f\n"
+      "82:"  // Height 5: Multiply loop: Ragged operand read: partial_2_0
+      "tbz x27, #1, 83f\n"
+      "ldr h31, [x26], #0x2\n"
+      "ldr h28, [x25], #0x2\n"
+      "ldr h25, [x24], #0x2\n"
+      "ldr h22, [x23], #0x2\n"
+      "ldr h19, [x22], #0x2\n"
+      "tbz x27, #0, 84f\n"
+      "ld1 { v31.b }[2], [x26]\n"
+      "ld1 { v28.b }[2], [x25]\n"
+      "ld1 { v25.b }[2], [x24]\n"
+      "ld1 { v22.b }[2], [x23]\n"
+      "ld1 { v19.b }[2], [x22]\n"
+      "b 84f\n"
+      "83:"  // Height 5: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b31, [x26, #0x0]\n"
+      "ldr b28, [x25, #0x0]\n"
+      "ldr b25, [x24, #0x0]\n"
+      "ldr b22, [x23, #0x0]\n"
+      "ldr b19, [x22, #0x0]\n"
+      "84:"  // Height 5: Multiply loop: Ragged operand read: Done
+      "sadalp v1.8h, v31.16b\n"
+      "sadalp v30.8h, v28.16b\n"
+      "sadalp v27.8h, v25.16b\n"
+      "sadalp v24.8h, v22.16b\n"
+      "sadalp v21.8h, v19.16b\n"
+      "85:"  // Height 5: Multiply loop: No odd multiplies
+      "add x28, x28, #0x1\n"
+      "cmp x28, x20\n"
+      "bne 70b\n"
+      "sadalp v0.4s, v1.8h\n"
+      "sadalp v29.4s, v30.8h\n"
+      "addp v0.4s, v0.4s, v29.4s\n"
+      "sadalp v26.4s, v27.8h\n"
+      "sadalp v23.4s, v24.8h\n"
+      "addp v29.4s, v26.4s, v23.4s\n"
+      "sadalp v20.4s, v21.8h\n"
+      "addp v0.4s, v0.4s, v29.4s\n"
+      "addp v20.4s, v20.4s, v20.4s\n"
+      "mul v0.4s, v0.4s, v2.4s\n"
+      "st1 { v0.4s }, [%x[out_ptr]], #0x10\n"
+      "addp v20.4s, v20.4s, v20.4s\n"
+      "mul v20.4s, v20.4s, v2.4s\n"
+      "str s20, [%x[out_ptr]], #0x4\n"
+      "b 104f\n"
+      "86:"  // Height 6
+      "movi v1.8h, #0x0\n"
+      "ldr w21, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "mov x9, #0x0\n"
+      "movi v0.4s, #0x0\n"
+      "mov x28, #0x0\n"
+      "movi v30.8h, #0x0\n"
+      "movi v29.4s, #0x0\n"
+      "movi v27.8h, #0x0\n"
+      "movi v26.4s, #0x0\n"
+      "movi v24.8h, #0x0\n"
+      "movi v23.4s, #0x0\n"
+      "movi v21.8h, #0x0\n"
+      "movi v20.4s, #0x0\n"
+      "movi v18.8h, #0x0\n"
+      "movi v17.4s, #0x0\n"
+      "87:"  // Height 6: String loop
+      "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr w27, [x19, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 88f\n"
+      "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x19, x19, %x[input_offset], LSL #3\n"
+      "ldr x26, [x19, #0x0]\n"
+      "ldr x25, [x19, #0x8]\n"
+      "ldr x24, [x19, #0x10]\n"
+      "ldr x23, [x19, #0x18]\n"
+      "ldr x22, [x19, #0x20]\n"
+      "ldr x20, [x19, #0x28]\n"
+      "cbnz x28, 89f\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "add x25, x25, x19\n"
+      "add x24, x24, x19\n"
+      "add x23, x23, x19\n"
+      "add x22, x22, x19\n"
+      "add x20, x20, x19\n"
+      "b 89f\n"
+      "88:"  // Height 6: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "add x25, x26, %x[input_offset]\n"
+      "add x24, x25, %x[input_offset]\n"
+      "add x23, x24, %x[input_offset]\n"
+      "add x22, x23, %x[input_offset]\n"
+      "add x20, x22, %x[input_offset]\n"
+      "89:"  // Height 6: input setup done
+      "cmp x27, #0x10\n"
+      "blt 93f\n"
+      "cmp x27, #0x20\n"
+      "blt 92f\n"
+      "90:"  // Height 6: Multiply loop: Main loop head
+      "ldr q31, [x26, #0x0]\n"
+      "ldr q28, [x25, #0x0]\n"
+      "ldr q25, [x24, #0x0]\n"
+      "ldr q22, [x23, #0x0]\n"
+      "ldr q19, [x22, #0x0]\n"
+      "ldr q16, [x20, #0x0]\n"
+      "cmp x9, #0x7e\n"
+      "add x26, x26, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "add x24, x24, #0x10\n"
+      "add x23, x23, #0x10\n"
+      "add x22, x22, #0x10\n"
+      "add x20, x20, #0x10\n"
+      "blt 91f\n"
+      "sadalp v0.4s, v1.8h\n"
+      "movi v1.8h, #0x0\n"
+      "sadalp v29.4s, v30.8h\n"
+      "movi v30.8h, #0x0\n"
+      "sadalp v26.4s, v27.8h\n"
+      "movi v27.8h, #0x0\n"
+      "sadalp v23.4s, v24.8h\n"
+      "movi v24.8h, #0x0\n"
+      "sadalp v20.4s, v21.8h\n"
+      "movi v21.8h, #0x0\n"
+      "sadalp v17.4s, v18.8h\n"
+      "movi v18.8h, #0x0\n"
+      "mov x9, #0x0\n"
+      "91:"  // Height 6: Multiply loop: unique 6: no collapse
+      "sadalp v1.8h, v31.16b\n"
+      "sadalp v30.8h, v28.16b\n"
+      "sadalp v27.8h, v25.16b\n"
+      "sadalp v24.8h, v22.16b\n"
+      "sadalp v21.8h, v19.16b\n"
+      "sadalp v18.8h, v16.16b\n"
+      "add x9, x9, #0x1\n"
+      "sub x27, x27, #0x10\n"
+      "cmp x27, #0x20\n"
+      "bge 90b\n"
+      "92:"  // Height 6: Multiply loop: Single iteration only
+      "sub x27, x27, #0x10\n"
+      "ldr q31, [x26, #0x0]\n"
+      "ldr q28, [x25, #0x0]\n"
+      "ldr q25, [x24, #0x0]\n"
+      "ldr q22, [x23, #0x0]\n"
+      "ldr q19, [x22, #0x0]\n"
+      "ldr q16, [x20, #0x0]\n"
+      "sadalp v1.8h, v31.16b\n"
+      "sadalp v30.8h, v28.16b\n"
+      "sadalp v27.8h, v25.16b\n"
+      "sadalp v24.8h, v22.16b\n"
+      "sadalp v21.8h, v19.16b\n"
+      "sadalp v18.8h, v16.16b\n"
+      "add x26, x26, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "add x24, x24, #0x10\n"
+      "add x23, x23, #0x10\n"
+      "add x22, x22, #0x10\n"
+      "add x20, x20, #0x10\n"
+      "93:"  // Height 6: Multiply loop: Main loop skip
+      "cbz x27, 102f\n"
+      "tbz x27, #3, 97f\n"
+      "ldr d31, [x26], #0x8\n"
+      "ldr d28, [x25], #0x8\n"
+      "ldr d25, [x24], #0x8\n"
+      "ldr d22, [x23], #0x8\n"
+      "ldr d19, [x22], #0x8\n"
+      "ldr d16, [x20], #0x8\n"
+      "tbz x27, #2, 95f\n"
+      "ld1 { v31.s }[2], [x26], #0x4\n"
+      "ld1 { v28.s }[2], [x25], #0x4\n"
+      "ld1 { v25.s }[2], [x24], #0x4\n"
+      "ld1 { v22.s }[2], [x23], #0x4\n"
+      "ld1 { v19.s }[2], [x22], #0x4\n"
+      "ld1 { v16.s }[2], [x20], #0x4\n"
+      "tbz x27, #1, 94f\n"
+      "ld1 { v31.h }[6], [x26], #0x2\n"
+      "ld1 { v28.h }[6], [x25], #0x2\n"
+      "ld1 { v25.h }[6], [x24], #0x2\n"
+      "ld1 { v22.h }[6], [x23], #0x2\n"
+      "ld1 { v19.h }[6], [x22], #0x2\n"
+      "ld1 { v16.h }[6], [x20], #0x2\n"
+      "tbz x27, #0, 101f\n"
+      "ld1 { v31.b }[14], [x26]\n"
+      "ld1 { v28.b }[14], [x25]\n"
+      "ld1 { v25.b }[14], [x24]\n"
+      "ld1 { v22.b }[14], [x23]\n"
+      "ld1 { v19.b }[14], [x22]\n"
+      "ld1 { v16.b }[14], [x20]\n"
+      "b 101f\n"
+      "94:"  // Height 6: Multiply loop: Ragged operand read: partial_1_12
+      "tbz x27, #0, 101f\n"
+      "ld1 { v31.b }[12], [x26]\n"
+      "ld1 { v28.b }[12], [x25]\n"
+      "ld1 { v25.b }[12], [x24]\n"
+      "ld1 { v22.b }[12], [x23]\n"
+      "ld1 { v19.b }[12], [x22]\n"
+      "ld1 { v16.b }[12], [x20]\n"
+      "b 101f\n"
+      "95:"  // Height 6: Multiply loop: Ragged operand read: partial_2_8
+      "tbz x27, #1, 96f\n"
+      "ld1 { v31.h }[4], [x26], #0x2\n"
+      "ld1 { v28.h }[4], [x25], #0x2\n"
+      "ld1 { v25.h }[4], [x24], #0x2\n"
+      "ld1 { v22.h }[4], [x23], #0x2\n"
+      "ld1 { v19.h }[4], [x22], #0x2\n"
+      "ld1 { v16.h }[4], [x20], #0x2\n"
+      "tbz x27, #0, 101f\n"
+      "ld1 { v31.b }[10], [x26]\n"
+      "ld1 { v28.b }[10], [x25]\n"
+      "ld1 { v25.b }[10], [x24]\n"
+      "ld1 { v22.b }[10], [x23]\n"
+      "ld1 { v19.b }[10], [x22]\n"
+      "ld1 { v16.b }[10], [x20]\n"
+      "b 101f\n"
+      "96:"  // Height 6: Multiply loop: Ragged operand read: partial_1_8
+      "tbz x27, #0, 101f\n"
+      "ld1 { v31.b }[8], [x26]\n"
+      "ld1 { v28.b }[8], [x25]\n"
+      "ld1 { v25.b }[8], [x24]\n"
+      "ld1 { v22.b }[8], [x23]\n"
+      "ld1 { v19.b }[8], [x22]\n"
+      "ld1 { v16.b }[8], [x20]\n"
+      "b 101f\n"
+      "97:"  // Height 6: Multiply loop: Ragged operand read: partial_4_0
+      "tbz x27, #2, 99f\n"
+      "ldr s31, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
+      "ldr s25, [x24], #0x4\n"
+      "ldr s22, [x23], #0x4\n"
+      "ldr s19, [x22], #0x4\n"
+      "ldr s16, [x20], #0x4\n"
+      "tbz x27, #1, 98f\n"
+      "ld1 { v31.h }[2], [x26], #0x2\n"
+      "ld1 { v28.h }[2], [x25], #0x2\n"
+      "ld1 { v25.h }[2], [x24], #0x2\n"
+      "ld1 { v22.h }[2], [x23], #0x2\n"
+      "ld1 { v19.h }[2], [x22], #0x2\n"
+      "ld1 { v16.h }[2], [x20], #0x2\n"
+      "tbz x27, #0, 101f\n"
+      "ld1 { v31.b }[6], [x26]\n"
+      "ld1 { v28.b }[6], [x25]\n"
+      "ld1 { v25.b }[6], [x24]\n"
+      "ld1 { v22.b }[6], [x23]\n"
+      "ld1 { v19.b }[6], [x22]\n"
+      "ld1 { v16.b }[6], [x20]\n"
+      "b 101f\n"
+      "98:"  // Height 6: Multiply loop: Ragged operand read: partial_1_4
+      "tbz x27, #0, 101f\n"
+      "ld1 { v31.b }[4], [x26]\n"
+      "ld1 { v28.b }[4], [x25]\n"
+      "ld1 { v25.b }[4], [x24]\n"
+      "ld1 { v22.b }[4], [x23]\n"
+      "ld1 { v19.b }[4], [x22]\n"
+      "ld1 { v16.b }[4], [x20]\n"
+      "b 101f\n"
+      "99:"  // Height 6: Multiply loop: Ragged operand read: partial_2_0
+      "tbz x27, #1, 100f\n"
+      "ldr h31, [x26], #0x2\n"
+      "ldr h28, [x25], #0x2\n"
+      "ldr h25, [x24], #0x2\n"
+      "ldr h22, [x23], #0x2\n"
+      "ldr h19, [x22], #0x2\n"
+      "ldr h16, [x20], #0x2\n"
+      "tbz x27, #0, 101f\n"
+      "ld1 { v31.b }[2], [x26]\n"
+      "ld1 { v28.b }[2], [x25]\n"
+      "ld1 { v25.b }[2], [x24]\n"
+      "ld1 { v22.b }[2], [x23]\n"
+      "ld1 { v19.b }[2], [x22]\n"
+      "ld1 { v16.b }[2], [x20]\n"
+      "b 101f\n"
+      "100:"  // Height 6: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b31, [x26, #0x0]\n"
+      "ldr b28, [x25, #0x0]\n"
+      "ldr b25, [x24, #0x0]\n"
+      "ldr b22, [x23, #0x0]\n"
+      "ldr b19, [x22, #0x0]\n"
+      "ldr b16, [x20, #0x0]\n"
+      "101:"  // Height 6: Multiply loop: Ragged operand read: Done
+      "sadalp v1.8h, v31.16b\n"
+      "sadalp v30.8h, v28.16b\n"
+      "sadalp v27.8h, v25.16b\n"
+      "sadalp v24.8h, v22.16b\n"
+      "sadalp v21.8h, v19.16b\n"
+      "sadalp v18.8h, v16.16b\n"
+      "102:"  // Height 6: Multiply loop: No odd multiplies
+      "add x28, x28, #0x1\n"
+      "cmp x28, x21\n"
+      "bne 87b\n"
+      "sadalp v0.4s, v1.8h\n"
+      "sadalp v29.4s, v30.8h\n"
+      "addp v0.4s, v0.4s, v29.4s\n"
+      "sadalp v26.4s, v27.8h\n"
+      "sadalp v23.4s, v24.8h\n"
+      "addp v29.4s, v26.4s, v23.4s\n"
+      "sadalp v20.4s, v21.8h\n"
+      "sadalp v17.4s, v18.8h\n"
+      "addp v0.4s, v0.4s, v29.4s\n"
+      "subs %x[M], %x[M], #0x6\n"
+      "addp v20.4s, v20.4s, v17.4s\n"
+      "mul v0.4s, v0.4s, v2.4s\n"
+      "st1 { v0.4s }, [%x[out_ptr]], #0x10\n"
+      "addp v20.4s, v20.4s, v20.4s\n"
+      "mul v20.4s, v20.4s, v2.4s\n"
+      "str d20, [%x[out_ptr]], #0x8\n"
+      "beq 104f\n"
+      "tbz %x[flags], #3, 103f\n"
+      "add %x[input_offset], %x[input_offset], #0x6\n"
+      "b 1b\n"
+      "103:"  // Update direct input
+      "mov x19, #0x6\n"
+      "madd %x[input_ptr], x19, %x[input_offset], %x[input_ptr]\n"
+      "b 1b\n"
+      "104:"  // Exit
+
+      : [M] "+r" (M), [input_offset] "+r" (input_offset), [input_ptr] "+r" (input_ptr), [out_ptr] "+r" (out_ptr)
+      : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [flags] "r" (flags), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [qp] "r" (qp)
+      : "cc", "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+    );
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp
new file mode 100644
index 0000000000..f5709d92ac
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp
@@ -0,0 +1,1160 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "quantized.hpp"
+#include "utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+template<>
+void row_sums_indirect(
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+    size_t M, int32_t *out_ptr, const Requantize32 *qp
+)
+{
+    struct KernelArgs {
+        unsigned int num_strings;
+        const unsigned int *string_lengths;
+        unsigned int input_initial_col;
+    } ka;
+
+    unsigned long flags=0;
+    void *input_ptr;
+    size_t input_offset;
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        input_offset=A_arg.direct.stride;
+    }
+
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+
+    __asm__ __volatile__(
+      "add x19, %x[qp], %[b_offset]\n"
+      "ld1r { v2.4s }, [x19]\n"
+      "neg v2.4s, v2.4s\n"
+      "1:"  // Row loop
+      "cmp %x[M], #0x6\n"
+      "bge 86f\n"
+      "cmp %x[M], #0x4\n"
+      "bgt 69f\n"
+      "beq 52f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 35f\n"
+      "beq 18f\n"
+      "movi v1.8h, #0x0\n"
+      "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "movi v0.4s, #0x0\n"
+      "mov x9, #0x0\n"
+      "mov x28, #0x0\n"
+      "2:"  // Height 1: String loop
+      "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr w27, [x19, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 3f\n"
+      "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x19, x19, %x[input_offset], LSL #3\n"
+      "ldr x26, [x19, #0x0]\n"
+      "cbnz x28, 4f\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "b 4f\n"
+      "3:"  // Height 1: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "4:"  // Height 1: input setup done
+      "cmp x27, #0x10\n"
+      "blt 8f\n"
+      "cmp x27, #0x20\n"
+      "blt 7f\n"
+      "5:"  // Height 1: Multiply loop: Main loop head
+      "ldr q31, [x26, #0x0]\n"
+      "cmp x9, #0x7e\n"
+      "add x26, x26, #0x10\n"
+      "blt 6f\n"
+      "uadalp v0.4s, v1.8h\n"
+      "movi v1.8h, #0x0\n"
+      "mov x9, #0x0\n"
+      "6:"  // Height 1: Multiply loop: unique 1: no collapse
+      "uadalp v1.8h, v31.16b\n"
+      "add x9, x9, #0x1\n"
+      "sub x27, x27, #0x10\n"
+      "cmp x27, #0x20\n"
+      "bge 5b\n"
+      "7:"  // Height 1: Multiply loop: Single iteration only
+      "sub x27, x27, #0x10\n"
+      "ldr q31, [x26, #0x0]\n"
+      "add x26, x26, #0x10\n"
+      "uadalp v1.8h, v31.16b\n"
+      "8:"  // Height 1: Multiply loop: Main loop skip
+      "cbz x27, 17f\n"
+      "tbz x27, #3, 12f\n"
+      "ldr d31, [x26], #0x8\n"
+      "tbz x27, #2, 10f\n"
+      "ld1 { v31.s }[2], [x26], #0x4\n"
+      "tbz x27, #1, 9f\n"
+      "ld1 { v31.h }[6], [x26], #0x2\n"
+      "tbz x27, #0, 16f\n"
+      "ld1 { v31.b }[14], [x26]\n"
+      "b 16f\n"
+      "9:"  // Height 1: Multiply loop: Ragged operand read: partial_1_12
+      "tbz x27, #0, 16f\n"
+      "ld1 { v31.b }[12], [x26]\n"
+      "b 16f\n"
+      "10:"  // Height 1: Multiply loop: Ragged operand read: partial_2_8
+      "tbz x27, #1, 11f\n"
+      "ld1 { v31.h }[4], [x26], #0x2\n"
+      "tbz x27, #0, 16f\n"
+      "ld1 { v31.b }[10], [x26]\n"
+      "b 16f\n"
+      "11:"  // Height 1: Multiply loop: Ragged operand read: partial_1_8
+      "tbz x27, #0, 16f\n"
+      "ld1 { v31.b }[8], [x26]\n"
+      "b 16f\n"
+      "12:"  // Height 1: Multiply loop: Ragged operand read: partial_4_0
+      "tbz x27, #2, 14f\n"
+      "ldr s31, [x26], #0x4\n"
+      "tbz x27, #1, 13f\n"
+      "ld1 { v31.h }[2], [x26], #0x2\n"
+      "tbz x27, #0, 16f\n"
+      "ld1 { v31.b }[6], [x26]\n"
+      "b 16f\n"
+      "13:"  // Height 1: Multiply loop: Ragged operand read: partial_1_4
+      "tbz x27, #0, 16f\n"
+      "ld1 { v31.b }[4], [x26]\n"
+      "b 16f\n"
+      "14:"  // Height 1: Multiply loop: Ragged operand read: partial_2_0
+      "tbz x27, #1, 15f\n"
+      "ldr h31, [x26], #0x2\n"
+      "tbz x27, #0, 16f\n"
+      "ld1 { v31.b }[2], [x26]\n"
+      "b 16f\n"
+      "15:"  // Height 1: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b31, [x26, #0x0]\n"
+      "16:"  // Height 1: Multiply loop: Ragged operand read: Done
+      "uadalp v1.8h, v31.16b\n"
+      "17:"  // Height 1: Multiply loop: No odd multiplies
+      "add x28, x28, #0x1\n"
+      "cmp x28, x20\n"
+      "bne 2b\n"
+      "uadalp v0.4s, v1.8h\n"
+      "addp v0.4s, v0.4s, v0.4s\n"
+      "addp v0.4s, v0.4s, v0.4s\n"
+      "mul v0.4s, v0.4s, v2.4s\n"
+      "str s0, [%x[out_ptr]], #0x4\n"
+      "b 104f\n"
+      "18:"  // Height 2
+      "movi v1.8h, #0x0\n"
+      "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "mov x9, #0x0\n"
+      "movi v0.4s, #0x0\n"
+      "mov x28, #0x0\n"
+      "movi v30.8h, #0x0\n"
+      "movi v29.4s, #0x0\n"
+      "19:"  // Height 2: String loop
+      "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr w27, [x19, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 20f\n"
+      "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x19, x19, %x[input_offset], LSL #3\n"
+      "ldr x26, [x19, #0x0]\n"
+      "ldr x25, [x19, #0x8]\n"
+      "cbnz x28, 21f\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "add x25, x25, x19\n"
+      "b 21f\n"
+      "20:"  // Height 2: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "add x25, x26, %x[input_offset]\n"
+      "21:"  // Height 2: input setup done
+      "cmp x27, #0x10\n"
+      "blt 25f\n"
+      "cmp x27, #0x20\n"
+      "blt 24f\n"
+      "22:"  // Height 2: Multiply loop: Main loop head
+      "ldr q31, [x26, #0x0]\n"
+      "ldr q28, [x25, #0x0]\n"
+      "cmp x9, #0x7e\n"
+      "add x26, x26, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "blt 23f\n"
+      "uadalp v0.4s, v1.8h\n"
+      "movi v1.8h, #0x0\n"
+      "uadalp v29.4s, v30.8h\n"
+      "movi v30.8h, #0x0\n"
+      "mov x9, #0x0\n"
+      "23:"  // Height 2: Multiply loop: unique 2: no collapse
+      "uadalp v1.8h, v31.16b\n"
+      "uadalp v30.8h, v28.16b\n"
+      "add x9, x9, #0x1\n"
+      "sub x27, x27, #0x10\n"
+      "cmp x27, #0x20\n"
+      "bge 22b\n"
+      "24:"  // Height 2: Multiply loop: Single iteration only
+      "sub x27, x27, #0x10\n"
+      "ldr q31, [x26, #0x0]\n"
+      "ldr q28, [x25, #0x0]\n"
+      "add x26, x26, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "uadalp v1.8h, v31.16b\n"
+      "uadalp v30.8h, v28.16b\n"
+      "25:"  // Height 2: Multiply loop: Main loop skip
+      "cbz x27, 34f\n"
+      "tbz x27, #3, 29f\n"
+      "ldr d31, [x26], #0x8\n"
+      "ldr d28, [x25], #0x8\n"
+      "tbz x27, #2, 27f\n"
+      "ld1 { v31.s }[2], [x26], #0x4\n"
+      "ld1 { v28.s }[2], [x25], #0x4\n"
+      "tbz x27, #1, 26f\n"
+      "ld1 { v31.h }[6], [x26], #0x2\n"
+      "ld1 { v28.h }[6], [x25], #0x2\n"
+      "tbz x27, #0, 33f\n"
+      "ld1 { v31.b }[14], [x26]\n"
+      "ld1 { v28.b }[14], [x25]\n"
+      "b 33f\n"
+      "26:"  // Height 2: Multiply loop: Ragged operand read: partial_1_12
+      "tbz x27, #0, 33f\n"
+      "ld1 { v31.b }[12], [x26]\n"
+      "ld1 { v28.b }[12], [x25]\n"
+      "b 33f\n"
+      "27:"  // Height 2: Multiply loop: Ragged operand read: partial_2_8
+      "tbz x27, #1, 28f\n"
+      "ld1 { v31.h }[4], [x26], #0x2\n"
+      "ld1 { v28.h }[4], [x25], #0x2\n"
+      "tbz x27, #0, 33f\n"
+      "ld1 { v31.b }[10], [x26]\n"
+      "ld1 { v28.b }[10], [x25]\n"
+      "b 33f\n"
+      "28:"  // Height 2: Multiply loop: Ragged operand read: partial_1_8
+      "tbz x27, #0, 33f\n"
+      "ld1 { v31.b }[8], [x26]\n"
+      "ld1 { v28.b }[8], [x25]\n"
+      "b 33f\n"
+      "29:"  // Height 2: Multiply loop: Ragged operand read: partial_4_0
+      "tbz x27, #2, 31f\n"
+      "ldr s31, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
+      "tbz x27, #1, 30f\n"
+      "ld1 { v31.h }[2], [x26], #0x2\n"
+      "ld1 { v28.h }[2], [x25], #0x2\n"
+      "tbz x27, #0, 33f\n"
+      "ld1 { v31.b }[6], [x26]\n"
+      "ld1 { v28.b }[6], [x25]\n"
+      "b 33f\n"
+      "30:"  // Height 2: Multiply loop: Ragged operand read: partial_1_4
+      "tbz x27, #0, 33f\n"
+      "ld1 { v31.b }[4], [x26]\n"
+      "ld1 { v28.b }[4], [x25]\n"
+      "b 33f\n"
+      "31:"  // Height 2: Multiply loop: Ragged operand read: partial_2_0
+      "tbz x27, #1, 32f\n"
+      "ldr h31, [x26], #0x2\n"
+      "ldr h28, [x25], #0x2\n"
+      "tbz x27, #0, 33f\n"
+      "ld1 { v31.b }[2], [x26]\n"
+      "ld1 { v28.b }[2], [x25]\n"
+      "b 33f\n"
+      "32:"  // Height 2: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b31, [x26, #0x0]\n"
+      "ldr b28, [x25, #0x0]\n"
+      "33:"  // Height 2: Multiply loop: Ragged operand read: Done
+      "uadalp v1.8h, v31.16b\n"
+      "uadalp v30.8h, v28.16b\n"
+      "34:"  // Height 2: Multiply loop: No odd multiplies
+      "add x28, x28, #0x1\n"
+      "cmp x28, x20\n"
+      "bne 19b\n"
+      "uadalp v0.4s, v1.8h\n"
+      "uadalp v29.4s, v30.8h\n"
+      "addp v0.4s, v0.4s, v29.4s\n"
+      "addp v0.4s, v0.4s, v0.4s\n"
+      "mul v0.4s, v0.4s, v2.4s\n"
+      "str d0, [%x[out_ptr]], #0x8\n"
+      "b 104f\n"
+      "35:"  // Height 3
+      "movi v1.8h, #0x0\n"
+      "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "mov x9, #0x0\n"
+      "movi v0.4s, #0x0\n"
+      "mov x28, #0x0\n"
+      "movi v30.8h, #0x0\n"
+      "movi v29.4s, #0x0\n"
+      "movi v27.8h, #0x0\n"
+      "movi v26.4s, #0x0\n"
+      "36:"  // Height 3: String loop
+      "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr w27, [x19, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 37f\n"
+      "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x19, x19, %x[input_offset], LSL #3\n"
+      "ldr x26, [x19, #0x0]\n"
+      "ldr x25, [x19, #0x8]\n"
+      "ldr x24, [x19, #0x10]\n"
+      "cbnz x28, 38f\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "add x25, x25, x19\n"
+      "add x24, x24, x19\n"
+      "b 38f\n"
+      "37:"  // Height 3: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "add x25, x26, %x[input_offset]\n"
+      "add x24, x25, %x[input_offset]\n"
+      "38:"  // Height 3: input setup done
+      "cmp x27, #0x10\n"
+      "blt 42f\n"
+      "cmp x27, #0x20\n"
+      "blt 41f\n"
+      "39:"  // Height 3: Multiply loop: Main loop head
+      "ldr q31, [x26, #0x0]\n"
+      "ldr q28, [x25, #0x0]\n"
+      "ldr q25, [x24, #0x0]\n"
+      "cmp x9, #0x7e\n"
+      "add x26, x26, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "add x24, x24, #0x10\n"
+      "blt 40f\n"
+      "uadalp v0.4s, v1.8h\n"
+      "movi v1.8h, #0x0\n"
+      "uadalp v29.4s, v30.8h\n"
+      "movi v30.8h, #0x0\n"
+      "uadalp v26.4s, v27.8h\n"
+      "movi v27.8h, #0x0\n"
+      "mov x9, #0x0\n"
+      "40:"  // Height 3: Multiply loop: unique 3: no collapse
+      "uadalp v1.8h, v31.16b\n"
+      "uadalp v30.8h, v28.16b\n"
+      "uadalp v27.8h, v25.16b\n"
+      "add x9, x9, #0x1\n"
+      "sub x27, x27, #0x10\n"
+      "cmp x27, #0x20\n"
+      "bge 39b\n"
+      "41:"  // Height 3: Multiply loop: Single iteration only
+      "sub x27, x27, #0x10\n"
+      "ldr q31, [x26, #0x0]\n"
+      "ldr q28, [x25, #0x0]\n"
+      "ldr q25, [x24, #0x0]\n"
+      "add x26, x26, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "uadalp v1.8h, v31.16b\n"
+      "uadalp v30.8h, v28.16b\n"
+      "uadalp v27.8h, v25.16b\n"
+      "add x24, x24, #0x10\n"
+      "42:"  // Height 3: Multiply loop: Main loop skip
+      "cbz x27, 51f\n"
+      "tbz x27, #3, 46f\n"
+      "ldr d31, [x26], #0x8\n"
+      "ldr d28, [x25], #0x8\n"
+      "ldr d25, [x24], #0x8\n"
+      "tbz x27, #2, 44f\n"
+      "ld1 { v31.s }[2], [x26], #0x4\n"
+      "ld1 { v28.s }[2], [x25], #0x4\n"
+      "ld1 { v25.s }[2], [x24], #0x4\n"
+      "tbz x27, #1, 43f\n"
+      "ld1 { v31.h }[6], [x26], #0x2\n"
+      "ld1 { v28.h }[6], [x25], #0x2\n"
+      "ld1 { v25.h }[6], [x24], #0x2\n"
+      "tbz x27, #0, 50f\n"
+      "ld1 { v31.b }[14], [x26]\n"
+      "ld1 { v28.b }[14], [x25]\n"
+      "ld1 { v25.b }[14], [x24]\n"
+      "b 50f\n"
+      "43:"  // Height 3: Multiply loop: Ragged operand read: partial_1_12
+      "tbz x27, #0, 50f\n"
+      "ld1 { v31.b }[12], [x26]\n"
+      "ld1 { v28.b }[12], [x25]\n"
+      "ld1 { v25.b }[12], [x24]\n"
+      "b 50f\n"
+      "44:"  // Height 3: Multiply loop: Ragged operand read: partial_2_8
+      "tbz x27, #1, 45f\n"
+      "ld1 { v31.h }[4], [x26], #0x2\n"
+      "ld1 { v28.h }[4], [x25], #0x2\n"
+      "ld1 { v25.h }[4], [x24], #0x2\n"
+      "tbz x27, #0, 50f\n"
+      "ld1 { v31.b }[10], [x26]\n"
+      "ld1 { v28.b }[10], [x25]\n"
+      "ld1 { v25.b }[10], [x24]\n"
+      "b 50f\n"
+      "45:"  // Height 3: Multiply loop: Ragged operand read: partial_1_8
+      "tbz x27, #0, 50f\n"
+      "ld1 { v31.b }[8], [x26]\n"
+      "ld1 { v28.b }[8], [x25]\n"
+      "ld1 { v25.b }[8], [x24]\n"
+      "b 50f\n"
+      "46:"  // Height 3: Multiply loop: Ragged operand read: partial_4_0
+      "tbz x27, #2, 48f\n"
+      "ldr s31, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
+      "ldr s25, [x24], #0x4\n"
+      "tbz x27, #1, 47f\n"
+      "ld1 { v31.h }[2], [x26], #0x2\n"
+      "ld1 { v28.h }[2], [x25], #0x2\n"
+      "ld1 { v25.h }[2], [x24], #0x2\n"
+      "tbz x27, #0, 50f\n"
+      "ld1 { v31.b }[6], [x26]\n"
+      "ld1 { v28.b }[6], [x25]\n"
+      "ld1 { v25.b }[6], [x24]\n"
+      "b 50f\n"
+      "47:"  // Height 3: Multiply loop: Ragged operand read: partial_1_4
+      "tbz x27, #0, 50f\n"
+      "ld1 { v31.b }[4], [x26]\n"
+      "ld1 { v28.b }[4], [x25]\n"
+      "ld1 { v25.b }[4], [x24]\n"
+      "b 50f\n"
+      "48:"  // Height 3: Multiply loop: Ragged operand read: partial_2_0
+      "tbz x27, #1, 49f\n"
+      "ldr h31, [x26], #0x2\n"
+      "ldr h28, [x25], #0x2\n"
+      "ldr h25, [x24], #0x2\n"
+      "tbz x27, #0, 50f\n"
+      "ld1 { v31.b }[2], [x26]\n"
+      "ld1 { v28.b }[2], [x25]\n"
+      "ld1 { v25.b }[2], [x24]\n"
+      "b 50f\n"
+      "49:"  // Height 3: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b31, [x26, #0x0]\n"
+      "ldr b28, [x25, #0x0]\n"
+      "ldr b25, [x24, #0x0]\n"
+      "50:"  // Height 3: Multiply loop: Ragged operand read: Done
+      "uadalp v1.8h, v31.16b\n"
+      "uadalp v30.8h, v28.16b\n"
+      "uadalp v27.8h, v25.16b\n"
+      "51:"  // Height 3: Multiply loop: No odd multiplies
+      "add x28, x28, #0x1\n"
+      "cmp x28, x20\n"
+      "bne 36b\n"
+      "uadalp v0.4s, v1.8h\n"
+      "uadalp v29.4s, v30.8h\n"
+      "addp v0.4s, v0.4s, v29.4s\n"
+      "uadalp v26.4s, v27.8h\n"
+      "addp v0.4s, v0.4s, v0.4s\n"
+      "addp v26.4s, v26.4s, v26.4s\n"
+      "mul v0.4s, v0.4s, v2.4s\n"
+      "str d0, [%x[out_ptr]], #0x8\n"
+      "addp v26.4s, v26.4s, v26.4s\n"
+      "mul v26.4s, v26.4s, v2.4s\n"
+      "str s26, [%x[out_ptr]], #0x4\n"
+      "b 104f\n"
+      "52:"  // Height 4
+      "movi v1.8h, #0x0\n"
+      "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "mov x9, #0x0\n"
+      "movi v0.4s, #0x0\n"
+      "mov x28, #0x0\n"
+      "movi v30.8h, #0x0\n"
+      "movi v29.4s, #0x0\n"
+      "movi v27.8h, #0x0\n"
+      "movi v26.4s, #0x0\n"
+      "movi v24.8h, #0x0\n"
+      "movi v23.4s, #0x0\n"
+      "53:"  // Height 4: String loop
+      "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr w27, [x19, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 54f\n"
+      "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x19, x19, %x[input_offset], LSL #3\n"
+      "ldr x26, [x19, #0x0]\n"
+      "ldr x25, [x19, #0x8]\n"
+      "ldr x24, [x19, #0x10]\n"
+      "ldr x23, [x19, #0x18]\n"
+      "cbnz x28, 55f\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "add x25, x25, x19\n"
+      "add x24, x24, x19\n"
+      "add x23, x23, x19\n"
+      "b 55f\n"
+      "54:"  // Height 4: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "add x25, x26, %x[input_offset]\n"
+      "add x24, x25, %x[input_offset]\n"
+      "add x23, x24, %x[input_offset]\n"
+      "55:"  // Height 4: input setup done
+      "cmp x27, #0x10\n"
+      "blt 59f\n"
+      "cmp x27, #0x20\n"
+      "blt 58f\n"
+      "56:"  // Height 4: Multiply loop: Main loop head
+      "ldr q31, [x26, #0x0]\n"
+      "ldr q28, [x25, #0x0]\n"
+      "ldr q25, [x24, #0x0]\n"
+      "ldr q22, [x23, #0x0]\n"
+      "cmp x9, #0x7e\n"
+      "add x26, x26, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "add x24, x24, #0x10\n"
+      "add x23, x23, #0x10\n"
+      "blt 57f\n"
+      "uadalp v0.4s, v1.8h\n"
+      "movi v1.8h, #0x0\n"
+      "uadalp v29.4s, v30.8h\n"
+      "movi v30.8h, #0x0\n"
+      "uadalp v26.4s, v27.8h\n"
+      "movi v27.8h, #0x0\n"
+      "uadalp v23.4s, v24.8h\n"
+      "movi v24.8h, #0x0\n"
+      "mov x9, #0x0\n"
+      "57:"  // Height 4: Multiply loop: unique 4: no collapse
+      "uadalp v1.8h, v31.16b\n"
+      "uadalp v30.8h, v28.16b\n"
+      "uadalp v27.8h, v25.16b\n"
+      "uadalp v24.8h, v22.16b\n"
+      "add x9, x9, #0x1\n"
+      "sub x27, x27, #0x10\n"
+      "cmp x27, #0x20\n"
+      "bge 56b\n"
+      "58:"  // Height 4: Multiply loop: Single iteration only
+      "sub x27, x27, #0x10\n"
+      "ldr q31, [x26, #0x0]\n"
+      "ldr q28, [x25, #0x0]\n"
+      "ldr q25, [x24, #0x0]\n"
+      "ldr q22, [x23, #0x0]\n"
+      "add x26, x26, #0x10\n"
+      "uadalp v1.8h, v31.16b\n"
+      "uadalp v30.8h, v28.16b\n"
+      "uadalp v27.8h, v25.16b\n"
+      "uadalp v24.8h, v22.16b\n"
+      "add x25, x25, #0x10\n"
+      "add x24, x24, #0x10\n"
+      "add x23, x23, #0x10\n"
+      "59:"  // Height 4: Multiply loop: Main loop skip
+      "cbz x27, 68f\n"
+      "tbz x27, #3, 63f\n"
+      "ldr d31, [x26], #0x8\n"
+      "ldr d28, [x25], #0x8\n"
+      "ldr d25, [x24], #0x8\n"
+      "ldr d22, [x23], #0x8\n"
+      "tbz x27, #2, 61f\n"
+      "ld1 { v31.s }[2], [x26], #0x4\n"
+      "ld1 { v28.s }[2], [x25], #0x4\n"
+      "ld1 { v25.s }[2], [x24], #0x4\n"
+      "ld1 { v22.s }[2], [x23], #0x4\n"
+      "tbz x27, #1, 60f\n"
+      "ld1 { v31.h }[6], [x26], #0x2\n"
+      "ld1 { v28.h }[6], [x25], #0x2\n"
+      "ld1 { v25.h }[6], [x24], #0x2\n"
+      "ld1 { v22.h }[6], [x23], #0x2\n"
+      "tbz x27, #0, 67f\n"
+      "ld1 { v31.b }[14], [x26]\n"
+      "ld1 { v28.b }[14], [x25]\n"
+      "ld1 { v25.b }[14], [x24]\n"
+      "ld1 { v22.b }[14], [x23]\n"
+      "b 67f\n"
+      "60:"  // Height 4: Multiply loop: Ragged operand read: partial_1_12
+      "tbz x27, #0, 67f\n"
+      "ld1 { v31.b }[12], [x26]\n"
+      "ld1 { v28.b }[12], [x25]\n"
+      "ld1 { v25.b }[12], [x24]\n"
+      "ld1 { v22.b }[12], [x23]\n"
+      "b 67f\n"
+      "61:"  // Height 4: Multiply loop: Ragged operand read: partial_2_8
+      "tbz x27, #1, 62f\n"
+      "ld1 { v31.h }[4], [x26], #0x2\n"
+      "ld1 { v28.h }[4], [x25], #0x2\n"
+      "ld1 { v25.h }[4], [x24], #0x2\n"
+      "ld1 { v22.h }[4], [x23], #0x2\n"
+      "tbz x27, #0, 67f\n"
+      "ld1 { v31.b }[10], [x26]\n"
+      "ld1 { v28.b }[10], [x25]\n"
+      "ld1 { v25.b }[10], [x24]\n"
+      "ld1 { v22.b }[10], [x23]\n"
+      "b 67f\n"
+      "62:"  // Height 4: Multiply loop: Ragged operand read: partial_1_8
+      "tbz x27, #0, 67f\n"
+      "ld1 { v31.b }[8], [x26]\n"
+      "ld1 { v28.b }[8], [x25]\n"
+      "ld1 { v25.b }[8], [x24]\n"
+      "ld1 { v22.b }[8], [x23]\n"
+      "b 67f\n"
+      "63:"  // Height 4: Multiply loop: Ragged operand read: partial_4_0
+      "tbz x27, #2, 65f\n"
+      "ldr s31, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
+      "ldr s25, [x24], #0x4\n"
+      "ldr s22, [x23], #0x4\n"
+      "tbz x27, #1, 64f\n"
+      "ld1 { v31.h }[2], [x26], #0x2\n"
+      "ld1 { v28.h }[2], [x25], #0x2\n"
+      "ld1 { v25.h }[2], [x24], #0x2\n"
+      "ld1 { v22.h }[2], [x23], #0x2\n"
+      "tbz x27, #0, 67f\n"
+      "ld1 { v31.b }[6], [x26]\n"
+      "ld1 { v28.b }[6], [x25]\n"
+      "ld1 { v25.b }[6], [x24]\n"
+      "ld1 { v22.b }[6], [x23]\n"
+      "b 67f\n"
+      "64:"  // Height 4: Multiply loop: Ragged operand read: partial_1_4
+      "tbz x27, #0, 67f\n"
+      "ld1 { v31.b }[4], [x26]\n"
+      "ld1 { v28.b }[4], [x25]\n"
+      "ld1 { v25.b }[4], [x24]\n"
+      "ld1 { v22.b }[4], [x23]\n"
+      "b 67f\n"
+      "65:"  // Height 4: Multiply loop: Ragged operand read: partial_2_0
+      "tbz x27, #1, 66f\n"
+      "ldr h31, [x26], #0x2\n"
+      "ldr h28, [x25], #0x2\n"
+      "ldr h25, [x24], #0x2\n"
+      "ldr h22, [x23], #0x2\n"
+      "tbz x27, #0, 67f\n"
+      "ld1 { v31.b }[2], [x26]\n"
+      "ld1 { v28.b }[2], [x25]\n"
+      "ld1 { v25.b }[2], [x24]\n"
+      "ld1 { v22.b }[2], [x23]\n"
+      "b 67f\n"
+      "66:"  // Height 4: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b31, [x26, #0x0]\n"
+      "ldr b28, [x25, #0x0]\n"
+      "ldr b25, [x24, #0x0]\n"
+      "ldr b22, [x23, #0x0]\n"
+      "67:"  // Height 4: Multiply loop: Ragged operand read: Done
+      "uadalp v1.8h, v31.16b\n"
+      "uadalp v30.8h, v28.16b\n"
+      "uadalp v27.8h, v25.16b\n"
+      "uadalp v24.8h, v22.16b\n"
+      "68:"  // Height 4: Multiply loop: No odd multiplies
+      "add x28, x28, #0x1\n"
+      "cmp x28, x20\n"
+      "bne 53b\n"
+      "uadalp v0.4s, v1.8h\n"
+      "uadalp v29.4s, v30.8h\n"
+      "addp v0.4s, v0.4s, v29.4s\n"
+      "uadalp v26.4s, v27.8h\n"
+      "uadalp v23.4s, v24.8h\n"
+      "addp v29.4s, v26.4s, v23.4s\n"
+      "addp v0.4s, v0.4s, v29.4s\n"
+      "mul v0.4s, v0.4s, v2.4s\n"
+      "st1 { v0.4s }, [%x[out_ptr]], #0x10\n"
+      "b 104f\n"
+      "69:"  // Height 5
+      "movi v1.8h, #0x0\n"
+      "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "mov x9, #0x0\n"
+      "movi v0.4s, #0x0\n"
+      "mov x28, #0x0\n"
+      "movi v30.8h, #0x0\n"
+      "movi v29.4s, #0x0\n"
+      "movi v27.8h, #0x0\n"
+      "movi v26.4s, #0x0\n"
+      "movi v24.8h, #0x0\n"
+      "movi v23.4s, #0x0\n"
+      "movi v21.8h, #0x0\n"
+      "movi v20.4s, #0x0\n"
+      "70:"  // Height 5: String loop
+      "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr w27, [x19, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 71f\n"
+      "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x19, x19, %x[input_offset], LSL #3\n"
+      "ldr x26, [x19, #0x0]\n"
+      "ldr x25, [x19, #0x8]\n"
+      "ldr x24, [x19, #0x10]\n"
+      "ldr x23, [x19, #0x18]\n"
+      "ldr x22, [x19, #0x20]\n"
+      "cbnz x28, 72f\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "add x25, x25, x19\n"
+      "add x24, x24, x19\n"
+      "add x23, x23, x19\n"
+      "add x22, x22, x19\n"
+      "b 72f\n"
+      "71:"  // Height 5: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "add x25, x26, %x[input_offset]\n"
+      "add x24, x25, %x[input_offset]\n"
+      "add x23, x24, %x[input_offset]\n"
+      "add x22, x23, %x[input_offset]\n"
+      "72:"  // Height 5: input setup done
+      "cmp x27, #0x10\n"
+      "blt 76f\n"
+      "cmp x27, #0x20\n"
+      "blt 75f\n"
+      "73:"  // Height 5: Multiply loop: Main loop head
+      "ldr q31, [x26, #0x0]\n"
+      "ldr q28, [x25, #0x0]\n"
+      "ldr q25, [x24, #0x0]\n"
+      "ldr q22, [x23, #0x0]\n"
+      "ldr q19, [x22, #0x0]\n"
+      "cmp x9, #0x7e\n"
+      "add x26, x26, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "add x24, x24, #0x10\n"
+      "add x23, x23, #0x10\n"
+      "add x22, x22, #0x10\n"
+      "blt 74f\n"
+      "uadalp v0.4s, v1.8h\n"
+      "movi v1.8h, #0x0\n"
+      "uadalp v29.4s, v30.8h\n"
+      "movi v30.8h, #0x0\n"
+      "uadalp v26.4s, v27.8h\n"
+      "movi v27.8h, #0x0\n"
+      "uadalp v23.4s, v24.8h\n"
+      "movi v24.8h, #0x0\n"
+      "uadalp v20.4s, v21.8h\n"
+      "movi v21.8h, #0x0\n"
+      "mov x9, #0x0\n"
+      "74:"  // Height 5: Multiply loop: unique 5: no collapse
+      "uadalp v1.8h, v31.16b\n"
+      "uadalp v30.8h, v28.16b\n"
+      "uadalp v27.8h, v25.16b\n"
+      "uadalp v24.8h, v22.16b\n"
+      "uadalp v21.8h, v19.16b\n"
+      "add x9, x9, #0x1\n"
+      "sub x27, x27, #0x10\n"
+      "cmp x27, #0x20\n"
+      "bge 73b\n"
+      "75:"  // Height 5: Multiply loop: Single iteration only
+      "sub x27, x27, #0x10\n"
+      "ldr q31, [x26, #0x0]\n"
+      "ldr q28, [x25, #0x0]\n"
+      "ldr q25, [x24, #0x0]\n"
+      "ldr q22, [x23, #0x0]\n"
+      "ldr q19, [x22, #0x0]\n"
+      "uadalp v1.8h, v31.16b\n"
+      "uadalp v30.8h, v28.16b\n"
+      "uadalp v27.8h, v25.16b\n"
+      "uadalp v24.8h, v22.16b\n"
+      "uadalp v21.8h, v19.16b\n"
+      "add x26, x26, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "add x24, x24, #0x10\n"
+      "add x23, x23, #0x10\n"
+      "add x22, x22, #0x10\n"
+      "76:"  // Height 5: Multiply loop: Main loop skip
+      "cbz x27, 85f\n"
+      "tbz x27, #3, 80f\n"
+      "ldr d31, [x26], #0x8\n"
+      "ldr d28, [x25], #0x8\n"
+      "ldr d25, [x24], #0x8\n"
+      "ldr d22, [x23], #0x8\n"
+      "ldr d19, [x22], #0x8\n"
+      "tbz x27, #2, 78f\n"
+      "ld1 { v31.s }[2], [x26], #0x4\n"
+      "ld1 { v28.s }[2], [x25], #0x4\n"
+      "ld1 { v25.s }[2], [x24], #0x4\n"
+      "ld1 { v22.s }[2], [x23], #0x4\n"
+      "ld1 { v19.s }[2], [x22], #0x4\n"
+      "tbz x27, #1, 77f\n"
+      "ld1 { v31.h }[6], [x26], #0x2\n"
+      "ld1 { v28.h }[6], [x25], #0x2\n"
+      "ld1 { v25.h }[6], [x24], #0x2\n"
+      "ld1 { v22.h }[6], [x23], #0x2\n"
+      "ld1 { v19.h }[6], [x22], #0x2\n"
+      "tbz x27, #0, 84f\n"
+      "ld1 { v31.b }[14], [x26]\n"
+      "ld1 { v28.b }[14], [x25]\n"
+      "ld1 { v25.b }[14], [x24]\n"
+      "ld1 { v22.b }[14], [x23]\n"
+      "ld1 { v19.b }[14], [x22]\n"
+      "b 84f\n"
+      "77:"  // Height 5: Multiply loop: Ragged operand read: partial_1_12
+      "tbz x27, #0, 84f\n"
+      "ld1 { v31.b }[12], [x26]\n"
+      "ld1 { v28.b }[12], [x25]\n"
+      "ld1 { v25.b }[12], [x24]\n"
+      "ld1 { v22.b }[12], [x23]\n"
+      "ld1 { v19.b }[12], [x22]\n"
+      "b 84f\n"
+      "78:"  // Height 5: Multiply loop: Ragged operand read: partial_2_8
+      "tbz x27, #1, 79f\n"
+      "ld1 { v31.h }[4], [x26], #0x2\n"
+      "ld1 { v28.h }[4], [x25], #0x2\n"
+      "ld1 { v25.h }[4], [x24], #0x2\n"
+      "ld1 { v22.h }[4], [x23], #0x2\n"
+      "ld1 { v19.h }[4], [x22], #0x2\n"
+      "tbz x27, #0, 84f\n"
+      "ld1 { v31.b }[10], [x26]\n"
+      "ld1 { v28.b }[10], [x25]\n"
+      "ld1 { v25.b }[10], [x24]\n"
+      "ld1 { v22.b }[10], [x23]\n"
+      "ld1 { v19.b }[10], [x22]\n"
+      "b 84f\n"
+      "79:"  // Height 5: Multiply loop: Ragged operand read: partial_1_8
+      "tbz x27, #0, 84f\n"
+      "ld1 { v31.b }[8], [x26]\n"
+      "ld1 { v28.b }[8], [x25]\n"
+      "ld1 { v25.b }[8], [x24]\n"
+      "ld1 { v22.b }[8], [x23]\n"
+      "ld1 { v19.b }[8], [x22]\n"
+      "b 84f\n"
+      "80:"  // Height 5: Multiply loop: Ragged operand read: partial_4_0
+      "tbz x27, #2, 82f\n"
+      "ldr s31, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
+      "ldr s25, [x24], #0x4\n"
+      "ldr s22, [x23], #0x4\n"
+      "ldr s19, [x22], #0x4\n"
+      "tbz x27, #1, 81f\n"
+      "ld1 { v31.h }[2], [x26], #0x2\n"
+      "ld1 { v28.h }[2], [x25], #0x2\n"
+      "ld1 { v25.h }[2], [x24], #0x2\n"
+      "ld1 { v22.h }[2], [x23], #0x2\n"
+      "ld1 { v19.h }[2], [x22], #0x2\n"
+      "tbz x27, #0, 84f\n"
+      "ld1 { v31.b }[6], [x26]\n"
+      "ld1 { v28.b }[6], [x25]\n"
+      "ld1 { v25.b }[6], [x24]\n"
+      "ld1 { v22.b }[6], [x23]\n"
+      "ld1 { v19.b }[6], [x22]\n"
+      "b 84f\n"
+      "81:"  // Height 5: Multiply loop: Ragged operand read: partial_1_4
+      "tbz x27, #0, 84f\n"
+      "ld1 { v31.b }[4], [x26]\n"
+      "ld1 { v28.b }[4], [x25]\n"
+      "ld1 { v25.b }[4], [x24]\n"
+      "ld1 { v22.b }[4], [x23]\n"
+      "ld1 { v19.b }[4], [x22]\n"
+      "b 84f\n"
+      "82:"  // Height 5: Multiply loop: Ragged operand read: partial_2_0
+      "tbz x27, #1, 83f\n"
+      "ldr h31, [x26], #0x2\n"
+      "ldr h28, [x25], #0x2\n"
+      "ldr h25, [x24], #0x2\n"
+      "ldr h22, [x23], #0x2\n"
+      "ldr h19, [x22], #0x2\n"
+      "tbz x27, #0, 84f\n"
+      "ld1 { v31.b }[2], [x26]\n"
+      "ld1 { v28.b }[2], [x25]\n"
+      "ld1 { v25.b }[2], [x24]\n"
+      "ld1 { v22.b }[2], [x23]\n"
+      "ld1 { v19.b }[2], [x22]\n"
+      "b 84f\n"
+      "83:"  // Height 5: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b31, [x26, #0x0]\n"
+      "ldr b28, [x25, #0x0]\n"
+      "ldr b25, [x24, #0x0]\n"
+      "ldr b22, [x23, #0x0]\n"
+      "ldr b19, [x22, #0x0]\n"
+      "84:"  // Height 5: Multiply loop: Ragged operand read: Done
+      "uadalp v1.8h, v31.16b\n"
+      "uadalp v30.8h, v28.16b\n"
+      "uadalp v27.8h, v25.16b\n"
+      "uadalp v24.8h, v22.16b\n"
+      "uadalp v21.8h, v19.16b\n"
+      "85:"  // Height 5: Multiply loop: No odd multiplies
+      "add x28, x28, #0x1\n"
+      "cmp x28, x20\n"
+      "bne 70b\n"
+      "uadalp v0.4s, v1.8h\n"
+      "uadalp v29.4s, v30.8h\n"
+      "addp v0.4s, v0.4s, v29.4s\n"
+      "uadalp v26.4s, v27.8h\n"
+      "uadalp v23.4s, v24.8h\n"
+      "addp v29.4s, v26.4s, v23.4s\n"
+      "uadalp v20.4s, v21.8h\n"
+      "addp v0.4s, v0.4s, v29.4s\n"
+      "addp v20.4s, v20.4s, v20.4s\n"
+      "mul v0.4s, v0.4s, v2.4s\n"
+      "st1 { v0.4s }, [%x[out_ptr]], #0x10\n"
+      "addp v20.4s, v20.4s, v20.4s\n"
+      "mul v20.4s, v20.4s, v2.4s\n"
+      "str s20, [%x[out_ptr]], #0x4\n"
+      "b 104f\n"
+      "86:"  // Height 6
+      "movi v1.8h, #0x0\n"
+      "ldr w21, [%x[args_ptr], %[offsetof_num_strings]]\n"
+      "mov x9, #0x0\n"
+      "movi v0.4s, #0x0\n"
+      "mov x28, #0x0\n"
+      "movi v30.8h, #0x0\n"
+      "movi v29.4s, #0x0\n"
+      "movi v27.8h, #0x0\n"
+      "movi v26.4s, #0x0\n"
+      "movi v24.8h, #0x0\n"
+      "movi v23.4s, #0x0\n"
+      "movi v21.8h, #0x0\n"
+      "movi v20.4s, #0x0\n"
+      "movi v18.8h, #0x0\n"
+      "movi v17.4s, #0x0\n"
+      "87:"  // Height 6: String loop
+      "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr w27, [x19, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 88f\n"
+      "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x19, x19, %x[input_offset], LSL #3\n"
+      "ldr x26, [x19, #0x0]\n"
+      "ldr x25, [x19, #0x8]\n"
+      "ldr x24, [x19, #0x10]\n"
+      "ldr x23, [x19, #0x18]\n"
+      "ldr x22, [x19, #0x20]\n"
+      "ldr x20, [x19, #0x28]\n"
+      "cbnz x28, 89f\n"
+      "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "add x25, x25, x19\n"
+      "add x24, x24, x19\n"
+      "add x23, x23, x19\n"
+      "add x22, x22, x19\n"
+      "add x20, x20, x19\n"
+      "b 89f\n"
+      "88:"  // Height 6: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "add x25, x26, %x[input_offset]\n"
+      "add x24, x25, %x[input_offset]\n"
+      "add x23, x24, %x[input_offset]\n"
+      "add x22, x23, %x[input_offset]\n"
+      "add x20, x22, %x[input_offset]\n"
+      "89:"  // Height 6: input setup done
+      "cmp x27, #0x10\n"
+      "blt 93f\n"
+      "cmp x27, #0x20\n"
+      "blt 92f\n"
+      "90:"  // Height 6: Multiply loop: Main loop head
+      "ldr q31, [x26, #0x0]\n"
+      "ldr q28, [x25, #0x0]\n"
+      "ldr q25, [x24, #0x0]\n"
+      "ldr q22, [x23, #0x0]\n"
+      "ldr q19, [x22, #0x0]\n"
+      "ldr q16, [x20, #0x0]\n"
+      "cmp x9, #0x7e\n"
+      "add x26, x26, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "add x24, x24, #0x10\n"
+      "add x23, x23, #0x10\n"
+      "add x22, x22, #0x10\n"
+      "add x20, x20, #0x10\n"
+      "blt 91f\n"
+      "uadalp v0.4s, v1.8h\n"
+      "movi v1.8h, #0x0\n"
+      "uadalp v29.4s, v30.8h\n"
+      "movi v30.8h, #0x0\n"
+      "uadalp v26.4s, v27.8h\n"
+      "movi v27.8h, #0x0\n"
+      "uadalp v23.4s, v24.8h\n"
+      "movi v24.8h, #0x0\n"
+      "uadalp v20.4s, v21.8h\n"
+      "movi v21.8h, #0x0\n"
+      "uadalp v17.4s, v18.8h\n"
+      "movi v18.8h, #0x0\n"
+      "mov x9, #0x0\n"
+      "91:"  // Height 6: Multiply loop: unique 6: no collapse
+      "uadalp v1.8h, v31.16b\n"
+      "uadalp v30.8h, v28.16b\n"
+      "uadalp v27.8h, v25.16b\n"
+      "uadalp v24.8h, v22.16b\n"
+      "uadalp v21.8h, v19.16b\n"
+      "uadalp v18.8h, v16.16b\n"
+      "add x9, x9, #0x1\n"
+      "sub x27, x27, #0x10\n"
+      "cmp x27, #0x20\n"
+      "bge 90b\n"
+      "92:"  // Height 6: Multiply loop: Single iteration only
+      "sub x27, x27, #0x10\n"
+      "ldr q31, [x26, #0x0]\n"
+      "ldr q28, [x25, #0x0]\n"
+      "ldr q25, [x24, #0x0]\n"
+      "ldr q22, [x23, #0x0]\n"
+      "ldr q19, [x22, #0x0]\n"
+      "ldr q16, [x20, #0x0]\n"
+      "uadalp v1.8h, v31.16b\n"
+      "uadalp v30.8h, v28.16b\n"
+      "uadalp v27.8h, v25.16b\n"
+      "uadalp v24.8h, v22.16b\n"
+      "uadalp v21.8h, v19.16b\n"
+      "uadalp v18.8h, v16.16b\n"
+      "add x26, x26, #0x10\n"
+      "add x25, x25, #0x10\n"
+      "add x24, x24, #0x10\n"
+      "add x23, x23, #0x10\n"
+      "add x22, x22, #0x10\n"
+      "add x20, x20, #0x10\n"
+      "93:"  // Height 6: Multiply loop: Main loop skip
+      "cbz x27, 102f\n"
+      "tbz x27, #3, 97f\n"
+      "ldr d31, [x26], #0x8\n"
+      "ldr d28, [x25], #0x8\n"
+      "ldr d25, [x24], #0x8\n"
+      "ldr d22, [x23], #0x8\n"
+      "ldr d19, [x22], #0x8\n"
+      "ldr d16, [x20], #0x8\n"
+      "tbz x27, #2, 95f\n"
+      "ld1 { v31.s }[2], [x26], #0x4\n"
+      "ld1 { v28.s }[2], [x25], #0x4\n"
+      "ld1 { v25.s }[2], [x24], #0x4\n"
+      "ld1 { v22.s }[2], [x23], #0x4\n"
+      "ld1 { v19.s }[2], [x22], #0x4\n"
+      "ld1 { v16.s }[2], [x20], #0x4\n"
+      "tbz x27, #1, 94f\n"
+      "ld1 { v31.h }[6], [x26], #0x2\n"
+      "ld1 { v28.h }[6], [x25], #0x2\n"
+      "ld1 { v25.h }[6], [x24], #0x2\n"
+      "ld1 { v22.h }[6], [x23], #0x2\n"
+      "ld1 { v19.h }[6], [x22], #0x2\n"
+      "ld1 { v16.h }[6], [x20], #0x2\n"
+      "tbz x27, #0, 101f\n"
+      "ld1 { v31.b }[14], [x26]\n"
+      "ld1 { v28.b }[14], [x25]\n"
+      "ld1 { v25.b }[14], [x24]\n"
+      "ld1 { v22.b }[14], [x23]\n"
+      "ld1 { v19.b }[14], [x22]\n"
+      "ld1 { v16.b }[14], [x20]\n"
+      "b 101f\n"
+      "94:"  // Height 6: Multiply loop: Ragged operand read: partial_1_12
+      "tbz x27, #0, 101f\n"
+      "ld1 { v31.b }[12], [x26]\n"
+      "ld1 { v28.b }[12], [x25]\n"
+      "ld1 { v25.b }[12], [x24]\n"
+      "ld1 { v22.b }[12], [x23]\n"
+      "ld1 { v19.b }[12], [x22]\n"
+      "ld1 { v16.b }[12], [x20]\n"
+      "b 101f\n"
+      "95:"  // Height 6: Multiply loop: Ragged operand read: partial_2_8
+      "tbz x27, #1, 96f\n"
+      "ld1 { v31.h }[4], [x26], #0x2\n"
+      "ld1 { v28.h }[4], [x25], #0x2\n"
+      "ld1 { v25.h }[4], [x24], #0x2\n"
+      "ld1 { v22.h }[4], [x23], #0x2\n"
+      "ld1 { v19.h }[4], [x22], #0x2\n"
+      "ld1 { v16.h }[4], [x20], #0x2\n"
+      "tbz x27, #0, 101f\n"
+      "ld1 { v31.b }[10], [x26]\n"
+      "ld1 { v28.b }[10], [x25]\n"
+      "ld1 { v25.b }[10], [x24]\n"
+      "ld1 { v22.b }[10], [x23]\n"
+      "ld1 { v19.b }[10], [x22]\n"
+      "ld1 { v16.b }[10], [x20]\n"
+      "b 101f\n"
+      "96:"  // Height 6: Multiply loop: Ragged operand read: partial_1_8
+      "tbz x27, #0, 101f\n"
+      "ld1 { v31.b }[8], [x26]\n"
+      "ld1 { v28.b }[8], [x25]\n"
+      "ld1 { v25.b }[8], [x24]\n"
+      "ld1 { v22.b }[8], [x23]\n"
+      "ld1 { v19.b }[8], [x22]\n"
+      "ld1 { v16.b }[8], [x20]\n"
+      "b 101f\n"
+      "97:"  // Height 6: Multiply loop: Ragged operand read: partial_4_0
+      "tbz x27, #2, 99f\n"
+      "ldr s31, [x26], #0x4\n"
+      "ldr s28, [x25], #0x4\n"
+      "ldr s25, [x24], #0x4\n"
+      "ldr s22, [x23], #0x4\n"
+      "ldr s19, [x22], #0x4\n"
+      "ldr s16, [x20], #0x4\n"
+      "tbz x27, #1, 98f\n"
+      "ld1 { v31.h }[2], [x26], #0x2\n"
+      "ld1 { v28.h }[2], [x25], #0x2\n"
+      "ld1 { v25.h }[2], [x24], #0x2\n"
+      "ld1 { v22.h }[2], [x23], #0x2\n"
+      "ld1 { v19.h }[2], [x22], #0x2\n"
+      "ld1 { v16.h }[2], [x20], #0x2\n"
+      "tbz x27, #0, 101f\n"
+      "ld1 { v31.b }[6], [x26]\n"
+      "ld1 { v28.b }[6], [x25]\n"
+      "ld1 { v25.b }[6], [x24]\n"
+      "ld1 { v22.b }[6], [x23]\n"
+      "ld1 { v19.b }[6], [x22]\n"
+      "ld1 { v16.b }[6], [x20]\n"
+      "b 101f\n"
+      "98:"  // Height 6: Multiply loop: Ragged operand read: partial_1_4
+      "tbz x27, #0, 101f\n"
+      "ld1 { v31.b }[4], [x26]\n"
+      "ld1 { v28.b }[4], [x25]\n"
+      "ld1 { v25.b }[4], [x24]\n"
+      "ld1 { v22.b }[4], [x23]\n"
+      "ld1 { v19.b }[4], [x22]\n"
+      "ld1 { v16.b }[4], [x20]\n"
+      "b 101f\n"
+      "99:"  // Height 6: Multiply loop: Ragged operand read: partial_2_0
+      "tbz x27, #1, 100f\n"
+      "ldr h31, [x26], #0x2\n"
+      "ldr h28, [x25], #0x2\n"
+      "ldr h25, [x24], #0x2\n"
+      "ldr h22, [x23], #0x2\n"
+      "ldr h19, [x22], #0x2\n"
+      "ldr h16, [x20], #0x2\n"
+      "tbz x27, #0, 101f\n"
+      "ld1 { v31.b }[2], [x26]\n"
+      "ld1 { v28.b }[2], [x25]\n"
+      "ld1 { v25.b }[2], [x24]\n"
+      "ld1 { v22.b }[2], [x23]\n"
+      "ld1 { v19.b }[2], [x22]\n"
+      "ld1 { v16.b }[2], [x20]\n"
+      "b 101f\n"
+      "100:"  // Height 6: Multiply loop: Ragged operand read: partial_1_0
+      "ldr b31, [x26, #0x0]\n"
+      "ldr b28, [x25, #0x0]\n"
+      "ldr b25, [x24, #0x0]\n"
+      "ldr b22, [x23, #0x0]\n"
+      "ldr b19, [x22, #0x0]\n"
+      "ldr b16, [x20, #0x0]\n"
+      "101:"  // Height 6: Multiply loop: Ragged operand read: Done
+      "uadalp v1.8h, v31.16b\n"
+      "uadalp v30.8h, v28.16b\n"
+      "uadalp v27.8h, v25.16b\n"
+      "uadalp v24.8h, v22.16b\n"
+      "uadalp v21.8h, v19.16b\n"
+      "uadalp v18.8h, v16.16b\n"
+      "102:"  // Height 6: Multiply loop: No odd multiplies
+      "add x28, x28, #0x1\n"
+      "cmp x28, x21\n"
+      "bne 87b\n"
+      "uadalp v0.4s, v1.8h\n"
+      "uadalp v29.4s, v30.8h\n"
+      "addp v0.4s, v0.4s, v29.4s\n"
+      "uadalp v26.4s, v27.8h\n"
+      "uadalp v23.4s, v24.8h\n"
+      "addp v29.4s, v26.4s, v23.4s\n"
+      "uadalp v20.4s, v21.8h\n"
+      "uadalp v17.4s, v18.8h\n"
+      "addp v0.4s, v0.4s, v29.4s\n"
+      "subs %x[M], %x[M], #0x6\n"
+      "addp v20.4s, v20.4s, v17.4s\n"
+      "mul v0.4s, v0.4s, v2.4s\n"
+      "st1 { v0.4s }, [%x[out_ptr]], #0x10\n"
+      "addp v20.4s, v20.4s, v20.4s\n"
+      "mul v20.4s, v20.4s, v2.4s\n"
+      "str d20, [%x[out_ptr]], #0x8\n"
+      "beq 104f\n"
+      "tbz %x[flags], #3, 103f\n"
+      "add %x[input_offset], %x[input_offset], #0x6\n"
+      "b 1b\n"
+      "103:"  // Update direct input
+      "mov x19, #0x6\n"
+      "madd %x[input_ptr], x19, %x[input_offset], %x[input_ptr]\n"
+      "b 1b\n"
+      "104:"  // Exit
+
+      : [M] "+r" (M), [input_offset] "+r" (input_offset), [input_ptr] "+r" (input_ptr), [out_ptr] "+r" (out_ptr)
+      : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [flags] "r" (flags), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [qp] "r" (qp)
+      : "cc", "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+    );
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp b/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp
index 1d3aee7911..4669be9993 100644
--- a/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp
@@ -23,8 +23,10 @@
  */
 #pragma once
 
+#include "convolver.hpp"
 #include "mergeresults.hpp"
 #include "transform.hpp"
+#include "interleave_indirect.hpp"
 
 namespace arm_gemm {
 
@@ -39,14 +41,26 @@ namespace arm_gemm {
  * The optional 'block' parameter is for kernels using dot-product type
  * instructions like UDOT and SDOT.
  */
-template<typename TOperand, typename TResult, unsigned int height, unsigned int width, unsigned int block=1>
+template<typename TOperand, typename TResult, unsigned int height, unsigned int width, unsigned int block=1, bool integrate_sums=false>
 class StdTransformsFixed
 {
 public:
     template<typename TIn>
     void PrepareA(TOperand *out, const TIn *in, const int stride, const int y0,
-                  const int ymax, const int k0, const int kmax) const {
-        Transform<height, block, false>(out, in, stride, y0, ymax, k0, kmax);
+                  const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) const {
+        Interleave<height, block, VLType::None>(out, in, stride, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier);
+    }
+
+    template<typename TIn> // Indirect-input counterpart of PrepareA: forwards every argument to IndirectInterleave<height, block, VLType::None>.
+    void PrepareA_indirect(TOperand *out, const TIn * const * const *ptr, size_t stringlen, size_t rounded_stringlen, const int y0,
+                           const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) const { // const like PrepareA: the body only forwards its arguments
+        IndirectInterleave<height, block, VLType::None>(out, ptr, stringlen, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier); // integrate_sums is this class's template flag
+    }
+
+    template<typename TIn> // Convolution-input counterpart of PrepareA: forwards every argument, including the convolver, to ConvolutionInterleave<height, block, VLType::None>.
+    void PrepareA_convolution(TOperand *out, const TIn *ptr, size_t stride, const convolver<TIn> &conv, size_t rounded_stringlen,
+                              const int y0, const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) const { // const like PrepareA: the body only forwards its arguments
+        ConvolutionInterleave<height, block, VLType::None>(out, ptr, stride, conv, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier); // integrate_sums is this class's template flag
     }
 
     template<typename TIn>
diff --git a/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp b/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp
index 13c4c477c6..3256d919ea 100644
--- a/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp
+++ b/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp
@@ -23,6 +23,8 @@
  */
 #pragma once
 
+#include "convolver.hpp"
 #include "mergeresults.hpp"
 #include "transform.hpp"
+#include "interleave_indirect.hpp"
 
@@ -38,20 +39,32 @@ namespace arm_gemm {
  * The optional 'block' parameter is for kernels using dot-product type
  * instructions like UDOT and SDOT.
  */
-template<typename TOperand, typename TResult, unsigned int height, unsigned int width_vectors, unsigned int block=1, unsigned int mmla=1>
+template<typename TOperand, typename TResult, unsigned int height, unsigned int width_vectors, unsigned int block=1, unsigned int mmla=1, bool integrate_sums=false>
 class StdTransformsSVE
 {
 public:
     template<typename TIn>
     void PrepareA(TOperand *out, const TIn *in, const int stride, const int y0,
-                  const int ymax, const int k0, const int kmax) {
-        Transform<height, block, false>(out, in, stride, y0, ymax, k0, kmax);
+                  const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) {
+        Interleave<height, block, VLType::None>(out, in, stride, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier);
+    }
+
+    template<typename TIn> // Indirect-input counterpart of PrepareA: forwards every argument to IndirectInterleave<height, block, VLType::None>.
+    void PrepareA_indirect(TOperand *out, const TIn * const * const *ptr, size_t stringlen, size_t rounded_stringlen, const int y0,
+                           const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) {
+        IndirectInterleave<height, block, VLType::None>(out, ptr, stringlen, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier); // NOTE(review): VLType::None here while PrepareB uses VLType::SVE - presumably intentional, confirm
+    }
+
+    template<typename TIn> // Convolution-input counterpart of PrepareA: forwards every argument, including the convolver, to ConvolutionInterleave<height, block, VLType::None>.
+    void PrepareA_convolution(TOperand *out, const TIn *ptr, size_t stride, const convolver<TIn> &conv, size_t rounded_stringlen,
+                              const int y0, const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) {
+        ConvolutionInterleave<height, block, VLType::None>(out, ptr, stride, conv, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier); // integrate_sums is this class's template flag
     }
 
     template<typename TIn>
     void PrepareB(TOperand *out, const TIn *in, const int stride, const int x0,
                   const int xmax, const int k0, const int kmax) {
-        Transform<width_vectors, block,  true, true>(out, in, stride, x0, xmax, k0, kmax);
+        Transform<width_vectors, block,  true, VLType::SVE>(out, in, stride, x0, xmax, k0, kmax);
     }
 
     template<typename TOut>
diff --git a/src/core/NEON/kernels/arm_gemm/transform.hpp b/src/core/NEON/kernels/arm_gemm/transform.hpp
index c6ea079882..5efeee5d35 100644
--- a/src/core/NEON/kernels/arm_gemm/transform.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transform.hpp
@@ -38,13 +38,13 @@ namespace arm_gemm {
  * Need to cope with the work requested in either dimension not actually
  * being a multiple of the block sizes.
  */
-template <unsigned int tIntBy, unsigned int BlockBy, bool Transposed, size_t TOutSize, size_t TInSize, bool sve>
+template <unsigned int tIntBy, unsigned int BlockBy, bool Transposed, size_t TOutSize, size_t TInSize, VLType vlt>
 struct TransformImpl {
     template <typename TOut, typename TIn>
     static void Transform(TOut* out, const TIn* const in, const int stride,
                           const int y0, const int ymax, const int x0, const int xmax) {
         // For SVE cases we multiply the interleave factor by the vector length.
-        const unsigned int IntBy = tIntBy * (sve ? get_vector_length<TOut>() / BlockBy : 1);
+        const unsigned int IntBy = tIntBy * (vlt == VLType::SVE ? get_vector_length<TOut>() / BlockBy : 1);
 
         const int n_whole_y_blocks = (ymax - y0) / IntBy;
         const int y_remainders = (ymax - y0) % IntBy;
@@ -105,13 +105,13 @@ struct TransformImpl {
 };
 
 /*****************************************************************************/
-template <unsigned int IntBy, unsigned int BlockBy, bool Transposed, bool sve=false, typename TOut, typename TIn>
+template <unsigned int IntBy, unsigned int BlockBy, bool Transposed, VLType vlt=VLType::None, typename TOut, typename TIn>
 void Transform(
   TOut* out, const TIn* const in, const int stride,
   const int k0, const int kmax, const int x0, const int xmax
 ) {
   // Redirect to a specialised implementation predicated on argument size.
-  TransformImpl<IntBy, BlockBy, Transposed, sizeof(TOut), sizeof(TIn), sve>::Transform(
+  TransformImpl<IntBy, BlockBy, Transposed, sizeof(TOut), sizeof(TIn), vlt>::Transform(
     out, in, stride, k0, kmax, x0, xmax
   );
 }
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
deleted file mode 100644
index 2df5d1bd28..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __arm__
-
-#include <arm_neon.h>
-
-#include "../asmlib.hpp"
-
-template<>
-template<typename T>
-inline void TransformImpl<6, 1, false, 4, 4, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
-    uint32_t *outptr = reinterpret_cast<uint32_t *>(out);
-    const uint32_t *inptr = reinterpret_cast<const uint32_t *>(in);
-    bool first = true;
-
-    uint32_t zerobuff[16] = { 0 }; // 8 for asm loop plus up to 7 for overflow loop
-
-    for (int y=y0; y<ymax; y+=6) {
-        const uint32_t *inptr0 = inptr + y * ldin + k0;
-        const uint32_t *inptr1 = inptr0 + ldin;
-        const uint32_t *inptr2 = inptr1 + ldin;
-        const uint32_t *inptr3 = inptr2 + ldin;
-        const uint32_t *inptr4 = inptr3 + ldin;
-        const uint32_t *inptr5 = inptr4 + ldin;
-
-        //prefetch_2x(inptr0);
-        //prefetch_2x(inptr1);
-        //prefetch_2x(inptr2);
-        //prefetch_2x(inptr3);
-        //prefetch_2x(inptr4);
-        //prefetch_2x(inptr5);
-
-        int x=(kmax-k0);
-        for (;(x>7) || first;x-=8) {
-            /* Cope with ragged cases by copying from a buffer of zeroes instead */
-            /* 'first' forces this to always run at least once, needed if the total size is <=7. */
-            if ((y + 5) >= ymax) {
-                switch ((y + 5) - ymax) {
-                    case 4:
-                        inptr1 = zerobuff;
-                        // fall through
-                    case 3:
-                        inptr2 = zerobuff;
-                        // fall through
-                    case 2:
-                        inptr3 = zerobuff;
-                        // fall through
-                    case 1:
-                        inptr4 = zerobuff;
-                        // fall through
-                    case 0:
-                        inptr5 = zerobuff;
-                        break;
-
-                    default:
-                        UNREACHABLE("Impossible.");
-                }
-            }
-
-            if (first) {
-                if (x<=7) {
-                    break;
-                }
-
-                first = false;
-            }
-
-            __asm __volatile (
-                // Load up 8 elements (2 vectors) from each of 8 sources.
-                "VLD1.32	{d0-d3}, [%[inptr0]]!\n"   // q0=A0A1A2A3
-                "VLD1.32	{d4-d7}, [%[inptr1]]!\n"   // q2=B0B1B2B3
-                "VLD1.32	{d8-d11}, [%[inptr2]]!\n"  // q4=C0C1C2C3
-                "VZIP.32	q0, q4\n"     // q0=A0C0A1C1, q4 = A2C2A3C3
-                "VLD1.32	{d12-d15}, [%[inptr3]]!\n" // q6=D0D1D2D3
-                "VZIP.32	q2, q6\n"     // q2=B0D0B1D1, q6 = B2D2B3D3
-                "VLD1.32	{d16-d19}, [%[inptr4]]!\n"
-                "VLD1.32	{d20-d23}, [%[inptr5]]!\n"
-                "VZIP.32	q8, q10\n"    // q8=E0F0E1F1, q10 = E2F2E3F3
-                ASM_PREFETCH("[%[inptr0], #128]")
-                "VZIP.32	q0, q2\n"    // q0 = A0B0C0D0, q2 = A1B1C1D1
-
-                // Store first elements
-                "VST1.32	{d0-d1}, [%[outptr]]!\n"
-                "VST1.32	{d16}, [%[outptr]]!\n"
-
-                "VZIP.32	q4, q6\n"    // q4 = A2B2C2D2, q6 = A3B3C3D3
-
-                // Store second elements
-                "VST1.32	{d4-d5}, [%[outptr]]!\n"
-                "VZIP.32	q1, q5\n"
-                ASM_PREFETCH("[%[inptr1], #128]")
-                "VST1.32	{d17}, [%[outptr]]!\n"
-                "VZIP.32	q3, q7\n"
-
-                // Store third elements
-                "VZIP.32	q9, q11\n"
-                "VST1.32	{d8-d9}, [%[outptr]]!\n"
-                "VZIP.32	q1, q3\n"
-                ASM_PREFETCH("[%[inptr2], #128]")
-                "VST1.32	{d20}, [%[outptr]]!\n"
-
-                // Store fourth elements
-                "VZIP.32	q5, q7\n"
-                "VST1.32	{d12-d13}, [%[outptr]]!\n"
-                ASM_PREFETCH("[%[inptr3], #128]")
-                "VST1.32	{d21}, [%[outptr]]!\n"
-
-                // Fifth
-                "VST1.32	{d2-d3}, [%[outptr]]!\n"
-                ASM_PREFETCH("[%[inptr4], #128]")
-                "VST1.32	{d18}, [%[outptr]]!\n"
-
-                // Sixth
-                "VST1.32	{d6-d7}, [%[outptr]]!\n"
-                ASM_PREFETCH("[%[inptr5], #128]")
-                "VST1.32	{d19}, [%[outptr]]!\n"
-
-                // Seventh
-                "VST1.32	{d10-d11}, [%[outptr]]!\n"
-                "VST1.32	{d22}, [%[outptr]]!\n"
-
-                // Eighth
-                "VST1.32	{d14-d15}, [%[outptr]]!\n"
-                "VST1.32	{d23}, [%[outptr]]!\n"
-
-                : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
-                  [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [outptr] "+r" (outptr)
-                :
-                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "memory"
-            );
-        }
-
-        for (;x>0;x--) {
-            *outptr++ = *inptr0++;
-            *outptr++ = *inptr1++;
-            *outptr++ = *inptr2++;
-            *outptr++ = *inptr3++;
-            *outptr++ = *inptr4++;
-            *outptr++ = *inptr5++;
-        }
-    }
-}
-
-#endif  // __arm__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
index 8f0b8ae63f..3ce1d328a7 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
@@ -30,22 +30,22 @@
 // Generic unblocked transposed 8x32-bit sized specialisation
 template <>
 template <typename T>
-inline void TransformImpl<8, 1, true, 4, 4, false>::Transform(
+inline void TransformImpl<8, 1, true, 4, 4, VLType::None>::Transform(
     T* out, const T* const in, const int stride,
     const int x0, const int xmax, const int k0, const int kmax
 ) {
   // Redirect to a 16x uint16_t specialisation
-  TransformImpl<16, 1, true, 2, 2, false>::Transform(
+  TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
     reinterpret_cast<uint16_t *>(out),
     reinterpret_cast<const uint16_t *>(in),
     stride*2, x0*2, xmax*2, k0, kmax
   );
 }
 
-// Generic 12x16-bit sized specialisation
+// Generic 16x16-bit sized specialisation
 template <>
 template <typename T>
-inline void TransformImpl<16, 1, true, 2, 2, false>::Transform(
+inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
     T* out, const T* const in, const int stride,
     const int x0, const int xmax, const int k0, const int kmax
 ) {
@@ -117,7 +117,7 @@ inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(con
 
 template <>
 template <>
-inline void TransformImpl<16, 1, true, 2, 2, false>::Transform(
+inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
     uint16_t* out, const uint16_t* const in, const int stride,
     const int x0, const int xmax, const int k0, const int kmax
 ) {
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp
deleted file mode 100644
index 9b6f4de543..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include <arm_neon.h>
-
-#include "../asmlib.hpp"
-#include "../utils.hpp"
-
-template<>
-template<typename T>
-void TransformImpl<4, 16, false, 1, 1, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
-    uint8_t *outptr = (uint8_t *)out;
-    const uint8_t *inptr = (uint8_t *)in;
-
-    uint8_t zerobuff[16] = { 0 };
-
-    for (int y=y0; y<ymax; y+=4) {
-        const uint8_t *inptr0 = inptr + static_cast<intptr_t>(y) * ldin + k0;
-        const uint8_t *inptr1 = inptr0 + ldin;
-        const uint8_t *inptr2 = inptr1 + ldin;
-        const uint8_t *inptr3 = inptr2 + ldin;
-
-        prefetch_2x(inptr0);
-        prefetch_2x(inptr1);
-        prefetch_2x(inptr2);
-        prefetch_2x(inptr3);
-
-        int x=(kmax-k0);
-        for (;x>15;x-=16) {
-            /* Cope with ragged cases by copying from a buffer of zeroes instead */
-            if ((y + 3) >= ymax) {
-                switch ((y + 3) - ymax) {
-                    case 2:
-                        inptr1 = zerobuff;
-                        // fall through
-                    case 1:
-                        inptr2 = zerobuff;
-                        // fall through
-                    case 0:
-                        inptr3 = zerobuff;
-                        break;
-
-                    default:
-                        UNREACHABLE("Impossible.");
-                }
-            }
-
-            __asm __volatile (
-                "LDR	q0, [%[inptr0]], #16\n"
-                ASM_PREFETCH("[%[inptr0], #176]")
-                "LDR	q1, [%[inptr1]], #16\n"
-                ASM_PREFETCH("[%[inptr1], #176]")
-                "STP	q0, q1, [%[outptr]], #32\n"
-                "LDR	q0, [%[inptr2]], #16\n"
-                ASM_PREFETCH("[%[inptr2], #176]")
-                "LDR	q1, [%[inptr3]], #16\n"
-                ASM_PREFETCH("[%[inptr3], #176]")
-                "STP	q0, q1, [%[outptr]], #32\n"
-                : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
-                  [outptr] "+r" (outptr)
-                :
-                : "v0", "v1"
-            );
-        }
-
-        if (x>0) {
-            /* Need to duplicate this here, in case we didn't run the main loop. */
-            if ((y + 3) >= ymax) {
-                switch ((y + 3) - ymax) {
-                    case 2:
-                        inptr1 = zerobuff;
-                        // fall through
-                    case 1:
-                        inptr2 = zerobuff;
-                        // fall through
-                    case 0:
-                        inptr3 = zerobuff;
-                        break;
-
-                    default:
-                        UNREACHABLE("Impossible.");
-                }
-            }
-
-            /* We have to write out 16 values, copy as many legal values as there are and pad with 0 */
-            auto f = [&outptr, x](const uint8_t *&p) {
-                for (int i=0; i<16; i++) {
-                    if (i < x) {
-                        *outptr++ = *p++;
-                    } else {
-                        *outptr++ = 0;
-                    }
-                }
-            };
-
-            f(inptr0);
-            f(inptr1);
-            f(inptr2);
-            f(inptr3);
-        }
-    }
-}
-
-#endif  // __aarch64__
-\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp
deleted file mode 100644
index 3d912c4675..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include <arm_neon.h>
-
-#include "../asmlib.hpp"
-
-template<>
-template<typename T>
-void TransformImpl<8, 1, false, 2, 2, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
-    uint16_t *outptr = (uint16_t *)out;
-    const uint16_t *inptr = (const uint16_t *)in;
-    bool first=true;
-
-    uint16_t zerobuff[16] = { 0 }; // 8 for asm loop plus up to 7 for overflow loop
-
-    for (int y=y0; y<ymax; y+=8) {
-        const uint16_t *inptr0 = inptr + y * ldin + k0;
-        const uint16_t *inptr1 = inptr0 + ldin;
-        const uint16_t *inptr2 = inptr1 + ldin;
-        const uint16_t *inptr3 = inptr2 + ldin;
-        const uint16_t *inptr4 = inptr3 + ldin;
-        const uint16_t *inptr5 = inptr4 + ldin;
-        const uint16_t *inptr6 = inptr5 + ldin;
-        const uint16_t *inptr7 = inptr6 + ldin;
-
-        prefetch_2x(inptr0);
-        prefetch_2x(inptr1);
-        prefetch_2x(inptr2);
-        prefetch_2x(inptr3);
-        prefetch_2x(inptr4);
-        prefetch_2x(inptr5);
-        prefetch_2x(inptr6);
-        prefetch_2x(inptr7);
-
-        int x=(kmax-k0);
-        for (;(x>7) || first;x-=8) {
-            /* Cope with ragged cases by copying from a buffer of zeroes instead */
-            /* 'first' forces this to always run at least once, needed if the total size is <=7. */
-            if ((y + 7) >= ymax) {
-                switch ((y + 7) - ymax) {
-                    case 6:
-                        inptr1 = zerobuff;
-                        // fall through
-                    case 5:
-                        inptr2 = zerobuff;
-                        // fall through
-                    case 4:
-                        inptr3 = zerobuff;
-                        // fall through
-                    case 3:
-                        inptr4 = zerobuff;
-                        // fall through
-                    case 2:
-                        inptr5 = zerobuff;
-                        // fall through
-                    case 1:
-                        inptr6 = zerobuff;
-                        // fall through
-                    case 0:
-                        inptr7 = zerobuff;
-                        break;
-
-                    default:
-                        UNREACHABLE("Impossible.");
-                }
-            }
-
-            if (first) {
-                if (x <= 7) {
-                    break;
-                }
-
-                first = false;
-            }
-
-            int skippf = (x & 31);
-            __asm __volatile (
-                // Load up 8 elements (1 vector) from each of 8 sources.
-                "CBNZ	%w[skippf], 1f\n"
-                ASM_PREFETCH("[%[inptr0], #128]")
-                ASM_PREFETCH("[%[inptr1], #128]")
-                ASM_PREFETCH("[%[inptr2], #128]")
-                ASM_PREFETCH("[%[inptr3], #128]")
-                "1:\n"
-
-                "LDR	q0, [%[inptr0]], #16\n" // q0=A0A1A2A3A4A5A6A7
-                "LDR	q4, [%[inptr4]], #16\n" // q8=E0E1E2E3E4E5E6E7
-                "LDR	q2, [%[inptr2]], #16\n" // q4=C0C1C2C3...
-                "LDR	q6, [%[inptr6]], #16\n"
-                "ZIP1	v8.8h, v0.8h, v4.8h\n"  // q8=A0E0A1E1A2E2A3E3
-                "ZIP2	v16.8h, v0.8h, v4.8h\n" // q16=A4E4A5E5A6E6A7E7
-                "ZIP1	v9.8h, v2.8h, v6.8h\n"  // q9=C0G0C1G1C2G2C3G3
-                "ZIP2	v17.8h, v2.8h, v6.8h\n" // q17=C4G4C5G5C6G6C7G7
-                "LDR	q1, [%[inptr1]], #16\n" // q1=B0B1B2B3B4B5B6B7
-                "LDR	q5, [%[inptr5]], #16\n"
-                "LDR	q3, [%[inptr3]], #16\n" // q3=D0D1D2D3....
-                "LDR	q7, [%[inptr7]], #16\n"
-                "ZIP1	v10.8h, v1.8h, v5.8h\n" // q18=B0F0B1F1B2F2B3F3
-                "ZIP2	v18.8h, v1.8h, v5.8h\n" // q18=B4F4B5F5B6F6B7F7
-                "ZIP1	v11.8h, v3.8h, v7.8h\n" // q19=D0H0D1H1D2H2D3H3
-                "ZIP2	v19.8h, v3.8h, v7.8h\n" // q19=D4H4D5H5D6H6D7H7
-
-                "ZIP1	v12.8h,  v8.8h,  v9.8h\n" // q20=A0C0E0G0A1C1E1G1
-                "ZIP2	v20.8h,  v8.8h,  v9.8h\n"
-                "ZIP1	v13.8h, v10.8h, v11.8h\n" // q21=B0D0F0H0B1I1F1H1
-                "ZIP2	v21.8h, v10.8h, v11.8h\n"
-
-                "CBNZ	%w[skippf], 2f\n"
-                ASM_PREFETCH("[%[inptr4], #112]")
-                ASM_PREFETCH("[%[inptr5], #112]")
-                ASM_PREFETCH("[%[inptr6], #112]")
-                ASM_PREFETCH("[%[inptr7], #112]")
-                "2:\n"
-
-                "ZIP1	v22.8h, v16.8h, v17.8h\n"
-                "ZIP2	v30.8h, v16.8h, v17.8h\n"
-                "ZIP1	v23.8h, v18.8h, v19.8h\n"
-                "ZIP2	v31.8h, v18.8h, v19.8h\n"
-
-                "ZIP1	v14.8h, v12.8h, v13.8h\n" // q22=A0B0C0D0E0F0G0H0
-                "ZIP2	v15.8h, v12.8h, v13.8h\n" // q23=A1B1C1D1E1F1G1H1
-                "STP	q14, q15, [%[outptr]], #32\n" // Write back first two elements
-
-                "ZIP1	v0.8h, v20.8h, v21.8h\n"
-                "ZIP2	v1.8h, v20.8h, v21.8h\n"
-                "STP	q0, q1, [%[outptr]], #32\n" // Write back next two elements
-
-                "ZIP1	v2.8h, v22.8h, v23.8h\n"
-                "ZIP2	v3.8h, v22.8h, v23.8h\n"
-                "STP	q2, q3, [%[outptr]], #32\n" // Write back next two elements
-
-                "ZIP1	v4.8h, v30.8h, v31.8h\n"
-                "ZIP2	v5.8h, v30.8h, v31.8h\n"
-                "STP	q4, q5, [%[outptr]], #32\n" // Write back last two elements
-                : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
-                  [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
-                : [skippf] "r" (skippf)
-                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
-                  "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
-                  "v25", "v26", "v27", "v28", "v29", "v30", "v31", "memory"
-            );
-        }
-
-        for (;x>0;x--) {
-            *outptr++ = *inptr0++;
-            *outptr++ = *inptr1++;
-            *outptr++ = *inptr2++;
-            *outptr++ = *inptr3++;
-            *outptr++ = *inptr4++;
-            *outptr++ = *inptr5++;
-            *outptr++ = *inptr6++;
-            *outptr++ = *inptr7++;
-        }
-    }
-}
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
deleted file mode 100644
index 701d688af2..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
-
-#include <arm_neon.h>
-
-#include "../asmlib.hpp"
-
-template<>
-template<typename T>
-inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
-    uint32_t *outptr = (uint32_t *)out;
-    const uint32_t *inptr = (uint32_t *)in;
-    bool first = true;
-
-    uint32_t zerobuff[16] = { 0 }; // 8 for asm loop plus up to 7 for overflow loop
-
-    for (int y=y0; y<ymax; y+=8) {
-        const uint32_t *inptr0 = inptr + y * ldin + k0;
-        const uint32_t *inptr1 = inptr0 + ldin;
-        const uint32_t *inptr2 = inptr1 + ldin;
-        const uint32_t *inptr3 = inptr2 + ldin;
-        const uint32_t *inptr4 = inptr3 + ldin;
-        const uint32_t *inptr5 = inptr4 + ldin;
-        const uint32_t *inptr6 = inptr5 + ldin;
-        const uint32_t *inptr7 = inptr6 + ldin;
-
-        prefetch_2x(inptr0);
-        prefetch_2x(inptr1);
-        prefetch_2x(inptr2);
-        prefetch_2x(inptr3);
-        prefetch_2x(inptr4);
-        prefetch_2x(inptr5);
-        prefetch_2x(inptr6);
-        prefetch_2x(inptr7);
-
-        int x=(kmax-k0);
-        for (;(x>7) || first;x-=8) {
-            /* Cope with ragged cases by copying from a buffer of zeroes instead */
-            /* 'first' forces this to always run at least once, needed if the total size is <=7. */
-            if ((y + 7) >= ymax) {
-                switch ((y + 7) - ymax) {
-                    case 6:
-                        inptr1 = zerobuff;
-                        // fall through
-                    case 5:
-                        inptr2 = zerobuff;
-                        // fall through
-                    case 4:
-                        inptr3 = zerobuff;
-                        // fall through
-                    case 3:
-                        inptr4 = zerobuff;
-                        // fall through
-                    case 2:
-                        inptr5 = zerobuff;
-                        // fall through
-                    case 1:
-                        inptr6 = zerobuff;
-                        // fall through
-                    case 0:
-                        inptr7 = zerobuff;
-                        break;
-
-                    default:
-                        UNREACHABLE("Impossible.");
-                }
-            }
-
-            if (first) {
-                if (x<=7) {
-                    break;
-                }
-
-                first = false;
-            }
-
-            __asm __volatile (
-                // Load up 8 elements (2 vectors) from each of 8 sources.
-                "LDP        q0, q1, [%[inptr0]], #32\n" // q0=A0A1A2A3
-                "LDP        q2, q3, [%[inptr1]], #32\n" // q2=B0B1B2B3
-                "LDP        q4, q5, [%[inptr2]], #32\n" // q4=C0C1C2C3
-                "ZIP1       v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1
-                ASM_PREFETCH("[%[inptr0], #128]")
-                "LDP        q6, q7, [%[inptr3]], #32\n" // q6=D0D1D2D3
-                "ZIP1       v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1
-                "LDP        q8, q9, [%[inptr4]], #32\n"
-                "LDP        q10, q11, [%[inptr5]], #32\n"
-                "LDP        q12, q13, [%[inptr6]], #32\n"
-                "ZIP1       v18.4s, v8.4s, v12.4s\n"
-                ASM_PREFETCH("[%[inptr1], #128]")
-                "LDP        q14, q15, [%[inptr7]], #32\n"
-                "ZIP1       v19.4s, v10.4s, v14.4s\n"
-
-                "ZIP1       v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0
-                ASM_PREFETCH("[%[inptr2], #128]")
-                "ZIP1       v21.4s, v18.4s, v19.4s\n"
-                "ZIP2       v22.4s, v16.4s, v17.4s\n"
-                "ZIP2       v23.4s, v18.4s, v19.4s\n"
-
-                "ZIP2       v16.4s, v0.4s, v4.4s\n"
-                ASM_PREFETCH("[%[inptr3], #128]")
-                "ZIP2       v17.4s, v2.4s, v6.4s\n"
-                "STP        q20, q21, [%[outptr]], #32\n" // Write back the first element of each source
-
-                "ZIP2       v18.4s, v8.4s, v12.4s\n"
-                "ZIP2       v19.4s, v10.4s, v14.4s\n"
-                "STP        q22, q23, [%[outptr]], #32\n" // Write back the second element of each source
-
-                "ZIP1       v20.4s, v16.4s, v17.4s\n"
-                ASM_PREFETCH("[%[inptr4], #128]")
-                "ZIP1       v21.4s, v18.4s, v19.4s\n"
-                "ZIP2       v22.4s, v16.4s, v17.4s\n"
-                "ZIP2       v23.4s, v18.4s, v19.4s\n"
-
-                "ZIP1       v16.4s, v1.4s, v5.4s\n"
-                ASM_PREFETCH("[%[inptr5], #128]")
-                "ZIP1       v17.4s, v3.4s, v7.4s\n"
-                "STP        q20, q21, [%[outptr]], #32\n" // Third element
-
-                "ZIP1       v18.4s, v9.4s, v13.4s\n"
-                "ZIP1       v19.4s, v11.4s, v15.4s\n"
-                "STP        q22, q23, [%[outptr]], #32\n" // Fourth element
-
-                "ZIP1       v20.4s, v16.4s, v17.4s\n"
-                "ZIP1       v21.4s, v18.4s, v19.4s\n"
-                "ZIP2       v22.4s, v16.4s, v17.4s\n"
-                ASM_PREFETCH("[%[inptr6], #128]")
-                "ZIP2       v23.4s, v18.4s, v19.4s\n"
-
-                "ZIP2       v16.4s, v1.4s, v5.4s\n"
-                "ZIP2       v17.4s, v3.4s, v7.4s\n"
-                "STP        q20, q21, [%[outptr]], #32\n" // Fifth element
-
-                "ZIP2       v18.4s, v9.4s, v13.4s\n"
-                ASM_PREFETCH("[%[inptr7], #128]")
-                "ZIP2       v19.4s, v11.4s, v15.4s\n"
-                "STP        q22, q23, [%[outptr]], #32\n" // Sixth element
-
-                "ZIP1       v20.4s, v16.4s, v17.4s\n"
-                "ZIP1       v21.4s, v18.4s, v19.4s\n"
-                "STP        q20, q21, [%[outptr]], #32\n" // Seventh element
-
-                "ZIP2       v22.4s, v16.4s, v17.4s\n"
-                "ZIP2       v23.4s, v18.4s, v19.4s\n"
-                "STP        q22, q23, [%[outptr]], #32\n" // Eighth element
-                : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
-                  [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
-                :
-                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
-                  "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "memory"
-            );
-        }
-
-        for (;x>0;x--) {
-            *outptr++ = *inptr0++;
-            *outptr++ = *inptr1++;
-            *outptr++ = *inptr2++;
-            *outptr++ = *inptr3++;
-            *outptr++ = *inptr4++;
-            *outptr++ = *inptr5++;
-            *outptr++ = *inptr6++;
-            *outptr++ = *inptr7++;
-        }
-    }
-}
-
-#endif  // __aarch64__ && !__ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_block4_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_block4_8bit.hpp
deleted file mode 100644
index 2546cc571a..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_block4_8bit.hpp
+++ /dev/null
@@ -1,228 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
-
-#include <arm_neon.h>
-
-#include "../asmlib.hpp"
-
-template<>
-template<typename T>
-inline void TransformImpl<8, 4, false, 1, 1, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
-    uint8_t *outptr = reinterpret_cast<uint8_t *>(out);
-    const uint8_t *inptr = reinterpret_cast<const uint8_t *>(in);
-    bool first = true;
-
-    /* Helper functions to copy blocks about used for odd case. */
-    class t {
-    public:
-        static inline void copy_4_inc(uint8_t *&out, const uint8_t *&in) {
-            uint32_t *out_word = reinterpret_cast<uint32_t *>(out);
-            const uint32_t *in_word = reinterpret_cast<const uint32_t *>(in);
-
-            *out_word++ = *in_word++;
-
-            out = reinterpret_cast<uint8_t *>(out_word);
-            in = reinterpret_cast<const uint8_t *>(in_word);
-        }
-
-        static inline void copy_pad(uint8_t *&out, const uint8_t *&in, size_t count) {
-            for (unsigned int i=0; i<4; i++) {
-                if (i < count) {
-                    *out++ = *in++;
-                } else {
-                    *out++ = 0;
-                }
-            }
-        }
-    };
-
-    uint8_t zerobuff[64] = { 0 }; // 32 for asm loop plus up to 31 for overflow loop
-
-    for (int y=y0; y<ymax; y+=8) {
-        const uint8_t *inptr0 = inptr + y * ldin + k0;
-        const uint8_t *inptr1 = inptr0 + ldin;
-        const uint8_t *inptr2 = inptr1 + ldin;
-        const uint8_t *inptr3 = inptr2 + ldin;
-        const uint8_t *inptr4 = inptr3 + ldin;
-        const uint8_t *inptr5 = inptr4 + ldin;
-        const uint8_t *inptr6 = inptr5 + ldin;
-        const uint8_t *inptr7 = inptr6 + ldin;
-
-        prefetch_2x(inptr0);
-        prefetch_2x(inptr1);
-        prefetch_2x(inptr2);
-        prefetch_2x(inptr3);
-        prefetch_2x(inptr4);
-        prefetch_2x(inptr5);
-        prefetch_2x(inptr6);
-        prefetch_2x(inptr7);
-
-        int x=(kmax-k0);
-        for (;(x>31) || first;x-=32) {
-            /* Cope with ragged cases by copying from a buffer of zeroes instead */
-            /* 'first' forces this to always run at least once, needed if the total size is <=32. */
-            if ((y + 7) >= ymax) {
-                switch ((y + 7) - ymax) {
-                    case 6:
-                        inptr1 = zerobuff;
-                        // fall through
-                    case 5:
-                        inptr2 = zerobuff;
-                        // fall through
-                    case 4:
-                        inptr3 = zerobuff;
-                        // fall through
-                    case 3:
-                        inptr4 = zerobuff;
-                        // fall through
-                    case 2:
-                        inptr5 = zerobuff;
-                        // fall through
-                    case 1:
-                        inptr6 = zerobuff;
-                        // fall through
-                    case 0:
-                        inptr7 = zerobuff;
-                        break;
-
-                    default:
-                        UNREACHABLE("Impossible.");
-                }
-            }
-
-            if (first) {
-                if (x<=31) {
-                    break;
-                }
-
-                first = false;
-            }
-
-            __asm __volatile (
-                // Load up 8 elements (2 vectors) from each of 8 sources.
-                "LDP        q0, q1, [%[inptr0]], #32\n" // q0=A0A1A2A3
-                "LDP        q2, q3, [%[inptr1]], #32\n" // q2=B0B1B2B3
-                "LDP        q4, q5, [%[inptr2]], #32\n" // q4=C0C1C2C3
-                "ZIP1       v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1
-                ASM_PREFETCH("[%[inptr0], #128]")
-                "LDP        q6, q7, [%[inptr3]], #32\n" // q6=D0D1D2D3
-                "ZIP1       v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1
-                "LDP        q8, q9, [%[inptr4]], #32\n"
-                "LDP        q10, q11, [%[inptr5]], #32\n"
-                "LDP        q12, q13, [%[inptr6]], #32\n"
-                "ZIP1       v18.4s, v8.4s, v12.4s\n"
-                ASM_PREFETCH("[%[inptr1], #128]")
-                "LDP        q14, q15, [%[inptr7]], #32\n"
-                "ZIP1       v19.4s, v10.4s, v14.4s\n"
-
-                "ZIP1       v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0
-                ASM_PREFETCH("[%[inptr2], #128]")
-                "ZIP1       v21.4s, v18.4s, v19.4s\n"
-                "ZIP2       v22.4s, v16.4s, v17.4s\n"
-                "ZIP2       v23.4s, v18.4s, v19.4s\n"
-
-                "ZIP2       v16.4s, v0.4s, v4.4s\n"
-                ASM_PREFETCH("[%[inptr3], #128]")
-                "ZIP2       v17.4s, v2.4s, v6.4s\n"
-                "STP        q20, q21, [%[outptr]], #32\n" // Write back the first element of each source
-
-                "ZIP2       v18.4s, v8.4s, v12.4s\n"
-                "ZIP2       v19.4s, v10.4s, v14.4s\n"
-                "STP        q22, q23, [%[outptr]], #32\n" // Write back the second element of each source
-
-                "ZIP1       v20.4s, v16.4s, v17.4s\n"
-                ASM_PREFETCH("[%[inptr4], #128]")
-                "ZIP1       v21.4s, v18.4s, v19.4s\n"
-                "ZIP2       v22.4s, v16.4s, v17.4s\n"
-                "ZIP2       v23.4s, v18.4s, v19.4s\n"
-
-                "ZIP1       v16.4s, v1.4s, v5.4s\n"
-                ASM_PREFETCH("[%[inptr5], #128]")
-                "ZIP1       v17.4s, v3.4s, v7.4s\n"
-                "STP        q20, q21, [%[outptr]], #32\n" // Third element
-
-                "ZIP1       v18.4s, v9.4s, v13.4s\n"
-                "ZIP1       v19.4s, v11.4s, v15.4s\n"
-                "STP        q22, q23, [%[outptr]], #32\n" // Fourth element
-
-                "ZIP1       v20.4s, v16.4s, v17.4s\n"
-                "ZIP1       v21.4s, v18.4s, v19.4s\n"
-                "ZIP2       v22.4s, v16.4s, v17.4s\n"
-                ASM_PREFETCH("[%[inptr6], #128]")
-                "ZIP2       v23.4s, v18.4s, v19.4s\n"
-
-                "ZIP2       v16.4s, v1.4s, v5.4s\n"
-                "ZIP2       v17.4s, v3.4s, v7.4s\n"
-                "STP        q20, q21, [%[outptr]], #32\n" // Fifth element
-
-                "ZIP2       v18.4s, v9.4s, v13.4s\n"
-                ASM_PREFETCH("[%[inptr7], #128]")
-                "ZIP2       v19.4s, v11.4s, v15.4s\n"
-                "STP        q22, q23, [%[outptr]], #32\n" // Sixth element
-
-                "ZIP1       v20.4s, v16.4s, v17.4s\n"
-                "ZIP1       v21.4s, v18.4s, v19.4s\n"
-                "STP        q20, q21, [%[outptr]], #32\n" // Seventh element
-
-                "ZIP2       v22.4s, v16.4s, v17.4s\n"
-                "ZIP2       v23.4s, v18.4s, v19.4s\n"
-                "STP        q22, q23, [%[outptr]], #32\n" // Eighth element
-                : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
-                  [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
-                :
-                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
-                  "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "memory"
-            );
-        }
-
-        // Copy any leftover blocks of 4 a complete block at a time.
-        for (;x>4;x-=4) {
-            t::copy_4_inc(outptr, inptr0);
-            t::copy_4_inc(outptr, inptr1);
-            t::copy_4_inc(outptr, inptr2);
-            t::copy_4_inc(outptr, inptr3);
-            t::copy_4_inc(outptr, inptr4);
-            t::copy_4_inc(outptr, inptr5);
-            t::copy_4_inc(outptr, inptr6);
-            t::copy_4_inc(outptr, inptr7);
-        }
-
-        // Final block with padding, if any.
-        if (x > 0) {
-            t::copy_pad(outptr, inptr0, x);
-            t::copy_pad(outptr, inptr1, x);
-            t::copy_pad(outptr, inptr2, x);
-            t::copy_pad(outptr, inptr3, x);
-            t::copy_pad(outptr, inptr4, x);
-            t::copy_pad(outptr, inptr5, x);
-            t::copy_pad(outptr, inptr6, x);
-            t::copy_pad(outptr, inptr7, x);
-        }
-    }
-}
-
-#endif  // __aarch64__ && !__ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp
deleted file mode 100644
index a342d6c3d1..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp
+++ /dev/null
@@ -1,207 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#if defined(__aarch64__) && defined(__ARM_FP16_ARGS)
-
-#include <arm_neon.h>
-
-#include "../asmlib.hpp"
-
-template<>
-template<>
-inline void TransformImpl<8, 1, false, 4, 2, false>::Transform(float *out, const __fp16 *in, int ldin, int y0, int ymax, int k0, int kmax) {
-    float *outptr = out;
-    const __fp16 *inptr = in;
-    bool first = true;
-
-    __fp16 zerobuff[16] = { 0 }; // 8 for asm loop plus up to 7 for overflow loop
-
-    for (int y=y0; y<ymax; y+=8) {
-        const __fp16 *inptr0 = inptr + y * ldin + k0;
-        const __fp16 *inptr1 = inptr0 + ldin;
-        const __fp16 *inptr2 = inptr1 + ldin;
-        const __fp16 *inptr3 = inptr2 + ldin;
-        const __fp16 *inptr4 = inptr3 + ldin;
-        const __fp16 *inptr5 = inptr4 + ldin;
-        const __fp16 *inptr6 = inptr5 + ldin;
-        const __fp16 *inptr7 = inptr6 + ldin;
-
-        prefetch_2x(inptr0);
-        prefetch_2x(inptr1);
-        prefetch_2x(inptr2);
-        prefetch_2x(inptr3);
-        prefetch_2x(inptr4);
-        prefetch_2x(inptr5);
-        prefetch_2x(inptr6);
-        prefetch_2x(inptr7);
-
-        int x=(kmax-k0);
-        for (;(x>7) || first;x-=8) {
-            /* Cope with ragged cases by copying from a buffer of zeroes instead */
-            /* 'first' forces this to always run at least once, needed if the total size is <=7. */
-            if ((y + 7) >= ymax) {
-                switch ((y + 7) - ymax) {
-                    case 6:
-                        inptr1 = zerobuff;
-                        // fall through
-                    case 5:
-                        inptr2 = zerobuff;
-                        // fall through
-                    case 4:
-                        inptr3 = zerobuff;
-                        // fall through
-                    case 3:
-                        inptr4 = zerobuff;
-                        // fall through
-                    case 2:
-                        inptr5 = zerobuff;
-                        // fall through
-                    case 1:
-                        inptr6 = zerobuff;
-                        // fall through
-                    case 0:
-                        inptr7 = zerobuff;
-                        break;
-
-                    default:
-                        UNREACHABLE("Impossible.");
-                }
-            }
-
-            if (first) {
-                if (x<=7) {
-                    break;
-                }
-
-                first = false;
-            }
-
-            __asm __volatile (
-                // Load up 8 elements (2 vectors) from each of 8 sources.
-                "LDR	q0, [%[inptr0]], #16\n"
-                "LDR	q2, [%[inptr1]], #16\n"
-                "FCVTL2	v1.4s, v0.8h\n"
-                "FCVTL	v0.4s, v0.4h\n"
-                "LDR	q4, [%[inptr2]], #16\n" // q4=C0C1C2C3
-                "FCVTL2	v3.4s, v2.8h\n"
-                "FCVTL	v2.4s, v2.4h\n"
-                "FCVTL2	v5.4s, v4.8h\n"
-                "FCVTL	v4.4s, v4.4h\n"
-                "ZIP1	v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1
-                ASM_PREFETCH("[%[inptr0], #128]")
-                "LDR	q6, [%[inptr3]], #16\n" // q6=D0D1D2D3
-                "FCVTL2	v7.4s, v6.8h\n"
-                "FCVTL	v6.4s, v6.4h\n"
-                "ZIP1	v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1
-                "LDR	q8, [%[inptr4]], #16\n"
-                "LDR	q10, [%[inptr5]], #16\n"
-                "FCVTL2	v9.4s, v8.8h\n"
-                "FCVTL	v8.4s, v8.4h\n"
-                ASM_PREFETCH("[%[inptr1], #128]")
-                "LDR	q12, [%[inptr6]], #16\n"
-                "FCVTL2	v11.4s, v10.8h\n"
-                "FCVTL	v10.4s, v10.4h\n"
-                "FCVTL2	v13.4s, v12.8h\n"
-                "FCVTL	v12.4s, v12.4h\n"
-                "ZIP1	v18.4s, v8.4s, v12.4s\n"
-                "LDR	q14, [%[inptr7]], #16\n"
-                "FCVTL2	v15.4s, v14.8h\n"
-                "FCVTL	v14.4s, v14.4h\n"
-                "ZIP1	v19.4s, v10.4s, v14.4s\n"
-
-                ASM_PREFETCH("[%[inptr2], #128]")
-                "ZIP1	v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0
-                "ZIP1	v21.4s, v18.4s, v19.4s\n"
-                "ZIP2	v22.4s, v16.4s, v17.4s\n"
-                "ZIP2	v23.4s, v18.4s, v19.4s\n"
-                ASM_PREFETCH("[%[inptr3], #128]")
-
-                "ZIP2	v16.4s, v0.4s, v4.4s\n"
-                "ZIP2	v17.4s, v2.4s, v6.4s\n"
-                "STP	q20, q21, [%[outptr]], #32\n" // Write back the first element of each source
-
-                "ZIP2	v18.4s, v8.4s, v12.4s\n"
-                ASM_PREFETCH("[%[inptr4], #128]")
-                "ZIP2	v19.4s, v10.4s, v14.4s\n"
-                "STP	q22, q23, [%[outptr]], #32\n" // Write back the second element of each source
-
-                "ZIP1	v20.4s, v16.4s, v17.4s\n"
-                "ZIP1	v21.4s, v18.4s, v19.4s\n"
-                ASM_PREFETCH("[%[inptr5], #128]")
-                "ZIP2	v22.4s, v16.4s, v17.4s\n"
-                "ZIP2	v23.4s, v18.4s, v19.4s\n"
-
-                "ZIP1	v16.4s, v1.4s, v5.4s\n"
-                "ZIP1	v17.4s, v3.4s, v7.4s\n"
-                ASM_PREFETCH("[%[inptr6], #128]")
-                "STP	q20, q21, [%[outptr]], #32\n" // Third element
-
-                "ZIP1	v18.4s, v9.4s, v13.4s\n"
-                "ZIP1	v19.4s, v11.4s, v15.4s\n"
-                "STP	q22, q23, [%[outptr]], #32\n" // Fourth element
-                ASM_PREFETCH("[%[inptr7], #128]")
-
-                "ZIP1	v20.4s, v16.4s, v17.4s\n"
-                "ZIP1	v21.4s, v18.4s, v19.4s\n"
-                "ZIP2	v22.4s, v16.4s, v17.4s\n"
-                "ZIP2	v23.4s, v18.4s, v19.4s\n"
-
-                "ZIP2	v16.4s, v1.4s, v5.4s\n"
-                "ZIP2	v17.4s, v3.4s, v7.4s\n"
-                "STP	q20, q21, [%[outptr]], #32\n" // Fifth element
-
-                "ZIP2	v18.4s, v9.4s, v13.4s\n"
-                "ZIP2	v19.4s, v11.4s, v15.4s\n"
-                "STP	q22, q23, [%[outptr]], #32\n" // Sixth element
-
-                "ZIP1	v20.4s, v16.4s, v17.4s\n"
-                "ZIP1	v21.4s, v18.4s, v19.4s\n"
-                "STP	q20, q21, [%[outptr]], #32\n" // Seventh element
-
-                "ZIP2	v22.4s, v16.4s, v17.4s\n"
-                "ZIP2	v23.4s, v18.4s, v19.4s\n"
-                "STP	q22, q23, [%[outptr]], #32\n" // Eighth element
-                : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
-                  [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
-                :
-                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
-                  "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "memory"
-            );
-        }
-
-        for (;x>0;x--) {
-            *outptr++ = *inptr0++;
-            *outptr++ = *inptr1++;
-            *outptr++ = *inptr2++;
-            *outptr++ = *inptr3++;
-            *outptr++ = *inptr4++;
-            *outptr++ = *inptr5++;
-            *outptr++ = *inptr6++;
-            *outptr++ = *inptr7++;
-        }
-    }
-}
-
-#endif // __aarch64__ && __ARM_FP16_ARGS
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_s8_to_s16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_s8_to_s16.hpp
deleted file mode 100644
index 37344a82a9..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_s8_to_s16.hpp
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#if defined(__aarch64__) && defined(__ARM_FP16_ARGS)
-
-#include <arm_neon.h>
-#include <cstdint>
-
-#include "../asmlib.hpp"
-
-template<>
-template<>
-inline void TransformImpl<8, 1, false, 2, 1, false>::Transform(int16_t *out, const int8_t *in, int ldin, int y0, int ymax, int k0, int kmax) {
-    int16_t *outptr = out;
-    const int8_t *inptr = in;
-    bool first = true;
-
-    int8_t zerobuff[32] = { 0 }; // 16 for asm loop plus up to 15 for overflow loop
-
-    for (int y=y0; y<ymax; y+=8) {
-        const int8_t *inptr0 = inptr + y * ldin + k0;
-        const int8_t *inptr1 = inptr0 + ldin;
-        const int8_t *inptr2 = inptr1 + ldin;
-        const int8_t *inptr3 = inptr2 + ldin;
-        const int8_t *inptr4 = inptr3 + ldin;
-        const int8_t *inptr5 = inptr4 + ldin;
-        const int8_t *inptr6 = inptr5 + ldin;
-        const int8_t *inptr7 = inptr6 + ldin;
-
-        prefetch_2x(inptr0);
-        prefetch_2x(inptr1);
-        prefetch_2x(inptr2);
-        prefetch_2x(inptr3);
-        prefetch_2x(inptr4);
-        prefetch_2x(inptr5);
-        prefetch_2x(inptr6);
-        prefetch_2x(inptr7);
-
-        int x=(kmax-k0);
-        for (;(x>15) || first;x-=16) {
-            /* Cope with ragged cases by copying from a buffer of zeroes instead */
-            /* 'first' forces this to always run at least once, needed if the total size is <=15. */
-            if ((y + 7) >= ymax) {
-                switch ((y + 7) - ymax) {
-                    case 6:
-                        inptr1 = zerobuff;
-                        // fall through
-                    case 5:
-                        inptr2 = zerobuff;
-                        // fall through
-                    case 4:
-                        inptr3 = zerobuff;
-                        // fall through
-                    case 3:
-                        inptr4 = zerobuff;
-                        // fall through
-                    case 2:
-                        inptr5 = zerobuff;
-                        // fall through
-                    case 1:
-                        inptr6 = zerobuff;
-                        // fall through
-                    case 0:
-                        inptr7 = zerobuff;
-                        break;
-
-                    default:
-                        UNREACHABLE("Impossible.");
-                }
-            }
-
-            if (first) {
-                if (x<=15) {
-                    break;
-                }
-
-                first = false;
-            }
-
-            __asm __volatile (
-                // Load up 16 elements (1 source vector, 2 destination vectors) from each of 8 sources.
-                "LDR	q0, [%[inptr0]], #16\n"
-                "LDR	q2, [%[inptr1]], #16\n"
-                "SSHLL2 v1.8h, v0.16b, #0\n"
-                "SSHLL  v0.8h, v0.8b, #0\n"
-                "LDR	q4, [%[inptr2]], #16\n" // q4=C0C1C2C3
-                "SSHLL2 v3.8h, v2.16b, #0\n"
-                "SSHLL  v2.8h, v2.8b, #0\n"
-                "SSHLL2 v5.8h, v4.16b, #0\n"
-                "SSHLL  v4.8h, v4.8b, #0\n"
-                "ZIP1	v16.8h, v0.8h, v4.8h\n" // q16=A0C0A1C1
-                ASM_PREFETCH("[%[inptr0], #128]")
-                "LDR	q6, [%[inptr3]], #16\n" // q6=D0D1D2D3
-                "SSHLL2 v7.8h, v6.16b, #0\n"
-                "SSHLL  v6.8h, v6.8b, #0\n"
-                "ZIP1	v17.8h, v2.8h, v6.8h\n" // q17=B0D0B1D1
-                "LDR	q8, [%[inptr4]], #16\n"
-                "LDR	q10, [%[inptr5]], #16\n"
-                "SSHLL2 v9.8h, v8.16b, #0\n"
-                "SSHLL  v8.8h, v8.8b, #0\n"
-                ASM_PREFETCH("[%[inptr1], #128]")
-                "LDR	q12, [%[inptr6]], #16\n"
-                "SSHLL2 v11.8h, v10.16b, #0\n"
-                "SSHLL  v10.8h, v10.8b, #0\n"
-                "SSHLL2 v13.8h, v12.16b, #0\n"
-                "SSHLL  v12.8h, v12.8b, #0\n"
-                "ZIP1	v18.8h, v8.8h, v12.8h\n"
-                "LDR	q14, [%[inptr7]], #16\n"
-                "SSHLL2 v15.8h, v14.16b, #0\n"
-                "SSHLL  v14.8h, v14.8b, #0\n"
-                "ZIP1	v19.8h, v10.8h, v14.8h\n"
-
-                ASM_PREFETCH("[%[inptr2], #128]")
-                "ZIP1	v20.8h, v16.8h, v17.8h\n" // q20=A0B0C0D0A1B1C1D1
-                "ZIP1	v21.8h, v18.8h, v19.8h\n" // q21=E0F0G0H0E1F1G1H1
-                "ZIP2	v22.8h, v16.8h, v17.8h\n" // q22=A2B2C2D2A3B3C3D3
-                "ZIP2	v23.8h, v18.8h, v19.8h\n" // q23=E2F2G2H2E3F3G3H3
-                ASM_PREFETCH("[%[inptr3], #128]")
-
-                "ZIP2	v16.8h, v0.8h, v4.8h\n"
-                "ZIP2	v17.8h, v2.8h, v6.8h\n"
-                "TRN1	v24.2d, v20.2d, v21.2d\n"
-                "TRN2	v25.2d, v20.2d, v21.2d\n"
-
-                "ZIP2	v18.8h, v8.8h, v12.8h\n"
-                ASM_PREFETCH("[%[inptr4], #128]")
-                "ZIP2	v19.8h, v10.8h, v14.8h\n"
-                "STP	q24, q25, [%[outptr]], #32\n" // Write back the first element of each source
-                "TRN1	v24.2d, v22.2d, v23.2d\n"
-                "TRN2	v25.2d, v22.2d, v23.2d\n"
-
-                "ZIP1	v20.8h, v16.8h, v17.8h\n"
-                "ZIP1	v21.8h, v18.8h, v19.8h\n"
-                ASM_PREFETCH("[%[inptr5], #128]")
-                "ZIP2	v22.8h, v16.8h, v17.8h\n"
-                "ZIP2	v23.8h, v18.8h, v19.8h\n"
-                "STP	q24, q25, [%[outptr]], #32\n" // Write back the second element of each source
-
-                "ZIP1	v16.8h, v1.8h, v5.8h\n"
-                "ZIP1	v17.8h, v3.8h, v7.8h\n"
-                ASM_PREFETCH("[%[inptr6], #128]")
-                "TRN1	v24.2d, v20.2d, v21.2d\n"
-                "TRN2	v25.2d, v20.2d, v21.2d\n"
-
-                "ZIP1	v18.8h, v9.8h, v13.8h\n"
-                "ZIP1	v19.8h, v11.8h, v15.8h\n"
-                "STP	q24, q25, [%[outptr]], #32\n" // Third element
-                "TRN1	v24.2d, v22.2d, v23.2d\n"
-                "TRN2	v25.2d, v22.2d, v23.2d\n"
-                ASM_PREFETCH("[%[inptr7], #128]")
-
-                "ZIP1	v20.8h, v16.8h, v17.8h\n"
-                "ZIP1	v21.8h, v18.8h, v19.8h\n"
-                "STP	q24, q25, [%[outptr]], #32\n" // Fourth element
-                "ZIP2	v22.8h, v16.8h, v17.8h\n"
-                "ZIP2	v23.8h, v18.8h, v19.8h\n"
-
-                "ZIP2	v16.8h, v1.8h, v5.8h\n"
-                "ZIP2	v17.8h, v3.8h, v7.8h\n"
-                "TRN1	v24.2d, v20.2d, v21.2d\n"
-                "TRN2	v25.2d, v20.2d, v21.2d\n"
-
-                "ZIP2	v18.8h, v9.8h, v13.8h\n"
-                "ZIP2	v19.8h, v11.8h, v15.8h\n"
-                "STP	q24, q25, [%[outptr]], #32\n" // Fifth element
-                "TRN1	v24.2d, v22.2d, v23.2d\n"
-                "TRN2	v25.2d, v22.2d, v23.2d\n"
-
-                "ZIP1	v20.8h, v16.8h, v17.8h\n"
-                "ZIP1	v21.8h, v18.8h, v19.8h\n"
-                "STP	q24, q25, [%[outptr]], #32\n" // Sixth element
-                "TRN1	v24.2d, v20.2d, v21.2d\n"
-                "TRN2	v25.2d, v20.2d, v21.2d\n"
-
-                "ZIP2	v22.8h, v16.8h, v17.8h\n"
-                "ZIP2	v23.8h, v18.8h, v19.8h\n"
-                "STP	q24, q25, [%[outptr]], #32\n" // Seventh element
-                "TRN1	v24.2d, v22.2d, v23.2d\n"
-                "TRN2	v25.2d, v22.2d, v23.2d\n"
-                "STP	q24, q25, [%[outptr]], #32\n" // Eighth element
-                : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
-                  [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
-                :
-                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
-                  "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "memory"
-            );
-        }
-
-        for (;x>0;x--) {
-            *outptr++ = *inptr0++;
-            *outptr++ = *inptr1++;
-            *outptr++ = *inptr2++;
-            *outptr++ = *inptr3++;
-            *outptr++ = *inptr4++;
-            *outptr++ = *inptr5++;
-            *outptr++ = *inptr6++;
-            *outptr++ = *inptr7++;
-        }
-    }
-}
-
-#endif // __aarch64__ && __ARM_FP16_ARGS
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_u8_to_u16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_u8_to_u16.hpp
deleted file mode 100644
index a3a269c9cd..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_u8_to_u16.hpp
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#if defined(__aarch64__) && defined(__ARM_FP16_ARGS)
-
-#include <arm_neon.h>
-#include <cstdint>
-
-#include "../asmlib.hpp"
-
-template<>
-template<>
-inline void TransformImpl<8, 1, false, 2, 1, false>::Transform(uint16_t *out, const uint8_t *in, int ldin, int y0, int ymax, int k0, int kmax) {
-    uint16_t *outptr = out;
-    const uint8_t *inptr = in;
-    bool first = true;
-
-    uint8_t zerobuff[32] = { 0 }; // 16 for asm loop plus up to 15 for overflow loop
-
-    for (int y=y0; y<ymax; y+=8) {
-        const uint8_t *inptr0 = inptr + y * ldin + k0;
-        const uint8_t *inptr1 = inptr0 + ldin;
-        const uint8_t *inptr2 = inptr1 + ldin;
-        const uint8_t *inptr3 = inptr2 + ldin;
-        const uint8_t *inptr4 = inptr3 + ldin;
-        const uint8_t *inptr5 = inptr4 + ldin;
-        const uint8_t *inptr6 = inptr5 + ldin;
-        const uint8_t *inptr7 = inptr6 + ldin;
-
-        prefetch_2x(inptr0);
-        prefetch_2x(inptr1);
-        prefetch_2x(inptr2);
-        prefetch_2x(inptr3);
-        prefetch_2x(inptr4);
-        prefetch_2x(inptr5);
-        prefetch_2x(inptr6);
-        prefetch_2x(inptr7);
-
-        int x=(kmax-k0);
-        for (;(x>15) || first;x-=16) {
-            /* Cope with ragged cases by copying from a buffer of zeroes instead */
-            /* 'first' forces this to always run at least once, needed if the total size is <=15. */
-            if ((y + 7) >= ymax) {
-                switch ((y + 7) - ymax) {
-                    case 6:
-                        inptr1 = zerobuff;
-                        // fall through
-                    case 5:
-                        inptr2 = zerobuff;
-                        // fall through
-                    case 4:
-                        inptr3 = zerobuff;
-                        // fall through
-                    case 3:
-                        inptr4 = zerobuff;
-                        // fall through
-                    case 2:
-                        inptr5 = zerobuff;
-                        // fall through
-                    case 1:
-                        inptr6 = zerobuff;
-                        // fall through
-                    case 0:
-                        inptr7 = zerobuff;
-                        break;
-
-                    default:
-                        UNREACHABLE("Impossible.");
-                }
-            }
-
-            if (first) {
-                if (x<=15) {
-                    break;
-                }
-
-                first = false;
-            }
-
-            __asm __volatile (
-                // Load up 16 elements (1 source vector, 2 destination vectors) from each of 8 sources.
-                "LDR	q0, [%[inptr0]], #16\n"
-                "LDR	q2, [%[inptr1]], #16\n"
-                "USHLL2 v1.8h, v0.16b, #0\n"
-                "USHLL  v0.8h, v0.8b, #0\n"
-                "LDR	q4, [%[inptr2]], #16\n" // q4=C0C1C2C3
-                "USHLL2 v3.8h, v2.16b, #0\n"
-                "USHLL  v2.8h, v2.8b, #0\n"
-                "USHLL2 v5.8h, v4.16b, #0\n"
-                "USHLL  v4.8h, v4.8b, #0\n"
-                "ZIP1	v16.8h, v0.8h, v4.8h\n" // q16=A0C0A1C1
-                ASM_PREFETCH("[%[inptr0], #128]")
-                "LDR	q6, [%[inptr3]], #16\n" // q6=D0D1D2D3
-                "USHLL2 v7.8h, v6.16b, #0\n"
-                "USHLL  v6.8h, v6.8b, #0\n"
-                "ZIP1	v17.8h, v2.8h, v6.8h\n" // q17=B0D0B1D1
-                "LDR	q8, [%[inptr4]], #16\n"
-                "LDR	q10, [%[inptr5]], #16\n"
-                "USHLL2 v9.8h, v8.16b, #0\n"
-                "USHLL  v8.8h, v8.8b, #0\n"
-                ASM_PREFETCH("[%[inptr1], #128]")
-                "LDR	q12, [%[inptr6]], #16\n"
-                "USHLL2 v11.8h, v10.16b, #0\n"
-                "USHLL  v10.8h, v10.8b, #0\n"
-                "USHLL2 v13.8h, v12.16b, #0\n"
-                "USHLL  v12.8h, v12.8b, #0\n"
-                "ZIP1	v18.8h, v8.8h, v12.8h\n"
-                "LDR	q14, [%[inptr7]], #16\n"
-                "USHLL2 v15.8h, v14.16b, #0\n"
-                "USHLL  v14.8h, v14.8b, #0\n"
-                "ZIP1	v19.8h, v10.8h, v14.8h\n"
-
-                ASM_PREFETCH("[%[inptr2], #128]")
-                "ZIP1	v20.8h, v16.8h, v17.8h\n" // q20=A0B0C0D0A1B1C1D1
-                "ZIP1	v21.8h, v18.8h, v19.8h\n" // q21=E0F0G0H0E1F1G1H1
-                "ZIP2	v22.8h, v16.8h, v17.8h\n" // q22=A2B2C2D2A3B3C3D3
-                "ZIP2	v23.8h, v18.8h, v19.8h\n" // q23=E2F2G2H2E3F3G3H3
-                ASM_PREFETCH("[%[inptr3], #128]")
-
-                "ZIP2	v16.8h, v0.8h, v4.8h\n"
-                "ZIP2	v17.8h, v2.8h, v6.8h\n"
-                "TRN1	v24.2d, v20.2d, v21.2d\n"
-                "TRN2	v25.2d, v20.2d, v21.2d\n"
-
-                "ZIP2	v18.8h, v8.8h, v12.8h\n"
-                ASM_PREFETCH("[%[inptr4], #128]")
-                "ZIP2	v19.8h, v10.8h, v14.8h\n"
-                "STP	q24, q25, [%[outptr]], #32\n" // Write back the first element of each source
-                "TRN1	v24.2d, v22.2d, v23.2d\n"
-                "TRN2	v25.2d, v22.2d, v23.2d\n"
-
-                "ZIP1	v20.8h, v16.8h, v17.8h\n"
-                "ZIP1	v21.8h, v18.8h, v19.8h\n"
-                ASM_PREFETCH("[%[inptr5], #128]")
-                "ZIP2	v22.8h, v16.8h, v17.8h\n"
-                "ZIP2	v23.8h, v18.8h, v19.8h\n"
-                "STP	q24, q25, [%[outptr]], #32\n" // Write back the second element of each source
-
-                "ZIP1	v16.8h, v1.8h, v5.8h\n"
-                "ZIP1	v17.8h, v3.8h, v7.8h\n"
-                ASM_PREFETCH("[%[inptr6], #128]")
-                "TRN1	v24.2d, v20.2d, v21.2d\n"
-                "TRN2	v25.2d, v20.2d, v21.2d\n"
-
-                "ZIP1	v18.8h, v9.8h, v13.8h\n"
-                "ZIP1	v19.8h, v11.8h, v15.8h\n"
-                "STP	q24, q25, [%[outptr]], #32\n" // Third element
-                "TRN1	v24.2d, v22.2d, v23.2d\n"
-                "TRN2	v25.2d, v22.2d, v23.2d\n"
-                ASM_PREFETCH("[%[inptr7], #128]")
-
-                "ZIP1	v20.8h, v16.8h, v17.8h\n"
-                "ZIP1	v21.8h, v18.8h, v19.8h\n"
-                "STP	q24, q25, [%[outptr]], #32\n" // Fourth element
-                "ZIP2	v22.8h, v16.8h, v17.8h\n"
-                "ZIP2	v23.8h, v18.8h, v19.8h\n"
-
-                "ZIP2	v16.8h, v1.8h, v5.8h\n"
-                "ZIP2	v17.8h, v3.8h, v7.8h\n"
-                "TRN1	v24.2d, v20.2d, v21.2d\n"
-                "TRN2	v25.2d, v20.2d, v21.2d\n"
-
-                "ZIP2	v18.8h, v9.8h, v13.8h\n"
-                "ZIP2	v19.8h, v11.8h, v15.8h\n"
-                "STP	q24, q25, [%[outptr]], #32\n" // Fifth element
-                "TRN1	v24.2d, v22.2d, v23.2d\n"
-                "TRN2	v25.2d, v22.2d, v23.2d\n"
-
-                "ZIP1	v20.8h, v16.8h, v17.8h\n"
-                "ZIP1	v21.8h, v18.8h, v19.8h\n"
-                "STP	q24, q25, [%[outptr]], #32\n" // Sixth element
-                "TRN1	v24.2d, v20.2d, v21.2d\n"
-                "TRN2	v25.2d, v20.2d, v21.2d\n"
-
-                "ZIP2	v22.8h, v16.8h, v17.8h\n"
-                "ZIP2	v23.8h, v18.8h, v19.8h\n"
-                "STP	q24, q25, [%[outptr]], #32\n" // Seventh element
-                "TRN1	v24.2d, v22.2d, v23.2d\n"
-                "TRN2	v25.2d, v22.2d, v23.2d\n"
-                "STP	q24, q25, [%[outptr]], #32\n" // Eighth element
-                : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
-                  [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
-                :
-                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
-                  "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "memory"
-            );
-        }
-
-        for (;x>0;x--) {
-            *outptr++ = *inptr0++;
-            *outptr++ = *inptr1++;
-            *outptr++ = *inptr2++;
-            *outptr++ = *inptr3++;
-            *outptr++ = *inptr4++;
-            *outptr++ = *inptr5++;
-            *outptr++ = *inptr6++;
-            *outptr++ = *inptr7++;
-        }
-    }
-}
-
-#endif // __aarch64__ && __ARM_FP16_ARGS
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp
index 5ab5774751..f6233ef503 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp
@@ -30,12 +30,12 @@
 // Generic unblocked transposed 6x32-bit sized specialisation
 template <>
 template <typename T>
-inline void TransformImpl<6, 1, true, 4, 4, false>::Transform(
+inline void TransformImpl<6, 1, true, 4, 4, VLType::None>::Transform(
     T* out, const T* const in, const int stride,
     const int x0, const int xmax, const int k0, const int kmax
 ) {
   // Redirect to a 12 x uint16_t specialisation
-  TransformImpl<12, 1, true, 2, 2, false>::Transform(
+  TransformImpl<12, 1, true, 2, 2, VLType::None>::Transform(
     reinterpret_cast<uint16_t *>(out),
     reinterpret_cast<const uint16_t *>(in),
     stride*2, x0*2, xmax*2, k0, kmax
@@ -45,7 +45,7 @@ inline void TransformImpl<6, 1, true, 4, 4, false>::Transform(
 // Generic 12x16-bit sized specialisation
 template <>
 template <typename T>
-inline void TransformImpl<12, 1, true, 2, 2, false>::Transform(
+inline void TransformImpl<12, 1, true, 2, 2, VLType::None>::Transform(
     T* out, const T* const in, const int stride,
     const int x0, const int xmax, const int k0, const int kmax
 ) {
@@ -135,7 +135,7 @@ inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x4(con
 
 template <>
 template <>
-inline void TransformImpl<12, 1, true, 2, 2, false>::Transform(
+inline void TransformImpl<12, 1, true, 2, 2, VLType::None>::Transform(
     uint16_t* out, const uint16_t* const in, const int stride,
     const int x0, const int xmax, const int k0, const int kmax
 ) {
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
index d7de9ff934..c0f3e17d31 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
@@ -110,7 +110,7 @@ inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x4(const __
 
 template <>
 template <>
-inline void TransformImpl<12, 1, true, 4, 2, false>::Transform(
+inline void TransformImpl<12, 1, true, 4, 2, VLType::None>::Transform(
     float* out, const __fp16* const in, const int stride,
     const int x0, const int xmax, const int k0, const int kmax
 ) {
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
index a137f9360a..bcbe2b84d8 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
@@ -30,12 +30,12 @@
 // Generic unblocked transposed 12x32-bit sized specialisation
 template <>
 template <typename T>
-inline void TransformImpl<12, 1, true, 4, 4, false>::Transform(
+inline void TransformImpl<12, 1, true, 4, 4, VLType::None>::Transform(
     T* out, const T* const in, const int stride,
     const int x0, const int xmax, const int k0, const int kmax
 ) {
   // Redirect to a 24 x uint16_t specialisation
-  TransformImpl<24, 1, true, 2, 2, false>::Transform(
+  TransformImpl<24, 1, true, 2, 2, VLType::None>::Transform(
     reinterpret_cast<uint16_t *>(out),
     reinterpret_cast<const uint16_t * const>(in),
     stride*2, x0*2, xmax*2, k0, kmax
@@ -45,7 +45,7 @@ inline void TransformImpl<12, 1, true, 4, 4, false>::Transform(
 // Generic 24x16-bit sized specialisation
 template <>
 template <typename T>
-inline void TransformImpl<24, 1, true, 2, 2, false>::Transform(
+inline void TransformImpl<24, 1, true, 2, 2, VLType::None>::Transform(
     T* out, const T* const in, const int stride,
     const int x0, const int xmax, const int k0, const int kmax
 ) {
@@ -120,7 +120,7 @@ inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x4(con
 
 template <>
 template <>
-inline void TransformImpl<24, 1, true, 2, 2, false>::Transform(
+inline void TransformImpl<24, 1, true, 2, 2, VLType::None>::Transform(
     uint16_t* out, const uint16_t* const in, const int stride,
     const int x0, const int xmax, const int k0, const int kmax
 ) {
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp
index 974be481e7..df68740bb4 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp
@@ -30,12 +30,12 @@
 // Generic unblocked transposed 8x32-bit sized specialisation
 template <>
 template <typename T>
-inline void TransformImpl<8, 1, true, 4, 4, false>::Transform(
+inline void TransformImpl<8, 1, true, 4, 4, VLType::None>::Transform(
     T* out, const T* const in, const int stride,
     const int x0, const int xmax, const int k0, const int kmax
 ) {
   // Redirect to a 16 x uint16_t specialisation
-  TransformImpl<16, 1, true, 2, 2, false>::Transform(
+  TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
     reinterpret_cast<uint16_t *>(out),
     reinterpret_cast<const uint16_t *>(in),
     stride*2, x0*2, xmax*2, k0, kmax
@@ -45,7 +45,7 @@ inline void TransformImpl<8, 1, true, 4, 4, false>::Transform(
 // Generic 16x16-bit sized specialisation
 template <>
 template <typename T>
-inline void TransformImpl<16, 1, true, 2, 2, false>::Transform(
+inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
     T* out, const T* const in, const int stride,
     const int x0, const int xmax, const int k0, const int kmax
 ) {
@@ -137,7 +137,7 @@ inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(con
 
 template <>
 template <>
-inline void TransformImpl<16, 1, true, 2, 2, false>::Transform(
+inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
     uint16_t* out, const uint16_t* const in, const int stride,
     const int x0, const int xmax, const int k0, const int kmax
 ) {
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
index b825e1c358..e092c729ba 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
@@ -21,22 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "a32_interleave_6way_32bit.hpp"
 #include "a32_transpose_interleave_8way_32bit.hpp"
-#include "a64_block16_interleave4_8bit.hpp"
-#include "a64_interleave_8way_16bit.hpp"
-#include "a64_interleave_8way_32bit.hpp"
-#include "a64_interleave_8way_block4_8bit.hpp"
-#include "a64_interleave_8way_half_to_float.hpp"
-#include "a64_interleave_8way_s8_to_s16.hpp"
-#include "a64_interleave_8way_u8_to_u16.hpp"
 #include "a64_transpose_interleave_12way_16bit.hpp"
 #include "a64_transpose_interleave_12way_half_to_float.hpp"
 #include "a64_transpose_interleave_24way_16bit.hpp"
 #include "a64_transpose_interleave_8way_32bit.hpp"
-#include "sve_interleave_8way_32bit.hpp"
-#include "sve_interleave_8way_block2_16bit.hpp"
-#include "sve_interleave_8way_block2_32bit.hpp"
-#include "sve_interleave_8way_block4_16bit.hpp"
-#include "sve_interleave_8way_block4_8bit.hpp"
-#include "sve_interleave_8way_block8_8bit.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp
deleted file mode 100644
index 348d78e3f5..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp
+++ /dev/null
@@ -1,596 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-template<>
-template<typename T>
-inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
-{
-    uint32_t *master_outptr = reinterpret_cast<uint32_t *>(out);
-    const uint32_t *inptr = reinterpret_cast<const uint32_t *>(in);
-
-    for (int y=y0; y<ymax; y+=8)
-    {
-        const int height = ymax-y;
-        const long inwidth = (kmax - k0);
-        const long outwidth = inwidth * 8;
-        long inpos = 0;
-        long outpos = 0;
-
-        uint32_t *outptr = master_outptr;
-        master_outptr += outwidth;
-
-        const uint32_t *inptr0 = inptr + y * ldin + k0;
-        const uint32_t *inptr1 = inptr0 + ldin;
-        const uint32_t *inptr2 = inptr1 + ldin;
-        const uint32_t *inptr3 = inptr2 + ldin;
-        const uint32_t *inptr4 = inptr3 + ldin;
-        const uint32_t *inptr5 = inptr4 + ldin;
-        const uint32_t *inptr6 = inptr5 + ldin;
-        const uint32_t *inptr7 = inptr6 + ldin;
-
-        switch(height)
-        {
-            case 1:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.s, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z4.s, #0\n"
-                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
-                    "incw %[inpos], all, mul #1\n"
-                    "whilelt p0.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "whilelt p1.s, %[outpos], %[outwidth]\n"
-                    "zip1 z0.s, z8.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z1.s, z8.s, z4.s\n"
-                    "zip1 z2.s, z9.s, z4.s\n"
-                    "zip2 z3.s, z9.s, z4.s\n"
-                    "whilelt p2.s, %[outpos], %[outwidth]\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "zip1 z10.s, z1.s, z4.s\n"
-                    "st1w z8.s, p0, [%[outptr]]\n"
-                    "zip2 z11.s, z1.s, z4.s\n"
-                    "whilelt p3.s, %[outpos], %[outwidth]\n"
-                    "zip1 z12.s, z2.s, z4.s\n"
-                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip2 z13.s, z2.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z14.s, z3.s, z4.s\n"
-                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
-                    "zip2 z15.s, z3.s, z4.s\n"
-                    "whilelt p4.s, %[outpos], %[outwidth]\n"
-                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p5.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
-                    "whilelt p6.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
-                    "whilelt p7.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 2:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.s, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z4.s, #0\n"
-                    "mov z14.s, #0\n"
-                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
-                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
-                    "incw %[inpos], all, mul #1\n"
-                    "whilelt p0.s, %[outpos], %[outwidth]\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "zip1 z10.s, z1.s, z4.s\n"
-                    "zip2 z11.s, z1.s, z4.s\n"
-                    "whilelt p1.s, %[outpos], %[outwidth]\n"
-                    "zip1 z0.s, z8.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z1.s, z8.s, z4.s\n"
-                    "zip1 z2.s, z9.s, z4.s\n"
-                    "zip2 z3.s, z9.s, z4.s\n"
-                    "whilelt p2.s, %[outpos], %[outwidth]\n"
-                    "zip1 z4.s, z10.s, z14.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z5.s, z10.s, z14.s\n"
-                    "zip1 z6.s, z11.s, z14.s\n"
-                    "zip2 z7.s, z11.s, z14.s\n"
-                    "whilelt p3.s, %[outpos], %[outwidth]\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "st1w z8.s, p0, [%[outptr]]\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "whilelt p4.s, %[outpos], %[outwidth]\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "whilelt p5.s, %[outpos], %[outwidth]\n"
-                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p6.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
-                    "whilelt p7.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 3:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.s, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z4.s, #0\n"
-                    "mov z14.s, #0\n"
-                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
-                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
-                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
-                    "incw %[inpos], all, mul #1\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p0.s, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z4.s\n"
-                    "zip2 z11.s, z1.s, z4.s\n"
-                    "zip1 z12.s, z2.s, z4.s\n"
-                    "whilelt p1.s, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z0.s, z8.s, z12.s\n"
-                    "zip2 z1.s, z8.s, z12.s\n"
-                    "zip1 z2.s, z9.s, z13.s\n"
-                    "whilelt p2.s, %[outpos], %[outwidth]\n"
-                    "zip2 z3.s, z9.s, z13.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z4.s, z10.s, z14.s\n"
-                    "zip2 z5.s, z10.s, z14.s\n"
-                    "zip1 z6.s, z11.s, z14.s\n"
-                    "whilelt p3.s, %[outpos], %[outwidth]\n"
-                    "zip2 z7.s, z11.s, z14.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "whilelt p4.s, %[outpos], %[outwidth]\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "st1w z8.s, p0, [%[outptr]]\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "whilelt p5.s, %[outpos], %[outwidth]\n"
-                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
-                    "whilelt p6.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 4:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.s, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z4.s, #0\n"
-                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
-                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
-                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
-                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
-                    "incw %[inpos], all, mul #1\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p0.s, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z4.s\n"
-                    "zip2 z11.s, z1.s, z4.s\n"
-                    "zip1 z12.s, z2.s, z4.s\n"
-                    "whilelt p1.s, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z14.s, z3.s, z4.s\n"
-                    "zip2 z15.s, z3.s, z4.s\n"
-                    "zip1 z0.s, z8.s, z12.s\n"
-                    "whilelt p2.s, %[outpos], %[outwidth]\n"
-                    "zip2 z1.s, z8.s, z12.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z2.s, z9.s, z13.s\n"
-                    "zip2 z3.s, z9.s, z13.s\n"
-                    "zip1 z4.s, z10.s, z14.s\n"
-                    "whilelt p3.s, %[outpos], %[outwidth]\n"
-                    "zip2 z5.s, z10.s, z14.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z6.s, z11.s, z15.s\n"
-                    "zip2 z7.s, z11.s, z15.s\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p4.s, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "st1w z8.s, p0, [%[outptr]]\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p5.s, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.s, %[outpos], %[outwidth]\n"
-                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 5:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.s, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z5.s, #0\n"
-                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
-                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
-                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
-                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
-                    "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
-                    "incw %[inpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "whilelt p0.s, %[outpos], %[outwidth]\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "zip1 z12.s, z2.s, z5.s\n"
-                    "whilelt p1.s, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z5.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z14.s, z3.s, z5.s\n"
-                    "zip2 z15.s, z3.s, z5.s\n"
-                    "zip1 z0.s, z8.s, z12.s\n"
-                    "whilelt p2.s, %[outpos], %[outwidth]\n"
-                    "zip2 z1.s, z8.s, z12.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z2.s, z9.s, z13.s\n"
-                    "zip2 z3.s, z9.s, z13.s\n"
-                    "zip1 z4.s, z10.s, z14.s\n"
-                    "whilelt p3.s, %[outpos], %[outwidth]\n"
-                    "zip2 z5.s, z10.s, z14.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z6.s, z11.s, z15.s\n"
-                    "zip2 z7.s, z11.s, z15.s\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p4.s, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "st1w z8.s, p0, [%[outptr]]\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p5.s, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.s, %[outpos], %[outwidth]\n"
-                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 6:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.s, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z6.s, #0\n"
-                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
-                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
-                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
-                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
-                    "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
-                    "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
-                    "incw %[inpos], all, mul #1\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p0.s, %[outpos], %[outwidth]\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "whilelt p1.s, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z14.s, z3.s, z6.s\n"
-                    "zip2 z15.s, z3.s, z6.s\n"
-                    "zip1 z0.s, z8.s, z12.s\n"
-                    "whilelt p2.s, %[outpos], %[outwidth]\n"
-                    "zip2 z1.s, z8.s, z12.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z2.s, z9.s, z13.s\n"
-                    "zip2 z3.s, z9.s, z13.s\n"
-                    "zip1 z4.s, z10.s, z14.s\n"
-                    "whilelt p3.s, %[outpos], %[outwidth]\n"
-                    "zip2 z5.s, z10.s, z14.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z6.s, z11.s, z15.s\n"
-                    "zip2 z7.s, z11.s, z15.s\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p4.s, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "st1w z8.s, p0, [%[outptr]]\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p5.s, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.s, %[outpos], %[outwidth]\n"
-                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 7:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.s, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z7.s, #0\n"
-                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
-                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
-                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
-                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
-                    "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
-                    "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
-                    "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n"
-                    "incw %[inpos], all, mul #1\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p0.s, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p1.s, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "zip1 z0.s, z8.s, z12.s\n"
-                    "whilelt p2.s, %[outpos], %[outwidth]\n"
-                    "zip2 z1.s, z8.s, z12.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z2.s, z9.s, z13.s\n"
-                    "zip2 z3.s, z9.s, z13.s\n"
-                    "zip1 z4.s, z10.s, z14.s\n"
-                    "whilelt p3.s, %[outpos], %[outwidth]\n"
-                    "zip2 z5.s, z10.s, z14.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z6.s, z11.s, z15.s\n"
-                    "zip2 z7.s, z11.s, z15.s\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p4.s, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "st1w z8.s, p0, [%[outptr]]\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p5.s, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.s, %[outpos], %[outwidth]\n"
-                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            default:
-            case 8:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.s, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
-                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
-                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
-                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
-                    "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
-                    "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
-                    "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n"
-                    "ld1w z7.s, p0/z, [%[inptr7], %[inpos], LSL #2]\n"
-                    "incw %[inpos], all, mul #1\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p0.s, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p1.s, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "zip1 z0.s, z8.s, z12.s\n"
-                    "whilelt p2.s, %[outpos], %[outwidth]\n"
-                    "zip2 z1.s, z8.s, z12.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z2.s, z9.s, z13.s\n"
-                    "zip2 z3.s, z9.s, z13.s\n"
-                    "zip1 z4.s, z10.s, z14.s\n"
-                    "whilelt p3.s, %[outpos], %[outwidth]\n"
-                    "zip2 z5.s, z10.s, z14.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z6.s, z11.s, z15.s\n"
-                    "zip2 z7.s, z11.s, z15.s\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p4.s, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "st1w z8.s, p0, [%[outptr]]\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p5.s, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.s, %[outpos], %[outwidth]\n"
-                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-
-        }
-    }
-}
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_16bit.hpp
deleted file mode 100644
index 234433a0f1..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_16bit.hpp
+++ /dev/null
@@ -1,596 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-template<>
-template<typename T>
-inline void TransformImpl<8, 2, false, 2, 2, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
-{
-    uint16_t *master_outptr = reinterpret_cast<uint16_t *>(out);
-    const uint16_t *inptr = reinterpret_cast<const uint16_t *>(in);
-
-    for (int y=y0; y<ymax; y+=8)
-    {
-        const int height = ymax-y;
-        const long inwidth = (kmax - k0);
-        const long outwidth = ((inwidth + 1) / 2) * 16;
-        long inpos = 0;
-        long outpos = 0;
-
-        uint16_t *outptr = master_outptr;
-        master_outptr += outwidth;
-
-        const uint16_t *inptr0 = inptr + y * ldin + k0;
-        const uint16_t *inptr1 = inptr0 + ldin;
-        const uint16_t *inptr2 = inptr1 + ldin;
-        const uint16_t *inptr3 = inptr2 + ldin;
-        const uint16_t *inptr4 = inptr3 + ldin;
-        const uint16_t *inptr5 = inptr4 + ldin;
-        const uint16_t *inptr6 = inptr5 + ldin;
-        const uint16_t *inptr7 = inptr6 + ldin;
-
-        switch(height)
-        {
-            case 1:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.h, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z4.h, #0\n"
-                    "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
-                    "inch %[inpos], all, mul #1\n"
-                    "whilelt p0.h, %[outpos], %[outwidth]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "whilelt p1.h, %[outpos], %[outwidth]\n"
-                    "zip1 z0.s, z8.s, z4.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip2 z1.s, z8.s, z4.s\n"
-                    "zip1 z2.s, z9.s, z4.s\n"
-                    "zip2 z3.s, z9.s, z4.s\n"
-                    "whilelt p2.h, %[outpos], %[outwidth]\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "zip1 z10.s, z1.s, z4.s\n"
-                    "st1h z8.h, p0, [%[outptr]]\n"
-                    "zip2 z11.s, z1.s, z4.s\n"
-                    "whilelt p3.h, %[outpos], %[outwidth]\n"
-                    "zip1 z12.s, z2.s, z4.s\n"
-                    "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip2 z13.s, z2.s, z4.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z14.s, z3.s, z4.s\n"
-                    "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
-                    "zip2 z15.s, z3.s, z4.s\n"
-                    "whilelt p4.h, %[outpos], %[outwidth]\n"
-                    "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p5.h, %[outpos], %[outwidth]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
-                    "whilelt p6.h, %[outpos], %[outwidth]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
-                    "whilelt p7.h, %[outpos], %[outwidth]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 2:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.h, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z4.h, #0\n"
-                    "mov z14.h, #0\n"
-                    "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
-                    "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
-                    "inch %[inpos], all, mul #1\n"
-                    "whilelt p0.h, %[outpos], %[outwidth]\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "zip1 z10.s, z1.s, z4.s\n"
-                    "zip2 z11.s, z1.s, z4.s\n"
-                    "whilelt p1.h, %[outpos], %[outwidth]\n"
-                    "zip1 z0.s, z8.s, z4.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip2 z1.s, z8.s, z4.s\n"
-                    "zip1 z2.s, z9.s, z4.s\n"
-                    "zip2 z3.s, z9.s, z4.s\n"
-                    "whilelt p2.h, %[outpos], %[outwidth]\n"
-                    "zip1 z4.s, z10.s, z14.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip2 z5.s, z10.s, z14.s\n"
-                    "zip1 z6.s, z11.s, z14.s\n"
-                    "zip2 z7.s, z11.s, z14.s\n"
-                    "whilelt p3.h, %[outpos], %[outwidth]\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "st1h z8.h, p0, [%[outptr]]\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "whilelt p4.h, %[outpos], %[outwidth]\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "whilelt p5.h, %[outpos], %[outwidth]\n"
-                    "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p6.h, %[outpos], %[outwidth]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
-                    "whilelt p7.h, %[outpos], %[outwidth]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 3:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.h, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z4.h, #0\n"
-                    "mov z14.h, #0\n"
-                    "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
-                    "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
-                    "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
-                    "inch %[inpos], all, mul #1\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p0.h, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z4.s\n"
-                    "zip2 z11.s, z1.s, z4.s\n"
-                    "zip1 z12.s, z2.s, z4.s\n"
-                    "whilelt p1.h, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z4.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z0.s, z8.s, z12.s\n"
-                    "zip2 z1.s, z8.s, z12.s\n"
-                    "zip1 z2.s, z9.s, z13.s\n"
-                    "whilelt p2.h, %[outpos], %[outwidth]\n"
-                    "zip2 z3.s, z9.s, z13.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z4.s, z10.s, z14.s\n"
-                    "zip2 z5.s, z10.s, z14.s\n"
-                    "zip1 z6.s, z11.s, z14.s\n"
-                    "whilelt p3.h, %[outpos], %[outwidth]\n"
-                    "zip2 z7.s, z11.s, z14.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "whilelt p4.h, %[outpos], %[outwidth]\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "st1h z8.h, p0, [%[outptr]]\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "whilelt p5.h, %[outpos], %[outwidth]\n"
-                    "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
-                    "whilelt p6.h, %[outpos], %[outwidth]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.h, %[outpos], %[outwidth]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 4:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.h, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z4.h, #0\n"
-                    "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
-                    "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
-                    "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
-                    "ld1h z3.h, p0/z, [%[inptr3], %[inpos], LSL #1]\n"
-                    "inch %[inpos], all, mul #1\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p0.h, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z4.s\n"
-                    "zip2 z11.s, z1.s, z4.s\n"
-                    "zip1 z12.s, z2.s, z4.s\n"
-                    "whilelt p1.h, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z4.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z14.s, z3.s, z4.s\n"
-                    "zip2 z15.s, z3.s, z4.s\n"
-                    "zip1 z0.s, z8.s, z12.s\n"
-                    "whilelt p2.h, %[outpos], %[outwidth]\n"
-                    "zip2 z1.s, z8.s, z12.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z2.s, z9.s, z13.s\n"
-                    "zip2 z3.s, z9.s, z13.s\n"
-                    "zip1 z4.s, z10.s, z14.s\n"
-                    "whilelt p3.h, %[outpos], %[outwidth]\n"
-                    "zip2 z5.s, z10.s, z14.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z6.s, z11.s, z15.s\n"
-                    "zip2 z7.s, z11.s, z15.s\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p4.h, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "st1h z8.h, p0, [%[outptr]]\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p5.h, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.h, %[outpos], %[outwidth]\n"
-                    "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.h, %[outpos], %[outwidth]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 5:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.h, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z5.h, #0\n"
-                    "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
-                    "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
-                    "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
-                    "ld1h z3.h, p0/z, [%[inptr3], %[inpos], LSL #1]\n"
-                    "ld1h z4.h, p0/z, [%[inptr4], %[inpos], LSL #1]\n"
-                    "inch %[inpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "whilelt p0.h, %[outpos], %[outwidth]\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "zip1 z12.s, z2.s, z5.s\n"
-                    "whilelt p1.h, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z5.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z14.s, z3.s, z5.s\n"
-                    "zip2 z15.s, z3.s, z5.s\n"
-                    "zip1 z0.s, z8.s, z12.s\n"
-                    "whilelt p2.h, %[outpos], %[outwidth]\n"
-                    "zip2 z1.s, z8.s, z12.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z2.s, z9.s, z13.s\n"
-                    "zip2 z3.s, z9.s, z13.s\n"
-                    "zip1 z4.s, z10.s, z14.s\n"
-                    "whilelt p3.h, %[outpos], %[outwidth]\n"
-                    "zip2 z5.s, z10.s, z14.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z6.s, z11.s, z15.s\n"
-                    "zip2 z7.s, z11.s, z15.s\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p4.h, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "st1h z8.h, p0, [%[outptr]]\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p5.h, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.h, %[outpos], %[outwidth]\n"
-                    "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.h, %[outpos], %[outwidth]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 6:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.h, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z6.h, #0\n"
-                    "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
-                    "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
-                    "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
-                    "ld1h z3.h, p0/z, [%[inptr3], %[inpos], LSL #1]\n"
-                    "ld1h z4.h, p0/z, [%[inptr4], %[inpos], LSL #1]\n"
-                    "ld1h z5.h, p0/z, [%[inptr5], %[inpos], LSL #1]\n"
-                    "inch %[inpos], all, mul #1\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p0.h, %[outpos], %[outwidth]\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "whilelt p1.h, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z14.s, z3.s, z6.s\n"
-                    "zip2 z15.s, z3.s, z6.s\n"
-                    "zip1 z0.s, z8.s, z12.s\n"
-                    "whilelt p2.h, %[outpos], %[outwidth]\n"
-                    "zip2 z1.s, z8.s, z12.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z2.s, z9.s, z13.s\n"
-                    "zip2 z3.s, z9.s, z13.s\n"
-                    "zip1 z4.s, z10.s, z14.s\n"
-                    "whilelt p3.h, %[outpos], %[outwidth]\n"
-                    "zip2 z5.s, z10.s, z14.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z6.s, z11.s, z15.s\n"
-                    "zip2 z7.s, z11.s, z15.s\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p4.h, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "st1h z8.h, p0, [%[outptr]]\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p5.h, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.h, %[outpos], %[outwidth]\n"
-                    "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.h, %[outpos], %[outwidth]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 7:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.h, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z7.h, #0\n"
-                    "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
-                    "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
-                    "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
-                    "ld1h z3.h, p0/z, [%[inptr3], %[inpos], LSL #1]\n"
-                    "ld1h z4.h, p0/z, [%[inptr4], %[inpos], LSL #1]\n"
-                    "ld1h z5.h, p0/z, [%[inptr5], %[inpos], LSL #1]\n"
-                    "ld1h z6.h, p0/z, [%[inptr6], %[inpos], LSL #1]\n"
-                    "inch %[inpos], all, mul #1\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p0.h, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p1.h, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "zip1 z0.s, z8.s, z12.s\n"
-                    "whilelt p2.h, %[outpos], %[outwidth]\n"
-                    "zip2 z1.s, z8.s, z12.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z2.s, z9.s, z13.s\n"
-                    "zip2 z3.s, z9.s, z13.s\n"
-                    "zip1 z4.s, z10.s, z14.s\n"
-                    "whilelt p3.h, %[outpos], %[outwidth]\n"
-                    "zip2 z5.s, z10.s, z14.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z6.s, z11.s, z15.s\n"
-                    "zip2 z7.s, z11.s, z15.s\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p4.h, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "st1h z8.h, p0, [%[outptr]]\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p5.h, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.h, %[outpos], %[outwidth]\n"
-                    "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.h, %[outpos], %[outwidth]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            default:
-            case 8:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.h, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
-                    "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
-                    "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
-                    "ld1h z3.h, p0/z, [%[inptr3], %[inpos], LSL #1]\n"
-                    "ld1h z4.h, p0/z, [%[inptr4], %[inpos], LSL #1]\n"
-                    "ld1h z5.h, p0/z, [%[inptr5], %[inpos], LSL #1]\n"
-                    "ld1h z6.h, p0/z, [%[inptr6], %[inpos], LSL #1]\n"
-                    "ld1h z7.h, p0/z, [%[inptr7], %[inpos], LSL #1]\n"
-                    "inch %[inpos], all, mul #1\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p0.h, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p1.h, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "zip1 z0.s, z8.s, z12.s\n"
-                    "whilelt p2.h, %[outpos], %[outwidth]\n"
-                    "zip2 z1.s, z8.s, z12.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z2.s, z9.s, z13.s\n"
-                    "zip2 z3.s, z9.s, z13.s\n"
-                    "zip1 z4.s, z10.s, z14.s\n"
-                    "whilelt p3.h, %[outpos], %[outwidth]\n"
-                    "zip2 z5.s, z10.s, z14.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z6.s, z11.s, z15.s\n"
-                    "zip2 z7.s, z11.s, z15.s\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p4.h, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "st1h z8.h, p0, [%[outptr]]\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p5.h, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.h, %[outpos], %[outwidth]\n"
-                    "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.h, %[outpos], %[outwidth]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-
-        }
-    }
-}
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_32bit.hpp
deleted file mode 100644
index f21933b8de..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_32bit.hpp
+++ /dev/null
@@ -1,596 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-template<>
-template<typename T>
-inline void TransformImpl<8, 2, false, 4, 4, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
-{
-    uint32_t *master_outptr = reinterpret_cast<uint32_t *>(out);
-    const uint32_t *inptr = reinterpret_cast<const uint32_t *>(in);
-
-    for (int y=y0; y<ymax; y+=8)
-    {
-        const int height = ymax-y;
-        const long inwidth = (kmax - k0);
-        const long outwidth = ((inwidth + 1) / 2) * 16;
-        long inpos = 0;
-        long outpos = 0;
-
-        uint32_t *outptr = master_outptr;
-        master_outptr += outwidth;
-
-        const uint32_t *inptr0 = inptr + y * ldin + k0;
-        const uint32_t *inptr1 = inptr0 + ldin;
-        const uint32_t *inptr2 = inptr1 + ldin;
-        const uint32_t *inptr3 = inptr2 + ldin;
-        const uint32_t *inptr4 = inptr3 + ldin;
-        const uint32_t *inptr5 = inptr4 + ldin;
-        const uint32_t *inptr6 = inptr5 + ldin;
-        const uint32_t *inptr7 = inptr6 + ldin;
-
-        switch(height)
-        {
-            case 1:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.s, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z4.s, #0\n"
-                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
-                    "incw %[inpos], all, mul #1\n"
-                    "whilelt p0.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "whilelt p1.s, %[outpos], %[outwidth]\n"
-                    "zip1 z0.d, z8.d, z4.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z1.d, z8.d, z4.d\n"
-                    "zip1 z2.d, z9.d, z4.d\n"
-                    "zip2 z3.d, z9.d, z4.d\n"
-                    "whilelt p2.s, %[outpos], %[outwidth]\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "zip1 z10.d, z1.d, z4.d\n"
-                    "st1w z8.s, p0, [%[outptr]]\n"
-                    "zip2 z11.d, z1.d, z4.d\n"
-                    "whilelt p3.s, %[outpos], %[outwidth]\n"
-                    "zip1 z12.d, z2.d, z4.d\n"
-                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip2 z13.d, z2.d, z4.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z14.d, z3.d, z4.d\n"
-                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
-                    "zip2 z15.d, z3.d, z4.d\n"
-                    "whilelt p4.s, %[outpos], %[outwidth]\n"
-                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p5.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
-                    "whilelt p6.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
-                    "whilelt p7.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 2:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.s, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z4.s, #0\n"
-                    "mov z14.s, #0\n"
-                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
-                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
-                    "incw %[inpos], all, mul #1\n"
-                    "whilelt p0.s, %[outpos], %[outwidth]\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "zip1 z10.d, z1.d, z4.d\n"
-                    "zip2 z11.d, z1.d, z4.d\n"
-                    "whilelt p1.s, %[outpos], %[outwidth]\n"
-                    "zip1 z0.d, z8.d, z4.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z1.d, z8.d, z4.d\n"
-                    "zip1 z2.d, z9.d, z4.d\n"
-                    "zip2 z3.d, z9.d, z4.d\n"
-                    "whilelt p2.s, %[outpos], %[outwidth]\n"
-                    "zip1 z4.d, z10.d, z14.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z5.d, z10.d, z14.d\n"
-                    "zip1 z6.d, z11.d, z14.d\n"
-                    "zip2 z7.d, z11.d, z14.d\n"
-                    "whilelt p3.s, %[outpos], %[outwidth]\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "st1w z8.s, p0, [%[outptr]]\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "whilelt p4.s, %[outpos], %[outwidth]\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z14.d, z3.d, z7.d\n"
-                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
-                    "zip2 z15.d, z3.d, z7.d\n"
-                    "whilelt p5.s, %[outpos], %[outwidth]\n"
-                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p6.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
-                    "whilelt p7.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 3:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.s, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z4.s, #0\n"
-                    "mov z14.s, #0\n"
-                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
-                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
-                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
-                    "incw %[inpos], all, mul #1\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "whilelt p0.s, %[outpos], %[outwidth]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z4.d\n"
-                    "zip2 z11.d, z1.d, z4.d\n"
-                    "zip1 z12.d, z2.d, z4.d\n"
-                    "whilelt p1.s, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z4.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z0.d, z8.d, z12.d\n"
-                    "zip2 z1.d, z8.d, z12.d\n"
-                    "zip1 z2.d, z9.d, z13.d\n"
-                    "whilelt p2.s, %[outpos], %[outwidth]\n"
-                    "zip2 z3.d, z9.d, z13.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z4.d, z10.d, z14.d\n"
-                    "zip2 z5.d, z10.d, z14.d\n"
-                    "zip1 z6.d, z11.d, z14.d\n"
-                    "whilelt p3.s, %[outpos], %[outwidth]\n"
-                    "zip2 z7.d, z11.d, z14.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "whilelt p4.s, %[outpos], %[outwidth]\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "st1w z8.s, p0, [%[outptr]]\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.d, z3.d, z7.d\n"
-                    "zip2 z15.d, z3.d, z7.d\n"
-                    "whilelt p5.s, %[outpos], %[outwidth]\n"
-                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
-                    "whilelt p6.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 4:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.s, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z4.s, #0\n"
-                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
-                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
-                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
-                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
-                    "incw %[inpos], all, mul #1\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "whilelt p0.s, %[outpos], %[outwidth]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z4.d\n"
-                    "zip2 z11.d, z1.d, z4.d\n"
-                    "zip1 z12.d, z2.d, z4.d\n"
-                    "whilelt p1.s, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z4.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z14.d, z3.d, z4.d\n"
-                    "zip2 z15.d, z3.d, z4.d\n"
-                    "zip1 z0.d, z8.d, z12.d\n"
-                    "whilelt p2.s, %[outpos], %[outwidth]\n"
-                    "zip2 z1.d, z8.d, z12.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z2.d, z9.d, z13.d\n"
-                    "zip2 z3.d, z9.d, z13.d\n"
-                    "zip1 z4.d, z10.d, z14.d\n"
-                    "whilelt p3.s, %[outpos], %[outwidth]\n"
-                    "zip2 z5.d, z10.d, z14.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z6.d, z11.d, z15.d\n"
-                    "zip2 z7.d, z11.d, z15.d\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "whilelt p4.s, %[outpos], %[outwidth]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "st1w z8.s, p0, [%[outptr]]\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p5.s, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.d, z3.d, z7.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z15.d, z3.d, z7.d\n"
-                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.s, %[outpos], %[outwidth]\n"
-                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 5:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.s, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z5.s, #0\n"
-                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
-                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
-                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
-                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
-                    "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
-                    "incw %[inpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "whilelt p0.s, %[outpos], %[outwidth]\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "zip1 z12.d, z2.d, z5.d\n"
-                    "whilelt p1.s, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z5.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z14.d, z3.d, z5.d\n"
-                    "zip2 z15.d, z3.d, z5.d\n"
-                    "zip1 z0.d, z8.d, z12.d\n"
-                    "whilelt p2.s, %[outpos], %[outwidth]\n"
-                    "zip2 z1.d, z8.d, z12.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z2.d, z9.d, z13.d\n"
-                    "zip2 z3.d, z9.d, z13.d\n"
-                    "zip1 z4.d, z10.d, z14.d\n"
-                    "whilelt p3.s, %[outpos], %[outwidth]\n"
-                    "zip2 z5.d, z10.d, z14.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z6.d, z11.d, z15.d\n"
-                    "zip2 z7.d, z11.d, z15.d\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "whilelt p4.s, %[outpos], %[outwidth]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "st1w z8.s, p0, [%[outptr]]\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p5.s, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.d, z3.d, z7.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z15.d, z3.d, z7.d\n"
-                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.s, %[outpos], %[outwidth]\n"
-                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 6:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.s, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z6.s, #0\n"
-                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
-                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
-                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
-                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
-                    "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
-                    "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
-                    "incw %[inpos], all, mul #1\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p0.s, %[outpos], %[outwidth]\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "whilelt p1.s, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z14.d, z3.d, z6.d\n"
-                    "zip2 z15.d, z3.d, z6.d\n"
-                    "zip1 z0.d, z8.d, z12.d\n"
-                    "whilelt p2.s, %[outpos], %[outwidth]\n"
-                    "zip2 z1.d, z8.d, z12.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z2.d, z9.d, z13.d\n"
-                    "zip2 z3.d, z9.d, z13.d\n"
-                    "zip1 z4.d, z10.d, z14.d\n"
-                    "whilelt p3.s, %[outpos], %[outwidth]\n"
-                    "zip2 z5.d, z10.d, z14.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z6.d, z11.d, z15.d\n"
-                    "zip2 z7.d, z11.d, z15.d\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "whilelt p4.s, %[outpos], %[outwidth]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "st1w z8.s, p0, [%[outptr]]\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p5.s, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.d, z3.d, z7.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z15.d, z3.d, z7.d\n"
-                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.s, %[outpos], %[outwidth]\n"
-                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 7:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.s, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z7.s, #0\n"
-                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
-                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
-                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
-                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
-                    "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
-                    "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
-                    "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n"
-                    "incw %[inpos], all, mul #1\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "whilelt p0.s, %[outpos], %[outwidth]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p1.s, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z14.d, z3.d, z7.d\n"
-                    "zip2 z15.d, z3.d, z7.d\n"
-                    "zip1 z0.d, z8.d, z12.d\n"
-                    "whilelt p2.s, %[outpos], %[outwidth]\n"
-                    "zip2 z1.d, z8.d, z12.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z2.d, z9.d, z13.d\n"
-                    "zip2 z3.d, z9.d, z13.d\n"
-                    "zip1 z4.d, z10.d, z14.d\n"
-                    "whilelt p3.s, %[outpos], %[outwidth]\n"
-                    "zip2 z5.d, z10.d, z14.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z6.d, z11.d, z15.d\n"
-                    "zip2 z7.d, z11.d, z15.d\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "whilelt p4.s, %[outpos], %[outwidth]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "st1w z8.s, p0, [%[outptr]]\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p5.s, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.d, z3.d, z7.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z15.d, z3.d, z7.d\n"
-                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.s, %[outpos], %[outwidth]\n"
-                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            default:
-            case 8:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.s, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
-                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
-                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
-                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
-                    "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
-                    "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
-                    "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n"
-                    "ld1w z7.s, p0/z, [%[inptr7], %[inpos], LSL #2]\n"
-                    "incw %[inpos], all, mul #1\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "whilelt p0.s, %[outpos], %[outwidth]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p1.s, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z14.d, z3.d, z7.d\n"
-                    "zip2 z15.d, z3.d, z7.d\n"
-                    "zip1 z0.d, z8.d, z12.d\n"
-                    "whilelt p2.s, %[outpos], %[outwidth]\n"
-                    "zip2 z1.d, z8.d, z12.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z2.d, z9.d, z13.d\n"
-                    "zip2 z3.d, z9.d, z13.d\n"
-                    "zip1 z4.d, z10.d, z14.d\n"
-                    "whilelt p3.s, %[outpos], %[outwidth]\n"
-                    "zip2 z5.d, z10.d, z14.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z6.d, z11.d, z15.d\n"
-                    "zip2 z7.d, z11.d, z15.d\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "whilelt p4.s, %[outpos], %[outwidth]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "st1w z8.s, p0, [%[outptr]]\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p5.s, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.d, z3.d, z7.d\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "zip2 z15.d, z3.d, z7.d\n"
-                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.s, %[outpos], %[outwidth]\n"
-                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.s, %[outpos], %[outwidth]\n"
-                    "incw %[outpos], all, mul #1\n"
-                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-
-        }
-    }
-}
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_16bit.hpp
deleted file mode 100644
index 26e10511a6..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_16bit.hpp
+++ /dev/null
@@ -1,596 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-template<>
-template<typename T>
-inline void TransformImpl<8, 4, false, 2, 2, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
-{
-    uint16_t *master_outptr = reinterpret_cast<uint16_t *>(out);
-    const uint16_t *inptr = reinterpret_cast<const uint16_t *>(in);
-
-    for (int y=y0; y<ymax; y+=8)
-    {
-        const int height = ymax-y;
-        const long inwidth = (kmax - k0);
-        const long outwidth = ((inwidth + 3) / 4) * 32;
-        long inpos = 0;
-        long outpos = 0;
-
-        uint16_t *outptr = master_outptr;
-        master_outptr += outwidth;
-
-        const uint16_t *inptr0 = inptr + y * ldin + k0;
-        const uint16_t *inptr1 = inptr0 + ldin;
-        const uint16_t *inptr2 = inptr1 + ldin;
-        const uint16_t *inptr3 = inptr2 + ldin;
-        const uint16_t *inptr4 = inptr3 + ldin;
-        const uint16_t *inptr5 = inptr4 + ldin;
-        const uint16_t *inptr6 = inptr5 + ldin;
-        const uint16_t *inptr7 = inptr6 + ldin;
-
-        switch(height)
-        {
-            case 1:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.h, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z4.h, #0\n"
-                    "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
-                    "inch %[inpos], all, mul #1\n"
-                    "whilelt p0.h, %[outpos], %[outwidth]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "whilelt p1.h, %[outpos], %[outwidth]\n"
-                    "zip1 z0.d, z8.d, z4.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip2 z1.d, z8.d, z4.d\n"
-                    "zip1 z2.d, z9.d, z4.d\n"
-                    "zip2 z3.d, z9.d, z4.d\n"
-                    "whilelt p2.h, %[outpos], %[outwidth]\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "zip1 z10.d, z1.d, z4.d\n"
-                    "st1h z8.h, p0, [%[outptr]]\n"
-                    "zip2 z11.d, z1.d, z4.d\n"
-                    "whilelt p3.h, %[outpos], %[outwidth]\n"
-                    "zip1 z12.d, z2.d, z4.d\n"
-                    "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip2 z13.d, z2.d, z4.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z14.d, z3.d, z4.d\n"
-                    "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
-                    "zip2 z15.d, z3.d, z4.d\n"
-                    "whilelt p4.h, %[outpos], %[outwidth]\n"
-                    "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p5.h, %[outpos], %[outwidth]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
-                    "whilelt p6.h, %[outpos], %[outwidth]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
-                    "whilelt p7.h, %[outpos], %[outwidth]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 2:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.h, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z4.h, #0\n"
-                    "mov z14.h, #0\n"
-                    "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
-                    "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
-                    "inch %[inpos], all, mul #1\n"
-                    "whilelt p0.h, %[outpos], %[outwidth]\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "zip1 z10.d, z1.d, z4.d\n"
-                    "zip2 z11.d, z1.d, z4.d\n"
-                    "whilelt p1.h, %[outpos], %[outwidth]\n"
-                    "zip1 z0.d, z8.d, z4.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip2 z1.d, z8.d, z4.d\n"
-                    "zip1 z2.d, z9.d, z4.d\n"
-                    "zip2 z3.d, z9.d, z4.d\n"
-                    "whilelt p2.h, %[outpos], %[outwidth]\n"
-                    "zip1 z4.d, z10.d, z14.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip2 z5.d, z10.d, z14.d\n"
-                    "zip1 z6.d, z11.d, z14.d\n"
-                    "zip2 z7.d, z11.d, z14.d\n"
-                    "whilelt p3.h, %[outpos], %[outwidth]\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "st1h z8.h, p0, [%[outptr]]\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "whilelt p4.h, %[outpos], %[outwidth]\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z14.d, z3.d, z7.d\n"
-                    "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
-                    "zip2 z15.d, z3.d, z7.d\n"
-                    "whilelt p5.h, %[outpos], %[outwidth]\n"
-                    "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p6.h, %[outpos], %[outwidth]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
-                    "whilelt p7.h, %[outpos], %[outwidth]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 3:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.h, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z4.h, #0\n"
-                    "mov z14.h, #0\n"
-                    "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
-                    "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
-                    "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
-                    "inch %[inpos], all, mul #1\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "whilelt p0.h, %[outpos], %[outwidth]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z4.d\n"
-                    "zip2 z11.d, z1.d, z4.d\n"
-                    "zip1 z12.d, z2.d, z4.d\n"
-                    "whilelt p1.h, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z4.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z0.d, z8.d, z12.d\n"
-                    "zip2 z1.d, z8.d, z12.d\n"
-                    "zip1 z2.d, z9.d, z13.d\n"
-                    "whilelt p2.h, %[outpos], %[outwidth]\n"
-                    "zip2 z3.d, z9.d, z13.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z4.d, z10.d, z14.d\n"
-                    "zip2 z5.d, z10.d, z14.d\n"
-                    "zip1 z6.d, z11.d, z14.d\n"
-                    "whilelt p3.h, %[outpos], %[outwidth]\n"
-                    "zip2 z7.d, z11.d, z14.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "whilelt p4.h, %[outpos], %[outwidth]\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "st1h z8.h, p0, [%[outptr]]\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.d, z3.d, z7.d\n"
-                    "zip2 z15.d, z3.d, z7.d\n"
-                    "whilelt p5.h, %[outpos], %[outwidth]\n"
-                    "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
-                    "whilelt p6.h, %[outpos], %[outwidth]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.h, %[outpos], %[outwidth]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 4:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.h, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z4.h, #0\n"
-                    "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
-                    "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
-                    "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
-                    "ld1h z3.h, p0/z, [%[inptr3], %[inpos], LSL #1]\n"
-                    "inch %[inpos], all, mul #1\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "whilelt p0.h, %[outpos], %[outwidth]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z4.d\n"
-                    "zip2 z11.d, z1.d, z4.d\n"
-                    "zip1 z12.d, z2.d, z4.d\n"
-                    "whilelt p1.h, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z4.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z14.d, z3.d, z4.d\n"
-                    "zip2 z15.d, z3.d, z4.d\n"
-                    "zip1 z0.d, z8.d, z12.d\n"
-                    "whilelt p2.h, %[outpos], %[outwidth]\n"
-                    "zip2 z1.d, z8.d, z12.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z2.d, z9.d, z13.d\n"
-                    "zip2 z3.d, z9.d, z13.d\n"
-                    "zip1 z4.d, z10.d, z14.d\n"
-                    "whilelt p3.h, %[outpos], %[outwidth]\n"
-                    "zip2 z5.d, z10.d, z14.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z6.d, z11.d, z15.d\n"
-                    "zip2 z7.d, z11.d, z15.d\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "whilelt p4.h, %[outpos], %[outwidth]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "st1h z8.h, p0, [%[outptr]]\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p5.h, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.d, z3.d, z7.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip2 z15.d, z3.d, z7.d\n"
-                    "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.h, %[outpos], %[outwidth]\n"
-                    "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.h, %[outpos], %[outwidth]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 5:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.h, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z5.h, #0\n"
-                    "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
-                    "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
-                    "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
-                    "ld1h z3.h, p0/z, [%[inptr3], %[inpos], LSL #1]\n"
-                    "ld1h z4.h, p0/z, [%[inptr4], %[inpos], LSL #1]\n"
-                    "inch %[inpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "whilelt p0.h, %[outpos], %[outwidth]\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "zip1 z12.d, z2.d, z5.d\n"
-                    "whilelt p1.h, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z5.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z14.d, z3.d, z5.d\n"
-                    "zip2 z15.d, z3.d, z5.d\n"
-                    "zip1 z0.d, z8.d, z12.d\n"
-                    "whilelt p2.h, %[outpos], %[outwidth]\n"
-                    "zip2 z1.d, z8.d, z12.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z2.d, z9.d, z13.d\n"
-                    "zip2 z3.d, z9.d, z13.d\n"
-                    "zip1 z4.d, z10.d, z14.d\n"
-                    "whilelt p3.h, %[outpos], %[outwidth]\n"
-                    "zip2 z5.d, z10.d, z14.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z6.d, z11.d, z15.d\n"
-                    "zip2 z7.d, z11.d, z15.d\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "whilelt p4.h, %[outpos], %[outwidth]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "st1h z8.h, p0, [%[outptr]]\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p5.h, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.d, z3.d, z7.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip2 z15.d, z3.d, z7.d\n"
-                    "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.h, %[outpos], %[outwidth]\n"
-                    "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.h, %[outpos], %[outwidth]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 6:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.h, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z6.h, #0\n"
-                    "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
-                    "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
-                    "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
-                    "ld1h z3.h, p0/z, [%[inptr3], %[inpos], LSL #1]\n"
-                    "ld1h z4.h, p0/z, [%[inptr4], %[inpos], LSL #1]\n"
-                    "ld1h z5.h, p0/z, [%[inptr5], %[inpos], LSL #1]\n"
-                    "inch %[inpos], all, mul #1\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p0.h, %[outpos], %[outwidth]\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "whilelt p1.h, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z14.d, z3.d, z6.d\n"
-                    "zip2 z15.d, z3.d, z6.d\n"
-                    "zip1 z0.d, z8.d, z12.d\n"
-                    "whilelt p2.h, %[outpos], %[outwidth]\n"
-                    "zip2 z1.d, z8.d, z12.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z2.d, z9.d, z13.d\n"
-                    "zip2 z3.d, z9.d, z13.d\n"
-                    "zip1 z4.d, z10.d, z14.d\n"
-                    "whilelt p3.h, %[outpos], %[outwidth]\n"
-                    "zip2 z5.d, z10.d, z14.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z6.d, z11.d, z15.d\n"
-                    "zip2 z7.d, z11.d, z15.d\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "whilelt p4.h, %[outpos], %[outwidth]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "st1h z8.h, p0, [%[outptr]]\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p5.h, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.d, z3.d, z7.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip2 z15.d, z3.d, z7.d\n"
-                    "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.h, %[outpos], %[outwidth]\n"
-                    "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.h, %[outpos], %[outwidth]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 7:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.h, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z7.h, #0\n"
-                    "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
-                    "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
-                    "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
-                    "ld1h z3.h, p0/z, [%[inptr3], %[inpos], LSL #1]\n"
-                    "ld1h z4.h, p0/z, [%[inptr4], %[inpos], LSL #1]\n"
-                    "ld1h z5.h, p0/z, [%[inptr5], %[inpos], LSL #1]\n"
-                    "ld1h z6.h, p0/z, [%[inptr6], %[inpos], LSL #1]\n"
-                    "inch %[inpos], all, mul #1\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "whilelt p0.h, %[outpos], %[outwidth]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p1.h, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z14.d, z3.d, z7.d\n"
-                    "zip2 z15.d, z3.d, z7.d\n"
-                    "zip1 z0.d, z8.d, z12.d\n"
-                    "whilelt p2.h, %[outpos], %[outwidth]\n"
-                    "zip2 z1.d, z8.d, z12.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z2.d, z9.d, z13.d\n"
-                    "zip2 z3.d, z9.d, z13.d\n"
-                    "zip1 z4.d, z10.d, z14.d\n"
-                    "whilelt p3.h, %[outpos], %[outwidth]\n"
-                    "zip2 z5.d, z10.d, z14.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z6.d, z11.d, z15.d\n"
-                    "zip2 z7.d, z11.d, z15.d\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "whilelt p4.h, %[outpos], %[outwidth]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "st1h z8.h, p0, [%[outptr]]\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p5.h, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.d, z3.d, z7.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip2 z15.d, z3.d, z7.d\n"
-                    "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.h, %[outpos], %[outwidth]\n"
-                    "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.h, %[outpos], %[outwidth]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            default:
-            case 8:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.h, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
-                    "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
-                    "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
-                    "ld1h z3.h, p0/z, [%[inptr3], %[inpos], LSL #1]\n"
-                    "ld1h z4.h, p0/z, [%[inptr4], %[inpos], LSL #1]\n"
-                    "ld1h z5.h, p0/z, [%[inptr5], %[inpos], LSL #1]\n"
-                    "ld1h z6.h, p0/z, [%[inptr6], %[inpos], LSL #1]\n"
-                    "ld1h z7.h, p0/z, [%[inptr7], %[inpos], LSL #1]\n"
-                    "inch %[inpos], all, mul #1\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "whilelt p0.h, %[outpos], %[outwidth]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p1.h, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z14.d, z3.d, z7.d\n"
-                    "zip2 z15.d, z3.d, z7.d\n"
-                    "zip1 z0.d, z8.d, z12.d\n"
-                    "whilelt p2.h, %[outpos], %[outwidth]\n"
-                    "zip2 z1.d, z8.d, z12.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z2.d, z9.d, z13.d\n"
-                    "zip2 z3.d, z9.d, z13.d\n"
-                    "zip1 z4.d, z10.d, z14.d\n"
-                    "whilelt p3.h, %[outpos], %[outwidth]\n"
-                    "zip2 z5.d, z10.d, z14.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z6.d, z11.d, z15.d\n"
-                    "zip2 z7.d, z11.d, z15.d\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "whilelt p4.h, %[outpos], %[outwidth]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "st1h z8.h, p0, [%[outptr]]\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p5.h, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.d, z3.d, z7.d\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "zip2 z15.d, z3.d, z7.d\n"
-                    "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.h, %[outpos], %[outwidth]\n"
-                    "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.h, %[outpos], %[outwidth]\n"
-                    "inch %[outpos], all, mul #1\n"
-                    "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-
-        }
-    }
-}
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_8bit.hpp
deleted file mode 100644
index ed0d58aa91..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_8bit.hpp
+++ /dev/null
@@ -1,596 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-template<>
-template<typename T>
-inline void TransformImpl<8, 4, false, 1, 1, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
-{
-    uint8_t *master_outptr = reinterpret_cast<uint8_t *>(out);
-    const uint8_t *inptr = reinterpret_cast<const uint8_t *>(in);
-
-    for (int y=y0; y<ymax; y+=8)
-    {
-        const int height = ymax-y;
-        const long inwidth = (kmax - k0);
-        const long outwidth = ((inwidth + 3) / 4) * 32;
-        long inpos = 0;
-        long outpos = 0;
-
-        uint8_t *outptr = master_outptr;
-        master_outptr += outwidth;
-
-        const uint8_t *inptr0 = inptr + y * ldin + k0;
-        const uint8_t *inptr1 = inptr0 + ldin;
-        const uint8_t *inptr2 = inptr1 + ldin;
-        const uint8_t *inptr3 = inptr2 + ldin;
-        const uint8_t *inptr4 = inptr3 + ldin;
-        const uint8_t *inptr5 = inptr4 + ldin;
-        const uint8_t *inptr6 = inptr5 + ldin;
-        const uint8_t *inptr7 = inptr6 + ldin;
-
-        switch(height)
-        {
-            case 1:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.b, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z4.b, #0\n"
-                    "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
-                    "incb %[inpos], all, mul #1\n"
-                    "whilelt p0.b, %[outpos], %[outwidth]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "whilelt p1.b, %[outpos], %[outwidth]\n"
-                    "zip1 z0.s, z8.s, z4.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip2 z1.s, z8.s, z4.s\n"
-                    "zip1 z2.s, z9.s, z4.s\n"
-                    "zip2 z3.s, z9.s, z4.s\n"
-                    "whilelt p2.b, %[outpos], %[outwidth]\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "zip1 z10.s, z1.s, z4.s\n"
-                    "st1b z8.b, p0, [%[outptr]]\n"
-                    "zip2 z11.s, z1.s, z4.s\n"
-                    "whilelt p3.b, %[outpos], %[outwidth]\n"
-                    "zip1 z12.s, z2.s, z4.s\n"
-                    "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip2 z13.s, z2.s, z4.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z14.s, z3.s, z4.s\n"
-                    "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
-                    "zip2 z15.s, z3.s, z4.s\n"
-                    "whilelt p4.b, %[outpos], %[outwidth]\n"
-                    "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p5.b, %[outpos], %[outwidth]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
-                    "whilelt p6.b, %[outpos], %[outwidth]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
-                    "whilelt p7.b, %[outpos], %[outwidth]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 2:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.b, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z4.b, #0\n"
-                    "mov z14.b, #0\n"
-                    "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
-                    "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
-                    "incb %[inpos], all, mul #1\n"
-                    "whilelt p0.b, %[outpos], %[outwidth]\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "zip1 z10.s, z1.s, z4.s\n"
-                    "zip2 z11.s, z1.s, z4.s\n"
-                    "whilelt p1.b, %[outpos], %[outwidth]\n"
-                    "zip1 z0.s, z8.s, z4.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip2 z1.s, z8.s, z4.s\n"
-                    "zip1 z2.s, z9.s, z4.s\n"
-                    "zip2 z3.s, z9.s, z4.s\n"
-                    "whilelt p2.b, %[outpos], %[outwidth]\n"
-                    "zip1 z4.s, z10.s, z14.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip2 z5.s, z10.s, z14.s\n"
-                    "zip1 z6.s, z11.s, z14.s\n"
-                    "zip2 z7.s, z11.s, z14.s\n"
-                    "whilelt p3.b, %[outpos], %[outwidth]\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "st1b z8.b, p0, [%[outptr]]\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "whilelt p4.b, %[outpos], %[outwidth]\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "whilelt p5.b, %[outpos], %[outwidth]\n"
-                    "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p6.b, %[outpos], %[outwidth]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
-                    "whilelt p7.b, %[outpos], %[outwidth]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 3:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.b, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z4.b, #0\n"
-                    "mov z14.b, #0\n"
-                    "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
-                    "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
-                    "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
-                    "incb %[inpos], all, mul #1\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p0.b, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z4.s\n"
-                    "zip2 z11.s, z1.s, z4.s\n"
-                    "zip1 z12.s, z2.s, z4.s\n"
-                    "whilelt p1.b, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z4.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z0.s, z8.s, z12.s\n"
-                    "zip2 z1.s, z8.s, z12.s\n"
-                    "zip1 z2.s, z9.s, z13.s\n"
-                    "whilelt p2.b, %[outpos], %[outwidth]\n"
-                    "zip2 z3.s, z9.s, z13.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z4.s, z10.s, z14.s\n"
-                    "zip2 z5.s, z10.s, z14.s\n"
-                    "zip1 z6.s, z11.s, z14.s\n"
-                    "whilelt p3.b, %[outpos], %[outwidth]\n"
-                    "zip2 z7.s, z11.s, z14.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "whilelt p4.b, %[outpos], %[outwidth]\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "st1b z8.b, p0, [%[outptr]]\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "whilelt p5.b, %[outpos], %[outwidth]\n"
-                    "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
-                    "whilelt p6.b, %[outpos], %[outwidth]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.b, %[outpos], %[outwidth]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 4:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.b, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z4.b, #0\n"
-                    "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
-                    "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
-                    "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
-                    "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
-                    "incb %[inpos], all, mul #1\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p0.b, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z4.s\n"
-                    "zip2 z11.s, z1.s, z4.s\n"
-                    "zip1 z12.s, z2.s, z4.s\n"
-                    "whilelt p1.b, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z4.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z14.s, z3.s, z4.s\n"
-                    "zip2 z15.s, z3.s, z4.s\n"
-                    "zip1 z0.s, z8.s, z12.s\n"
-                    "whilelt p2.b, %[outpos], %[outwidth]\n"
-                    "zip2 z1.s, z8.s, z12.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z2.s, z9.s, z13.s\n"
-                    "zip2 z3.s, z9.s, z13.s\n"
-                    "zip1 z4.s, z10.s, z14.s\n"
-                    "whilelt p3.b, %[outpos], %[outwidth]\n"
-                    "zip2 z5.s, z10.s, z14.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z6.s, z11.s, z15.s\n"
-                    "zip2 z7.s, z11.s, z15.s\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p4.b, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "st1b z8.b, p0, [%[outptr]]\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p5.b, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.b, %[outpos], %[outwidth]\n"
-                    "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.b, %[outpos], %[outwidth]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 5:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.b, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z5.b, #0\n"
-                    "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
-                    "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
-                    "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
-                    "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
-                    "ld1b z4.b, p0/z, [%[inptr4], %[inpos]]\n"
-                    "incb %[inpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "whilelt p0.b, %[outpos], %[outwidth]\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "zip1 z12.s, z2.s, z5.s\n"
-                    "whilelt p1.b, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z5.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z14.s, z3.s, z5.s\n"
-                    "zip2 z15.s, z3.s, z5.s\n"
-                    "zip1 z0.s, z8.s, z12.s\n"
-                    "whilelt p2.b, %[outpos], %[outwidth]\n"
-                    "zip2 z1.s, z8.s, z12.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z2.s, z9.s, z13.s\n"
-                    "zip2 z3.s, z9.s, z13.s\n"
-                    "zip1 z4.s, z10.s, z14.s\n"
-                    "whilelt p3.b, %[outpos], %[outwidth]\n"
-                    "zip2 z5.s, z10.s, z14.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z6.s, z11.s, z15.s\n"
-                    "zip2 z7.s, z11.s, z15.s\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p4.b, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "st1b z8.b, p0, [%[outptr]]\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p5.b, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.b, %[outpos], %[outwidth]\n"
-                    "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.b, %[outpos], %[outwidth]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 6:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.b, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z6.b, #0\n"
-                    "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
-                    "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
-                    "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
-                    "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
-                    "ld1b z4.b, p0/z, [%[inptr4], %[inpos]]\n"
-                    "ld1b z5.b, p0/z, [%[inptr5], %[inpos]]\n"
-                    "incb %[inpos], all, mul #1\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p0.b, %[outpos], %[outwidth]\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "whilelt p1.b, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z14.s, z3.s, z6.s\n"
-                    "zip2 z15.s, z3.s, z6.s\n"
-                    "zip1 z0.s, z8.s, z12.s\n"
-                    "whilelt p2.b, %[outpos], %[outwidth]\n"
-                    "zip2 z1.s, z8.s, z12.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z2.s, z9.s, z13.s\n"
-                    "zip2 z3.s, z9.s, z13.s\n"
-                    "zip1 z4.s, z10.s, z14.s\n"
-                    "whilelt p3.b, %[outpos], %[outwidth]\n"
-                    "zip2 z5.s, z10.s, z14.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z6.s, z11.s, z15.s\n"
-                    "zip2 z7.s, z11.s, z15.s\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p4.b, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "st1b z8.b, p0, [%[outptr]]\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p5.b, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.b, %[outpos], %[outwidth]\n"
-                    "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.b, %[outpos], %[outwidth]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 7:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.b, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z7.b, #0\n"
-                    "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
-                    "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
-                    "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
-                    "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
-                    "ld1b z4.b, p0/z, [%[inptr4], %[inpos]]\n"
-                    "ld1b z5.b, p0/z, [%[inptr5], %[inpos]]\n"
-                    "ld1b z6.b, p0/z, [%[inptr6], %[inpos]]\n"
-                    "incb %[inpos], all, mul #1\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p0.b, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p1.b, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "zip1 z0.s, z8.s, z12.s\n"
-                    "whilelt p2.b, %[outpos], %[outwidth]\n"
-                    "zip2 z1.s, z8.s, z12.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z2.s, z9.s, z13.s\n"
-                    "zip2 z3.s, z9.s, z13.s\n"
-                    "zip1 z4.s, z10.s, z14.s\n"
-                    "whilelt p3.b, %[outpos], %[outwidth]\n"
-                    "zip2 z5.s, z10.s, z14.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z6.s, z11.s, z15.s\n"
-                    "zip2 z7.s, z11.s, z15.s\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p4.b, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "st1b z8.b, p0, [%[outptr]]\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p5.b, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.b, %[outpos], %[outwidth]\n"
-                    "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.b, %[outpos], %[outwidth]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            default:
-            case 8:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.b, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
-                    "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
-                    "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
-                    "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
-                    "ld1b z4.b, p0/z, [%[inptr4], %[inpos]]\n"
-                    "ld1b z5.b, p0/z, [%[inptr5], %[inpos]]\n"
-                    "ld1b z6.b, p0/z, [%[inptr6], %[inpos]]\n"
-                    "ld1b z7.b, p0/z, [%[inptr7], %[inpos]]\n"
-                    "incb %[inpos], all, mul #1\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p0.b, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p1.b, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "zip1 z0.s, z8.s, z12.s\n"
-                    "whilelt p2.b, %[outpos], %[outwidth]\n"
-                    "zip2 z1.s, z8.s, z12.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z2.s, z9.s, z13.s\n"
-                    "zip2 z3.s, z9.s, z13.s\n"
-                    "zip1 z4.s, z10.s, z14.s\n"
-                    "whilelt p3.b, %[outpos], %[outwidth]\n"
-                    "zip2 z5.s, z10.s, z14.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z6.s, z11.s, z15.s\n"
-                    "zip2 z7.s, z11.s, z15.s\n"
-                    "zip1 z8.s, z0.s, z4.s\n"
-                    "whilelt p4.b, %[outpos], %[outwidth]\n"
-                    "zip2 z9.s, z0.s, z4.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z10.s, z1.s, z5.s\n"
-                    "st1b z8.b, p0, [%[outptr]]\n"
-                    "zip2 z11.s, z1.s, z5.s\n"
-                    "zip1 z12.s, z2.s, z6.s\n"
-                    "whilelt p5.b, %[outpos], %[outwidth]\n"
-                    "zip2 z13.s, z2.s, z6.s\n"
-                    "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.s, z3.s, z7.s\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip2 z15.s, z3.s, z7.s\n"
-                    "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.b, %[outpos], %[outwidth]\n"
-                    "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.b, %[outpos], %[outwidth]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-
-        }
-    }
-}
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block8_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block8_8bit.hpp
deleted file mode 100644
index b4935e6417..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block8_8bit.hpp
+++ /dev/null
@@ -1,596 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-template<>
-template<typename T>
-inline void TransformImpl<8, 8, false, 1, 1, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
-{
-    uint8_t *master_outptr = reinterpret_cast<uint8_t *>(out);
-    const uint8_t *inptr = reinterpret_cast<const uint8_t *>(in);
-
-    for (int y=y0; y<ymax; y+=8)
-    {
-        const int height = ymax-y;
-        const long inwidth = (kmax - k0);
-        const long outwidth = ((inwidth + 7) / 8) * 64;
-        long inpos = 0;
-        long outpos = 0;
-
-        uint8_t *outptr = master_outptr;
-        master_outptr += outwidth;
-
-        const uint8_t *inptr0 = inptr + y * ldin + k0;
-        const uint8_t *inptr1 = inptr0 + ldin;
-        const uint8_t *inptr2 = inptr1 + ldin;
-        const uint8_t *inptr3 = inptr2 + ldin;
-        const uint8_t *inptr4 = inptr3 + ldin;
-        const uint8_t *inptr5 = inptr4 + ldin;
-        const uint8_t *inptr6 = inptr5 + ldin;
-        const uint8_t *inptr7 = inptr6 + ldin;
-
-        switch(height)
-        {
-            case 1:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.b, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z4.b, #0\n"
-                    "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
-                    "incb %[inpos], all, mul #1\n"
-                    "whilelt p0.b, %[outpos], %[outwidth]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "whilelt p1.b, %[outpos], %[outwidth]\n"
-                    "zip1 z0.d, z8.d, z4.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip2 z1.d, z8.d, z4.d\n"
-                    "zip1 z2.d, z9.d, z4.d\n"
-                    "zip2 z3.d, z9.d, z4.d\n"
-                    "whilelt p2.b, %[outpos], %[outwidth]\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "zip1 z10.d, z1.d, z4.d\n"
-                    "st1b z8.b, p0, [%[outptr]]\n"
-                    "zip2 z11.d, z1.d, z4.d\n"
-                    "whilelt p3.b, %[outpos], %[outwidth]\n"
-                    "zip1 z12.d, z2.d, z4.d\n"
-                    "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip2 z13.d, z2.d, z4.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z14.d, z3.d, z4.d\n"
-                    "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
-                    "zip2 z15.d, z3.d, z4.d\n"
-                    "whilelt p4.b, %[outpos], %[outwidth]\n"
-                    "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p5.b, %[outpos], %[outwidth]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
-                    "whilelt p6.b, %[outpos], %[outwidth]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
-                    "whilelt p7.b, %[outpos], %[outwidth]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 2:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.b, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z4.b, #0\n"
-                    "mov z14.b, #0\n"
-                    "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
-                    "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
-                    "incb %[inpos], all, mul #1\n"
-                    "whilelt p0.b, %[outpos], %[outwidth]\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "zip1 z10.d, z1.d, z4.d\n"
-                    "zip2 z11.d, z1.d, z4.d\n"
-                    "whilelt p1.b, %[outpos], %[outwidth]\n"
-                    "zip1 z0.d, z8.d, z4.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip2 z1.d, z8.d, z4.d\n"
-                    "zip1 z2.d, z9.d, z4.d\n"
-                    "zip2 z3.d, z9.d, z4.d\n"
-                    "whilelt p2.b, %[outpos], %[outwidth]\n"
-                    "zip1 z4.d, z10.d, z14.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip2 z5.d, z10.d, z14.d\n"
-                    "zip1 z6.d, z11.d, z14.d\n"
-                    "zip2 z7.d, z11.d, z14.d\n"
-                    "whilelt p3.b, %[outpos], %[outwidth]\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "st1b z8.b, p0, [%[outptr]]\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "whilelt p4.b, %[outpos], %[outwidth]\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z14.d, z3.d, z7.d\n"
-                    "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
-                    "zip2 z15.d, z3.d, z7.d\n"
-                    "whilelt p5.b, %[outpos], %[outwidth]\n"
-                    "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p6.b, %[outpos], %[outwidth]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
-                    "whilelt p7.b, %[outpos], %[outwidth]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 3:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.b, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z4.b, #0\n"
-                    "mov z14.b, #0\n"
-                    "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
-                    "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
-                    "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
-                    "incb %[inpos], all, mul #1\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "whilelt p0.b, %[outpos], %[outwidth]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z4.d\n"
-                    "zip2 z11.d, z1.d, z4.d\n"
-                    "zip1 z12.d, z2.d, z4.d\n"
-                    "whilelt p1.b, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z4.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z0.d, z8.d, z12.d\n"
-                    "zip2 z1.d, z8.d, z12.d\n"
-                    "zip1 z2.d, z9.d, z13.d\n"
-                    "whilelt p2.b, %[outpos], %[outwidth]\n"
-                    "zip2 z3.d, z9.d, z13.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z4.d, z10.d, z14.d\n"
-                    "zip2 z5.d, z10.d, z14.d\n"
-                    "zip1 z6.d, z11.d, z14.d\n"
-                    "whilelt p3.b, %[outpos], %[outwidth]\n"
-                    "zip2 z7.d, z11.d, z14.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "whilelt p4.b, %[outpos], %[outwidth]\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "st1b z8.b, p0, [%[outptr]]\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.d, z3.d, z7.d\n"
-                    "zip2 z15.d, z3.d, z7.d\n"
-                    "whilelt p5.b, %[outpos], %[outwidth]\n"
-                    "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
-                    "whilelt p6.b, %[outpos], %[outwidth]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.b, %[outpos], %[outwidth]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 4:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.b, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z4.b, #0\n"
-                    "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
-                    "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
-                    "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
-                    "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
-                    "incb %[inpos], all, mul #1\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "whilelt p0.b, %[outpos], %[outwidth]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z4.d\n"
-                    "zip2 z11.d, z1.d, z4.d\n"
-                    "zip1 z12.d, z2.d, z4.d\n"
-                    "whilelt p1.b, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z4.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z14.d, z3.d, z4.d\n"
-                    "zip2 z15.d, z3.d, z4.d\n"
-                    "zip1 z0.d, z8.d, z12.d\n"
-                    "whilelt p2.b, %[outpos], %[outwidth]\n"
-                    "zip2 z1.d, z8.d, z12.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z2.d, z9.d, z13.d\n"
-                    "zip2 z3.d, z9.d, z13.d\n"
-                    "zip1 z4.d, z10.d, z14.d\n"
-                    "whilelt p3.b, %[outpos], %[outwidth]\n"
-                    "zip2 z5.d, z10.d, z14.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z6.d, z11.d, z15.d\n"
-                    "zip2 z7.d, z11.d, z15.d\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "whilelt p4.b, %[outpos], %[outwidth]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "st1b z8.b, p0, [%[outptr]]\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p5.b, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.d, z3.d, z7.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip2 z15.d, z3.d, z7.d\n"
-                    "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.b, %[outpos], %[outwidth]\n"
-                    "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.b, %[outpos], %[outwidth]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 5:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.b, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z5.b, #0\n"
-                    "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
-                    "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
-                    "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
-                    "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
-                    "ld1b z4.b, p0/z, [%[inptr4], %[inpos]]\n"
-                    "incb %[inpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "whilelt p0.b, %[outpos], %[outwidth]\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "zip1 z12.d, z2.d, z5.d\n"
-                    "whilelt p1.b, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z5.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z14.d, z3.d, z5.d\n"
-                    "zip2 z15.d, z3.d, z5.d\n"
-                    "zip1 z0.d, z8.d, z12.d\n"
-                    "whilelt p2.b, %[outpos], %[outwidth]\n"
-                    "zip2 z1.d, z8.d, z12.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z2.d, z9.d, z13.d\n"
-                    "zip2 z3.d, z9.d, z13.d\n"
-                    "zip1 z4.d, z10.d, z14.d\n"
-                    "whilelt p3.b, %[outpos], %[outwidth]\n"
-                    "zip2 z5.d, z10.d, z14.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z6.d, z11.d, z15.d\n"
-                    "zip2 z7.d, z11.d, z15.d\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "whilelt p4.b, %[outpos], %[outwidth]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "st1b z8.b, p0, [%[outptr]]\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p5.b, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.d, z3.d, z7.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip2 z15.d, z3.d, z7.d\n"
-                    "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.b, %[outpos], %[outwidth]\n"
-                    "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.b, %[outpos], %[outwidth]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 6:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.b, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z6.b, #0\n"
-                    "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
-                    "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
-                    "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
-                    "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
-                    "ld1b z4.b, p0/z, [%[inptr4], %[inpos]]\n"
-                    "ld1b z5.b, p0/z, [%[inptr5], %[inpos]]\n"
-                    "incb %[inpos], all, mul #1\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p0.b, %[outpos], %[outwidth]\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "whilelt p1.b, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z14.d, z3.d, z6.d\n"
-                    "zip2 z15.d, z3.d, z6.d\n"
-                    "zip1 z0.d, z8.d, z12.d\n"
-                    "whilelt p2.b, %[outpos], %[outwidth]\n"
-                    "zip2 z1.d, z8.d, z12.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z2.d, z9.d, z13.d\n"
-                    "zip2 z3.d, z9.d, z13.d\n"
-                    "zip1 z4.d, z10.d, z14.d\n"
-                    "whilelt p3.b, %[outpos], %[outwidth]\n"
-                    "zip2 z5.d, z10.d, z14.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z6.d, z11.d, z15.d\n"
-                    "zip2 z7.d, z11.d, z15.d\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "whilelt p4.b, %[outpos], %[outwidth]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "st1b z8.b, p0, [%[outptr]]\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p5.b, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.d, z3.d, z7.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip2 z15.d, z3.d, z7.d\n"
-                    "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.b, %[outpos], %[outwidth]\n"
-                    "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.b, %[outpos], %[outwidth]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            case 7:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.b, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "mov z7.b, #0\n"
-                    "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
-                    "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
-                    "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
-                    "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
-                    "ld1b z4.b, p0/z, [%[inptr4], %[inpos]]\n"
-                    "ld1b z5.b, p0/z, [%[inptr5], %[inpos]]\n"
-                    "ld1b z6.b, p0/z, [%[inptr6], %[inpos]]\n"
-                    "incb %[inpos], all, mul #1\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "whilelt p0.b, %[outpos], %[outwidth]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p1.b, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z14.d, z3.d, z7.d\n"
-                    "zip2 z15.d, z3.d, z7.d\n"
-                    "zip1 z0.d, z8.d, z12.d\n"
-                    "whilelt p2.b, %[outpos], %[outwidth]\n"
-                    "zip2 z1.d, z8.d, z12.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z2.d, z9.d, z13.d\n"
-                    "zip2 z3.d, z9.d, z13.d\n"
-                    "zip1 z4.d, z10.d, z14.d\n"
-                    "whilelt p3.b, %[outpos], %[outwidth]\n"
-                    "zip2 z5.d, z10.d, z14.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z6.d, z11.d, z15.d\n"
-                    "zip2 z7.d, z11.d, z15.d\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "whilelt p4.b, %[outpos], %[outwidth]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "st1b z8.b, p0, [%[outptr]]\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p5.b, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.d, z3.d, z7.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip2 z15.d, z3.d, z7.d\n"
-                    "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.b, %[outpos], %[outwidth]\n"
-                    "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.b, %[outpos], %[outwidth]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-            default:
-            case 8:
-                __asm __volatile(
-                    "1:\n"
-                    "whilelt p0.b, %[inpos], %[inwidth]\n"
-                    "b.none 2f\n"
-                    "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
-                    "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
-                    "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
-                    "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
-                    "ld1b z4.b, p0/z, [%[inptr4], %[inpos]]\n"
-                    "ld1b z5.b, p0/z, [%[inptr5], %[inpos]]\n"
-                    "ld1b z6.b, p0/z, [%[inptr6], %[inpos]]\n"
-                    "ld1b z7.b, p0/z, [%[inptr7], %[inpos]]\n"
-                    "incb %[inpos], all, mul #1\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "whilelt p0.b, %[outpos], %[outwidth]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p1.b, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z14.d, z3.d, z7.d\n"
-                    "zip2 z15.d, z3.d, z7.d\n"
-                    "zip1 z0.d, z8.d, z12.d\n"
-                    "whilelt p2.b, %[outpos], %[outwidth]\n"
-                    "zip2 z1.d, z8.d, z12.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z2.d, z9.d, z13.d\n"
-                    "zip2 z3.d, z9.d, z13.d\n"
-                    "zip1 z4.d, z10.d, z14.d\n"
-                    "whilelt p3.b, %[outpos], %[outwidth]\n"
-                    "zip2 z5.d, z10.d, z14.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z6.d, z11.d, z15.d\n"
-                    "zip2 z7.d, z11.d, z15.d\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "whilelt p4.b, %[outpos], %[outwidth]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z5.d\n"
-                    "st1b z8.b, p0, [%[outptr]]\n"
-                    "zip2 z11.d, z1.d, z5.d\n"
-                    "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p5.b, %[outpos], %[outwidth]\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
-                    "zip1 z14.d, z3.d, z7.d\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "zip2 z15.d, z3.d, z7.d\n"
-                    "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
-                    "whilelt p6.b, %[outpos], %[outwidth]\n"
-                    "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
-                    "whilelt p7.b, %[outpos], %[outwidth]\n"
-                    "incb %[outpos], all, mul #1\n"
-                    "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
-                    "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
-                    "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
-                    "addvl %[outptr], %[outptr], #8\n"
-                    "b 1b\n"
-                    "2:\n"
-                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7)
-                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
-                );
-                break;
-
-
-        }
-    }
-}
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp b/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp
index aac5e19ebe..a3216c494f 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp
@@ -23,6 +23,8 @@
  */
 #pragma once
 
+#include "../asmlib.hpp"
+
 template <unsigned int IntBy, typename TIn, typename TOut>
 struct TransposeInterleaveCommon {
   // Override the moveblock_1xY methods to improve performance
diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp
index 6e47a97c78..6d483a3b9d 100644
--- a/src/core/NEON/kernels/arm_gemm/utils.hpp
+++ b/src/core/NEON/kernels/arm_gemm/utils.hpp
@@ -24,6 +24,8 @@
 
 #pragma once
 
+#include "arm_gemm.hpp"
+
 #include <cstddef>
 
 // Macro for unreachable code (e.g. impossible default cases on switch)
@@ -32,6 +34,8 @@
 // Paranoid option for the above with assert
 // #define UNREACHABLE(why)   assert(0 && why)
 
+namespace arm_gemm {
+
 template<typename T>
 inline T iceildiv(const T a, const T b) {
     return (a + b - 1) / b;
@@ -48,7 +52,94 @@ inline T roundup(const T a, const T b) {
     }
 }
 
-namespace arm_gemm {
+enum class VLType { // Vector-length tag; presumably distinguishes fixed-length from SVE code paths — TODO confirm at use sites
+    None,
+    SVE,
+};
+
+template<typename T>
+struct IndirectOutputArg { // Output descriptor holding either a base pointer + stride ("direct") or a pointer table + offset ("indirect")
+    struct {
+        T       *base;
+        size_t   stride;
+    } direct = {}; // value-initialised; then filled in by the direct and default constructors
+    struct {
+        T * const *ptr;
+        size_t     offset;
+    } indirect = {}; // value-initialised; then filled in by the indirect constructor
+    bool is_indirect; // true only when constructed through the (ptr, offset) overload
+
+    // Direct: record the base pointer and stride; 'indirect' keeps its zeroed default.
+    IndirectOutputArg(T *base, size_t stride) : is_indirect(false) {
+        direct.base = base;
+        direct.stride = stride;
+    }
+
+    // Indirect: record the pointer table and offset; 'direct' keeps its zeroed default.
+    IndirectOutputArg(T * const * ptr, size_t offset) : is_indirect(true) {
+        indirect.ptr = ptr;
+        indirect.offset = offset;
+    }
+
+    IndirectOutputArg() : is_indirect(false) { // Default: direct mode with a null base and zero stride
+        direct.base = nullptr;
+        direct.stride = 0;
+    }
+};
+
+// Returns true when the provided Requantize32 carries no left shift.
+inline bool quant_no_left_shift(const Requantize32 &qp) {
+    if (qp.per_channel_requant) {
+        return (qp.per_channel_left_shifts == nullptr); // per-channel: only a missing left-shift array counts as "no shift"
+    } else {
+        return (qp.per_layer_left_shift == 0); // per-layer: the single left shift must be zero
+    }
+}
+
+// Check that the provided Requantize32 is compatible with the "symmetric" hybrid kernels.  These don't include row
+// sums, so the 'b_offset' has to be zero; a left shift (see quant_no_left_shift) is not accepted either.
+inline bool quant_hybrid_symmetric(const Requantize32 &qp) {
+    return quant_no_left_shift(qp) && qp.b_offset == 0;
+}
+
+// Check that the provided Requantize32 is compatible with the "asymmetric" hybrid kernels.  These don't support per
+// channel quantization or a left shift.  Technically b_offset==0 cases would work, but it is a waste to sum and then multiply by 0; the b_offset != 0 test below is commented out, so such cases are still accepted here.
+inline bool quant_hybrid_asymmetric(const Requantize32 &qp) {
+    return quant_no_left_shift(qp) /*  && qp.b_offset != 0 */ && qp.per_channel_requant==false;
+}
+
+template<typename T>
+struct IndirectInputArg { // Input descriptor holding either a base pointer + stride ("direct") or a pointer table + start row/column ("indirect")
+    struct {
+        const T *base;
+        size_t   stride;
+    } direct = {}; // value-initialised; then filled in by the direct and default constructors
+    struct {
+        const T * const * const * ptr;
+        unsigned int start_row;
+        unsigned int start_col;
+    } indirect = {}; // value-initialised; then filled in by the indirect constructor
+    bool is_indirect; // true only when constructed through the (ptr, start_row, start_col) overload
+
+    // Direct: record the base pointer and stride; 'indirect' keeps its zeroed default.
+    IndirectInputArg(const T *base, size_t stride) : is_indirect(false) {
+        direct.base = base;
+        direct.stride = stride;
+    }
+
+    // Indirect: record the pointer table together with the starting row and column.
+    IndirectInputArg(const T * const * const *ptr, unsigned int start_row, unsigned int start_col) : is_indirect(true) {
+        indirect.ptr = ptr;
+        indirect.start_row = start_row;
+        indirect.start_col = start_col;
+    }
+
+    IndirectInputArg() : is_indirect(false) { // Default: direct mode with a null base and zero stride
+        direct.base = nullptr;
+        direct.stride = 0;
+    }
+};
+
 namespace utils {
 namespace {
 
diff --git a/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp b/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp
deleted file mode 100644
index 760274dba1..0000000000
--- a/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/WindowIterator.h"
-
-using namespace arm_compute;
-
-INEGEMMWrapperKernel::INEGEMMWrapperKernel()
-    : _a(nullptr), _b(nullptr), _c(nullptr), _params(), _gemm_info(), _window3d(), _window_shape()
-{
-}
-
-INEGEMMWrapperKernel::Params INEGEMMWrapperKernel::extract_parameters(const ITensor *a, const ITensor *b, const ITensor *c, const GEMMInfo &gemm_info)
-{
-    Params p;
-
-    ARM_COMPUTE_ERROR_ON_NULLPTR(a);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(b);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(c);
-
-    // Initalize params
-    p.M       = c->info()->tensor_shape().y();
-    p.N       = c->info()->tensor_shape().x();
-    p.K       = a->info()->tensor_shape().x();
-    p.multis  = b->info()->tensor_shape().z();
-    p.batches = c->info()->tensor_shape().total_size_upper(2) / p.multis; //COMPMID-1423: Agree on and document the layout of gemm inputs/outputs
-
-    // Update M in case of GEMM3D for output
-    if(gemm_info.depth_output_gemm3d() != 0)
-    {
-        p.M       = c->info()->tensor_shape().y() * c->info()->tensor_shape().z();
-        p.batches = c->info()->tensor_shape().total_size_upper(3) / p.multis;
-    }
-
-    return p;
-}
-
-void INEGEMMWrapperKernel::configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, const GEMMInfo &gemm_info)
-{
-    _gemm_info = gemm_info;
-    _params    = extract_parameters(a, b, c, gemm_info);
-    _a         = a;
-    _b         = b;
-    _c         = c;
-
-    _window3d     = configure_internal(alpha, beta);
-    _window_shape = _window3d.shape();
-
-    // Convert the 3D window into a 1D window in order to allow the scheduler to arbitrary split it.
-    Window collapsed;
-    collapsed.set(0, Window::Dimension(0, _window3d.num_iterations_total()));
-
-    INEKernel::configure(collapsed);
-}
-
-void INEGEMMWrapperKernel::run(const Window &window, const ThreadInfo &info)
-{
-    const Coordinates start_offset = index2coords(_window_shape, window.x().start());
-    const Coordinates end_offset   = index2coords(_window_shape, window.x().end() - 1);
-
-    run_internal(_window3d, start_offset, end_offset, info);
-}
diff --git a/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h b/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h
deleted file mode 100644
index 92c013260b..0000000000
--- a/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_INEGEMMWRAPPERKERNEL_H
-#define SRC_INEGEMMWRAPPERKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Common interface for all the arm_gemm Gemms
- */
-class INEGEMMWrapperKernel : public INEKernel
-{
-public:
-    /** Parameters defining the dimensions of the matrices being multiplied */
-    struct Params
-    {
-        unsigned int M{ 0 };       /**< Rows in output matrix C (and input matrix A). */
-        unsigned int N{ 0 };       /**< Columns in output matrix C (and input matrix B). */
-        unsigned int K{ 0 };       /**< Columns of input matrix A (= rows of input matrix B). */
-        unsigned int batches{ 0 }; /**< Number of "batched" GEMMs (unique A and C, shared B). */
-        unsigned int multis{ 0 };  /**< Number of "multi" GEMMs (unique A, B and C). */
-    };
-
-    static Params extract_parameters(const ITensor *a, const ITensor *b, const ITensor *c, const GEMMInfo &gemm_info);
-
-    /** Constructor */
-    INEGEMMWrapperKernel();
-    /** Prevent instances of this class from being copied */
-    INEGEMMWrapperKernel(const INEGEMMWrapperKernel &) = delete;
-    /** Prevent instances of this class from being copied */
-    INEGEMMWrapperKernel &operator=(const INEGEMMWrapperKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    INEGEMMWrapperKernel(INEGEMMWrapperKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    INEGEMMWrapperKernel &operator=(INEGEMMWrapperKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @note The input and output tensor must have the same dimensions
-     *
-     * @param[in]  a         Input tensor (Matrix A)
-     * @param[in]  b         Input tensor (Matrix B)
-     * @param[out] c         Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
-     * @param[in]  alpha     Scalar multiplier to apply to AB matrix product.
-     * @param[in]  beta      Scalar multiplier to apply to input C matrix before adding product.
-     * @param[in]  gemm_info GEMM meta-data
-     */
-    void configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, const GEMMInfo &gemm_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-protected:
-    /** Called as part of configure() after _a, _b, _c and _params have been set.
-     *
-     * @param[in] alpha Scalar multiplier to apply to AB matrix product.
-     * @param[in] beta  Scalar multiplier to apply to input C matrix before adding product.
-     *
-     * @return A 3D execution window.
-     */
-    virtual Window configure_internal(float alpha, float beta) = 0;
-
-    /** Run the kernel from the start to the end offset in window.
-     *
-     * @param[in] window       Window to use for the iteration
-     * @param[in] start_offset Where to start iterating from (In Window coordinates)
-     * @param[in] end_offset   Where to stop iterating (In Window coordinates).
-     * @param[in] info         Info about executing thread and CPU.
-     */
-    virtual void run_internal(const Window &window, const Coordinates &start_offset, const Coordinates &end_offset, const ThreadInfo &info) = 0;
-
-    const ITensor *_a;
-    const ITensor *_b;
-    ITensor       *_c;
-    Params         _params;
-    GEMMInfo       _gemm_info;
-
-private:
-    Window      _window3d;
-    TensorShape _window_shape;
-};
-
-} // namespace arm_compute
-
-#endif /* SRC_INEGEMMRAPPERKERNEL_H */
diff --git a/src/core/NEON/kernels/assembly/arm_gemm.hpp b/src/core/NEON/kernels/assembly/arm_gemm.hpp
index f6421c12ab..3088b080d6 100644
--- a/src/core/NEON/kernels/assembly/arm_gemm.hpp
+++ b/src/core/NEON/kernels/assembly/arm_gemm.hpp
@@ -43,7 +43,9 @@ enum class GemmMethod
     GEMM_INTERLEAVED_2D,
     QUANTIZE_WRAPPER,
     QUANTIZE_WRAPPER_2D,
-    GEMM_HYBRID_QUANTIZED
+    GEMM_HYBRID_QUANTIZED,
+    INDIRECT_GEMM,
+    CONVOLUTION_GEMM
 };
 
 struct KernelDescription
@@ -104,17 +106,19 @@ public:
     unsigned int      _Msize;
     unsigned int      _Nsize;
     unsigned int      _Ksize;
+    unsigned int      _Ksections;
     unsigned int      _nbatches;
     unsigned int      _nmulti;
+    bool              _indirect_input;
     Activation        _act;
     int               _maxthreads;
     const GemmConfig *_cfg;
 
-    GemmArgs(const CPUInfo *ci, const unsigned int M, const unsigned int N,
-             const unsigned int K, const unsigned int nbatches,
-             const unsigned int nmulti, Activation act, const int maxthreads,
+    GemmArgs(const CPUInfo *ci, unsigned int M, unsigned int N,
+             unsigned int K, unsigned int Ksections, unsigned int nbatches,
+             unsigned int nmulti, bool indirect_input, Activation act, const int maxthreads,
              const GemmConfig *cfg = nullptr)
-        : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmulti(nmulti), _act(act), _maxthreads(maxthreads), _cfg(cfg)
+        : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _Ksections(Ksections), _nbatches(nbatches), _nmulti(nmulti), _indirect_input(indirect_input), _act(act), _maxthreads(maxthreads), _cfg(cfg)
     {
     }
 };
@@ -143,8 +147,8 @@ public:
     Requantize32(const int32_t *bias, size_t bias_multi_stride,
                  int32_t a_offset, int32_t b_offset, int32_t c_offset,
                  int32_t requant_shift, int32_t requant_mul, int32_t minv, int32_t maxv)
-        : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(false), per_layer_left_shift(std::max(requant_shift, int32_t(0))),
-          per_layer_right_shift(std::min(requant_shift, int32_t(0))), per_layer_mul(requant_mul), minval(minv), maxval(maxv)
+        : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(false), per_layer_left_shift(std::max<int32_t>(requant_shift, 0)),
+          per_layer_right_shift(std::min<int32_t>(requant_shift, 0)), per_layer_mul(requant_mul), minval(minv), maxval(maxv)
     {
     }
 
diff --git a/src/core/NEON/kernels/assembly/convolution_parameters.hpp b/src/core/NEON/kernels/assembly/convolution_parameters.hpp
new file mode 100644
index 0000000000..d0ef5b539f
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/convolution_parameters.hpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <cstdint>
+
+namespace arm_gemm
+{
+/*
+ * Parameter set for "convolution" type GEMM.
+ *
+ * For a "convolution" GEMM, the GEMM parameters (M, K) are specified as if
+ * an im2row had been performed on the input tensor to generate the operand
+ * matrix, but instead this structure describes the convolution parameters
+ * such that this can be done on the fly.
+ *
+ * The parameters describe the convolution details - the notional shape of
+ * the input and output tensors, whether padding is to be applied, the size
+ * of the kernel and a constant value to be used for padding (needed for
+ * quantized tensors).
+ *
+ * The second part describes the layout of the input tensor in memory, which
+ * is assumed to be in NHWC format.  This consists of a base pointer and
+ * strides for columns, rows and batches.  'multis' are not supported for
+ * convolution type GEMMs.
+ */
+struct ConvolutionParameters
+{ // NOTE(review): the comment above mentions a base pointer and batch stride, but no such members are declared here
+    int64_t input_width;    // notional input tensor shape (width, height, channels)
+    int64_t input_height;
+    int64_t input_channels;
+    int64_t kernel_width;   // size of the convolution kernel
+    int64_t kernel_height;
+    int64_t output_width;   // notional output tensor shape
+    int64_t output_height;
+    int64_t output_stride_w; // presumably the convolution stride along width/height — TODO confirm
+    int64_t output_stride_h;
+    //          output_channels not included as they do not affect the input.
+    int64_t padding_top;    // padding applied at the top / left edge
+    int64_t padding_left;
+    float   padding_value;  // constant used for padded elements (per the comment above, needed for quantized tensors)
+};
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/assembly/gemm_common.hpp b/src/core/NEON/kernels/assembly/gemm_common.hpp
index e9e56842c7..e1fb7a45a8 100644
--- a/src/core/NEON/kernels/assembly/gemm_common.hpp
+++ b/src/core/NEON/kernels/assembly/gemm_common.hpp
@@ -23,6 +23,7 @@
  */
 #pragma once
 
+#include "convolution_parameters.hpp"
 #include "ndrange.hpp"
 
 #include <cstddef>
@@ -77,7 +78,7 @@ public:
         return false;
     }
 
-    /** Main execute member function
+    /** Main execute member function
      * @param [in] work_range     specifies the range of work we want to be computed, total range defined by get_window_size()
      * @param [in] thread_locator where are we inside of the thread space
      * @naram [in] threadid       a unique threadid
@@ -123,6 +124,19 @@ public:
     {
     }
 
+    /*** Indirect interface (optional) ***/
+    /* Set the indirect table.  This comprises a number of values per kernel point, and a densely packed array of pointers,
+     * multis * batches * kernel_points.  This base version ignores both arguments. */
+    virtual void set_indirect_parameters_generic(size_t, const void *const *const *)
+    {
+    } // intentionally empty: classes that support indirect input override this
+
+    /*** Convolution interface (optional) ***/
+    /* Set the convolution parameters.  This base version discards them. */
+    virtual void set_convolution_parameters(ConvolutionParameters)
+    {
+    } // intentionally empty: classes that support convolution input override this
+
     // Destructor
     virtual ~IGemmCommon()
     {
@@ -200,6 +214,16 @@ public:
     {
         pretranspose_B_array(out, static_cast<const To *>(in), row_stride, multi_stride);
     }
+
+    /*** Indirect interface: typed hook, a no-op unless a derived class overrides it ***/
+    virtual void set_indirect_parameters(size_t, const To *const *const *)
+    {
+    }
+
+    void set_indirect_parameters_generic(size_t sz, const void *const *const *ptr) override
+    { // Type-erased entry point: reinterpret the void pointer table as 'To' pointers and forward to the typed hook
+        set_indirect_parameters(sz, reinterpret_cast<const To *const *const *>(ptr));
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 901b1e880e..cc5f160787 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -27,27 +27,12 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/NEON/kernels/NECol2ImKernel.h"
-#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-#include "src/core/NEON/kernels/NECopyKernel.h"
-#include "src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
-#include "src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h"
-#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
-#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h"
-#include "src/core/NEON/kernels/NEFFTScaleKernel.h"
-#include "src/core/NEON/kernels/NEFillBorderKernel.h"
-#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "src/core/NEON/kernels/NEIm2ColKernel.h"
-#include "src/core/NEON/kernels/NEPadLayerKernel.h"
-#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
-#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h"
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
+
 #include "support/MemorySupport.h"
 
 #include <cmath>
@@ -71,6 +56,7 @@ void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const
     ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
                                                             enable_fast_math, num_groups));
 
+    const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
     switch(NEConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math))
     {
         case ConvolutionMethod::WINOGRAD:
@@ -87,6 +73,13 @@ void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const
             _function = std::move(f);
             break;
         }
+        case ConvolutionMethod::GEMM_CONV2D:
+        {
+            auto f = arm_compute::support::cpp14::make_unique<NEGEMMConv2d>(_memory_manager);
+            f->configure(input, weights, biases, output, info);
+            _function = std::move(f);
+            break;
+        }
         case ConvolutionMethod::DIRECT:
         {
             auto f = arm_compute::support::cpp14::make_unique<NEDirectConvolutionLayer>(_memory_manager);
@@ -112,22 +105,22 @@ Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo
 {
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on NEON");
 
+    const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
     switch(NEConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math))
     {
         case ConvolutionMethod::WINOGRAD:
-            //Validate Winograd
             ARM_COMPUTE_RETURN_ON_ERROR(NEWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
             break;
         case ConvolutionMethod::GEMM:
-            //Validate Gemm-based Convolution
             ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info));
             break;
+        case ConvolutionMethod::GEMM_CONV2D:
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMConv2d::validate(input, weights, biases, output, info));
+            break;
         case ConvolutionMethod::DIRECT:
-            //Validate Direct Convolution
             ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info));
             break;
         case ConvolutionMethod::FFT:
-            // Validate FFT-based convolution layer
             ARM_COMPUTE_RETURN_ON_ERROR(NEFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info));
             break;
         default:
@@ -149,6 +142,8 @@ ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *
     const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
     const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
 
+    const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, 1);
+
     /* Input spatial dims, kernel size, IFM/OFM, conv info*/
     using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo>;
     using ConfigurationMethod      = std::pair<ConvolutionConfiguration, ConvolutionMethod>;
@@ -235,7 +230,21 @@ ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *
             }
         }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        return bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
+        // For 1x1 convolutions run the default GEMM
+        if(weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1)
+        {
+            return ConvolutionMethod::GEMM;
+        }
+
+        if(bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)))
+        {
+            return ConvolutionMethod::WINOGRAD;
+        }
+        if(bool(NEGEMMConv2d::validate(input, weights, nullptr, output, info)))
+        {
+            return ConvolutionMethod::GEMM_CONV2D;
+        }
+        return ConvolutionMethod::GEMM;
     }
 }
 
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 0215098792..9f52e458d2 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -47,7 +47,19 @@ using namespace arm_compute::misc::shape_calculator;
 
 namespace arm_compute
 {
-NEGEMM::~NEGEMM() = default;
+namespace
+{
+AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
+{ // Translate the GEMMInfo fields the assembly dispatch needs into an AsmGemmInfo
+    AsmGemmInfo asm_info;
+    asm_info.method                  = AsmConvMethod::Im2Col; // this helper always selects the Im2Col method
+    asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
+    asm_info.depth_output_gemm3d     = info.depth_output_gemm3d();
+    asm_info.activation_info         = info.activation_info();
+
+    return asm_info; // any other AsmGemmInfo members keep whatever default construction gives them
+}
+} // namespace
 
 NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
     : _memory_group(memory_manager), _weights_manager(weights_manager), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(memory_manager, weights_manager), _ma_kernel(),
@@ -56,12 +68,15 @@ NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *
 {
 }
 
+NEGEMM::~NEGEMM() = default;
+
 void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_ERROR_THROW_ON(NEGEMM::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, gemm_info));
 
-    const bool is_c_bias     = gemm_info.reshape_b_only_on_first_run();
-    bool       run_optimised = bool(NEGEMMAssemblyDispatch::validate(a->info(), b->info(), (is_c_bias && c != nullptr) ? c->info() : nullptr, d->info(), gemm_info));
+    const AsmGemmInfo asm_info      = init_assembly_metadata(gemm_info);
+    const bool        is_c_bias     = gemm_info.reshape_b_only_on_first_run();
+    bool              run_optimised = bool(NEGEMMAssemblyDispatch::validate(a->info(), b->info(), (is_c_bias && c != nullptr) ? c->info() : nullptr, d->info(), asm_info));
 
     // Check if we need to reshape the matrix B only on the first run
     _is_prepared                      = false;
@@ -76,7 +91,7 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
     if(run_optimised)
     {
         const ITensor *c_to_use = is_c_bias ? c : nullptr;
-        _asm_glue.configure(a, b, c_to_use, d, gemm_info);
+        _asm_glue.configure(a, b, c_to_use, d, asm_info);
         ARM_COMPUTE_ERROR_ON(!_asm_glue.is_configured());
 
         // Scale product by alpha
@@ -221,7 +236,8 @@ Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso
     }
 
     // Check if we need to run the optimized assembly kernel
-    const bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, output, gemm_info));
+    AsmGemmInfo asm_info      = init_assembly_metadata(gemm_info);
+    const bool  run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, output, asm_info));
 
     if(!run_optimised)
     {
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index 5b0848398d..400fa64438 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -25,18 +25,70 @@
 
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "src/core/CPP/Validate.h"
-#include "src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h"
 #include "src/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h"
 #include "src/core/NEON/kernels/assembly/arm_gemm.hpp"
 
 #include "support/MemorySupport.h"
 
 #include <arm_neon.h>
+#include <cstdlib>
 
 namespace arm_compute
 {
 namespace
 {
+struct free_delete // Deleter functor that releases a pointer with free(); presumably a smart-pointer deleter — confirm at use sites
+{
+    void operator()(void *x)
+    {
+        free(x); // the pointer must come from a free()-compatible allocation
+    }
+};
+
+struct Params // GEMM problem description; members have no default initialisers
+{
+    unsigned int M;        // rows of the output (output y, or y * z when depth_output_gemm3d is set)
+    unsigned int N;        // columns of the output (output x)
+    unsigned int K;        // x dimension of input a
+    unsigned int batches;  // upper output dimensions divided by multis
+    unsigned int multis;   // z dimension of b on the non-indirect path, otherwise 1
+    unsigned int sections; // b dims [2] * [3] for the Conv/Indirect methods, otherwise 1
+    bool         indirect; // true for the Conv and Indirect methods
+};
+
+Params extract_parameters(const ITensor *a, const ITensor *b, const ITensor *d, const AsmGemmInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
+    // Derive the GEMM problem description (M, N, K, batches, multis, sections, indirect) from the shapes of a, b and d.
+    Params p; // every field is defaulted below first: the Conv/Indirect branch assigns neither M nor batches
+    p.M        = d->info()->tensor_shape().y();
+    p.N        = d->info()->tensor_shape().x();
+    p.K        = a->info()->tensor_shape().x();
+    p.batches  = 1;
+    p.multis   = 1;
+    p.sections = 1;
+    p.indirect = false;
+    if(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect)
+    {
+        p.indirect = true;
+        p.sections = b->info()->tensor_shape()[2] * b->info()->tensor_shape()[3];
+    }
+    else
+    {
+        p.multis  = b->info()->tensor_shape().z();
+        p.batches = d->info()->tensor_shape().total_size_upper(2) / p.multis; //COMPMID-1423: Agree on and document the layout of gemm inputs/outputs
+    }
+
+    // Update M in case of GEMM3D for output
+    if(info.depth_output_gemm3d != 0)
+    {
+        p.M       = d->info()->tensor_shape().y() * d->info()->tensor_shape().z();
+        p.batches = d->info()->tensor_shape().total_size_upper(3) / p.multis;
+    }
+
+    return p;
+}
+
 arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act)
 {
     arm_gemm::Activation gemm_act;
@@ -69,6 +121,29 @@ arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act)
     return gemm_act;
 }
 
+/** Select the scheduling hints for an assembly GEMM kernel
+ *
+ * @param[in] method    GEMM method the hints are chosen for.
+ * @param[in] data_type Data type the heuristic is evaluated for.
+ *
+ * @return Hints for the scheduler; a plain split over Window::DimX when no special case matches.
+ */
+IScheduler::Hints scheduling_hint_heuristic(arm_gemm::GemmMethod method, DataType data_type)
+{
+    const int granule_threshold = 200;
+
+    // Interleaved FP32 kernels are scheduled dynamically over X
+    if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32)
+    {
+        return IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold);
+    }
+
+    const bool is_interleaved_2d = (method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D)
+                                   && (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 || data_type == DataType::S8);
+    // Special case for QASYMM8 to support 2D parallelism, may be tweaked differently compared to the FP32 case
+    const bool is_quantized_2d = (method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D)
+                                 && (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED);
+
+    if(is_interleaved_2d || is_quantized_2d)
+    {
+        // IScheduler::split_dimensions_all signals to parallelise over all window dimensions
+        return IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
+    }
+
+    return IScheduler::Hints(Window::DimX);
+}
+
 template <typename TypeInput, typename TypeOutput>
 class FallbackTransform : public ITransformWeights
 {
@@ -165,7 +240,7 @@ public:
      * @param[in]  os              Output stage meta-data.
      */
     void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d,
-                   arm_gemm::GemmArgs args, const GEMMInfo &gemm_info,
+                   arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
                    MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os = {});
 
     /** Set requantization shifts to be used
@@ -198,6 +273,16 @@ private:
      * @param[in] alignment      Workspace memory alignment.
      */
     void allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment);
+    /** Configure the indirect buffer
+     *
+     * @param[in]  a    Input tensor containing the Matrix A.
+     * @param[in]  b    Input tensor containing the Matrix B.
+     * @param[out] d    Output tensor to store the result of matrix multiplication.
+     * @param[in]  info GEMM meta-data
+     */
+    void configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info);
+    /** Prepare the indirect buffer */
+    void prepare_indirect_buffer();
 
     /** Assembly Gemm kernel */
     std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
@@ -226,7 +311,7 @@ private:
     /** Prepared flag */
     bool _is_prepared{ false };
     /** GEMM meta-data */
-    GEMMInfo _gemm_info{};
+    AsmGemmInfo _gemm_info{};
     /** Weights manager */
     IWeightsManager *_weights_manager{ nullptr };
     /** Weights transform object */
@@ -239,11 +324,16 @@ private:
     std::vector<int32_t> left_shifts{};
     /** Per channel quantization multipliers */
     std::vector<int32_t> _multipliers{};
+    /** Indirect buffer */
+    std::unique_ptr<const TypeInput *const *, free_delete> _indirect_arg{};
+    std::unique_ptr<const TypeInput *, free_delete>        _indirect_buf{};
+    std::vector<TypeInput>          _indirect_pad{};
+    arm_gemm::ConvolutionParameters _cp{};
 };
 
 template <typename TypeInput, typename TypeOutput, class OutputStage>
-std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts,
-        const std::vector<int32_t> &multipliers)
+std::tuple<bool, const int32_t *, const int32_t *, const int32_t *>
+Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts, const std::vector<int32_t> &multipliers)
 {
     _multipliers   = multipliers;
     _shifts        = shifts;
@@ -261,8 +351,122 @@ std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> Fallback<Typ
 }
 
 template <typename TypeInput, typename TypeOutput, class OutputStage>
+void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer()
+{
+    // Fills _indirect_buf with one pointer per (multi, batch, kernel position, output position):
+    // either the address of the sampled element row inside tensor A or, when the sampled
+    // coordinate falls outside the input, the address of the shared padding row _indirect_pad.
+    // NOTE(review): A_ptr is taken from buffer() without adding offset_first_element_in_bytes(),
+    // unlike in0_ptr in run() - confirm the offset is always zero on this path.
+    const TypeInput *A_ptr          = reinterpret_cast<TypeInput *>(_a->buffer());
+    const int        multis         = 1;
+    const int        batches        = _a->info()->tensor_shape().total_size_upper(3);
+    // Strides of A converted from bytes to elements
+    const size_t     stride_A       = _a->info()->strides_in_bytes().y() / sizeof(TypeInput);
+    const size_t     batch_stride_A = _a->info()->strides_in_bytes()[3] / sizeof(TypeInput);
+    const size_t     multi_stride_A = _a->info()->strides_in_bytes()[4] / sizeof(TypeInput);
+
+    // batch_size and multi_size are scaled by sizeof(TypeInput) and divided by the same factor
+    // again, so batch_stride and multi_stride count entries of _indirect_buf.
+    // NOTE(review): configure_indirect() scales by the pointer size instead; the resulting strides agree.
+    const size_t output_hw    = _cp.output_height * _cp.output_width;
+    const int    batch_size   = _cp.kernel_height * _cp.kernel_width * output_hw * sizeof(TypeInput);
+    const size_t batch_stride = batch_size / sizeof(TypeInput);
+    const int    multi_size   = batch_size * batches;
+    const size_t multi_stride = multi_size / sizeof(TypeInput);
+
+    for(int64_t m = 0; m < multis; m++)
+    {
+        for(int64_t b = 0; b < batches; b++)
+        {
+            for(int64_t output_y = 0; output_y < _cp.output_height; output_y++)
+            {
+                for(int64_t output_x = 0; output_x < _cp.output_width; output_x++)
+                {
+                    // Flattened index of the output position
+                    int64_t output_xy = (output_y * _cp.output_width) + output_x;
+
+                    for(int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++)
+                    {
+                        for(int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++)
+                        {
+                            // Input coordinate sampled by this kernel position for this output position
+                            int64_t input_x   = (output_x * _cp.output_stride_w) + kernel_x - _cp.padding_left;
+                            int64_t input_y   = (output_y * _cp.output_stride_h) + kernel_y - _cp.padding_top;
+                            int64_t kernel_xy = (kernel_y * _cp.kernel_width) + kernel_x;
+                            int64_t input_xy  = (input_y * _cp.input_width) + input_x;
+
+                            if(input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height)
+                            {
+                                // Out of bounds: reference the padding row
+                                _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = _indirect_pad.data();
+                            }
+                            else
+                            {
+                                // NOTE(review): input_xy is a flattened (y, x) index scaled by the y stride, which
+                                // presumably assumes consecutive rows of A are stored back-to-back - TODO confirm
+                                _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] =
+                                    A_ptr + (m * multi_stride_A + b * batch_stride_A + input_xy * stride_A);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+/** Set up the convolution parameters and, for the Indirect method, the indirection tables
+ *
+ * Reads dimension 0/1/2 of @p a as channels/width/height, dimensions 2/3 of @p b as kernel
+ * width/height and dimensions 1/2 of @p d as output width/height.
+ * For AsmConvMethod::Conv the parameters are handed to the assembly kernel; for
+ * AsmConvMethod::Indirect the pointer tables are allocated and registered with the kernel,
+ * while their contents are written later by prepare_indirect_buffer().
+ */
+template <typename TypeInput, typename TypeOutput, class OutputStage>
+void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON(!(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect));
+
+    // Padding value: zero, or the uniform quantization offset of the input for quantized data types
+    float zeropad = 0.f;
+    if(is_data_type_quantized(a->data_type()))
+    {
+        zeropad = a->quantization_info().uniform().offset;
+    }
+
+    const int64_t input_width    = static_cast<int64_t>(a->tensor_shape()[1]);
+    const int64_t input_height   = static_cast<int64_t>(a->tensor_shape()[2]);
+    const int64_t input_channels = static_cast<int64_t>(a->tensor_shape()[0]);
+    const int64_t kernel_width   = static_cast<int64_t>(b->tensor_shape()[2]);
+    const int64_t kernel_height  = static_cast<int64_t>(b->tensor_shape()[3]);
+    const int64_t output_width   = static_cast<int64_t>(d->tensor_shape()[1]);
+    const int64_t output_height  = static_cast<int64_t>(d->tensor_shape()[2]);
+
+    _cp = { input_width, input_height, input_channels, kernel_width, kernel_height, output_width, output_height,
+            info.ps_info.stride().first, info.ps_info.stride().second, info.padding_top, info.padding_left, zeropad
+          };
+
+    if(info.method == AsmConvMethod::Conv)
+    {
+        _gemm_kernel_asm->set_convolution_parameters(_cp);
+    }
+
+    if(info.method == AsmConvMethod::Indirect)
+    {
+        const unsigned int multis    = 1;
+        const unsigned int batches   = a->tensor_shape().total_size_upper(3);
+        const unsigned int kernel_hw = _cp.kernel_width * _cp.kernel_height;
+        const unsigned int output_hw = _cp.output_width * _cp.output_height;
+
+        // Sizes are in bytes, strides in pointer entries
+        using TypeInputPtr        = TypeInput *;
+        const int    batch_size   = kernel_hw * output_hw * sizeof(TypeInputPtr);
+        const size_t batch_stride = batch_size / sizeof(TypeInputPtr);
+        const int    multi_size   = batch_size * batches;
+        const size_t multi_stride = multi_size / sizeof(TypeInputPtr);
+
+        // _indirect_buf holds one pointer per (multi, batch, kernel position, output position);
+        // _indirect_arg holds one pointer into _indirect_buf per (multi, batch, kernel position);
+        // _indirect_pad is a row of input_channels elements filled with the padding value.
+        // NOTE(review): the malloc() results are not checked for nullptr and the byte sizes are computed in int
+        _indirect_buf = std::unique_ptr<const TypeInput *, free_delete>(reinterpret_cast<const TypeInput **>(malloc(multi_size * multis)));
+        _indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>(reinterpret_cast<const TypeInput *const **>(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches)));
+        _indirect_pad = std::vector<TypeInput>(_cp.input_channels, zeropad);
+
+        // Set indirect argument
+        int64_t pos = 0;
+        for(int64_t m = 0; m < multis; m++)
+        {
+            for(int64_t b = 0; b < batches; b++)
+            {
+                for(int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++)
+                {
+                    // Each entry addresses the run of output_hw pointers belonging to one kernel position
+                    (_indirect_arg.get())[pos++] = _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw;
+                }
+            }
+        }
+
+        _gemm_kernel_asm->set_indirect_parameters(a->tensor_shape()[0], _indirect_arg.get());
+    }
+}
+
+template <typename TypeInput, typename TypeOutput, class OutputStage>
 void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d,
-                                                             arm_gemm::GemmArgs args, const GEMMInfo &gemm_info,
+                                                             arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
                                                              MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os)
 {
     arm_gemm::GemmConfig gemm_cfg;
@@ -325,6 +529,12 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensor *a, c
             static_cast<Tensor *>(_pretranspose)->allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment);
         }
     }
+
+    // Handle indirect GEMM convolution
+    if(gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect)
+    {
+        configure_indirect(a->info(), b->info(), d->info(), gemm_info);
+    }
 }
 
 template <typename TypeInput, typename TypeOutput, class OutputStage>
@@ -365,6 +575,11 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::prepare()
             }
         }
 
+        if(_gemm_info.method == AsmConvMethod::Indirect)
+        {
+            prepare_indirect_buffer();
+        }
+
         _is_prepared = true;
     }
 }
@@ -387,23 +602,23 @@ bool Fallback<TypeInput, TypeOutput, OutputStage>::is_configured() const
 template <typename TypeInput, typename TypeOutput, class OutputStage>
 void Fallback<TypeInput, TypeOutput, OutputStage>::run()
 {
-    const int lda = _a->info()->strides_in_bytes().y() / sizeof(TypeInput);
+    int       lda = _a->info()->strides_in_bytes().y() / sizeof(TypeInput);
     int       ldb = 0;
     const int ldd = _d->info()->strides_in_bytes().y() / sizeof(TypeOutput);
 
-    const size_t a_batch_idx = _gemm_info.reinterpret_input_as_3d() != 0 ? 3 : 2;
+    const size_t a_batch_idx = _gemm_info.reinterpret_input_as_3d != 0 ? 3 : 2;
     const size_t a_multi_idx = a_batch_idx + 1;
-    const size_t d_batch_idx = _gemm_info.depth_output_gemm3d() != 0 ? 3 : 2;
+    const size_t d_batch_idx = _gemm_info.depth_output_gemm3d != 0 ? 3 : 2;
     const size_t d_multi_idx = d_batch_idx + 1;
 
-    const int batch_stride_a = _a->info()->strides_in_bytes()[a_batch_idx] / sizeof(TypeInput);
+    int       batch_stride_a = _a->info()->strides_in_bytes()[a_batch_idx] / sizeof(TypeInput);
     const int batch_stride_d = _d->info()->strides_in_bytes()[d_batch_idx] / sizeof(TypeOutput);
 
-    const int multi_stride_a = _a->info()->strides_in_bytes()[a_multi_idx] / sizeof(TypeInput);
+    int       multi_stride_a = _a->info()->strides_in_bytes()[a_multi_idx] / sizeof(TypeInput);
     int       multi_stride_b = 0;
     const int multi_stride_d = _d->info()->strides_in_bytes()[d_multi_idx] / sizeof(TypeOutput);
 
-    const auto       in0_ptr = reinterpret_cast<const TypeInput *>(_a->buffer() + _a->info()->offset_first_element_in_bytes());
+    auto             in0_ptr = reinterpret_cast<const TypeInput *>(_a->buffer() + _a->info()->offset_first_element_in_bytes());
     const TypeInput *in1_ptr = nullptr;
     auto             out_ptr = reinterpret_cast<TypeOutput *>(_d->buffer() + _d->info()->offset_first_element_in_bytes());
 
@@ -415,25 +630,7 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run()
         in1_ptr        = reinterpret_cast<const TypeInput *>(_b->buffer() + _b->info()->offset_first_element_in_bytes());
     }
 
-    IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX);
-    if(_kernel_info.method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && _d->info()->data_type() == DataType::F32)
-    {
-        const int granule_threshold = 200;
-        scheduling_hint             = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold);
-    }
-    else if(_kernel_info.method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && (_d->info()->data_type() == DataType::F32 || _d->info()->data_type() == DataType::F16
-                                                                                 || _d->info()->data_type() == DataType::U8 || _d->info()->data_type() == DataType::S8))
-    {
-        //GEMM_INTERLEAVED supports 2D parallelism, IScheduler::split_dimensions_all signals to parallelise over all window dimensions
-        const int granule_threshold = 200;
-        scheduling_hint             = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
-    }
-    else if(_kernel_info.method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D && (_d->info()->data_type() == DataType::QASYMM8 || _d->info()->data_type() == DataType::QASYMM8_SIGNED))
-    {
-        //special case for QASYMM8 to support 2D parallelism, scheduler here may be tweaked differently compared to FP32 case
-        const int granule_threshold = 200;
-        scheduling_hint             = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
-    }
+    const auto scheduling_hint = scheduling_hint_heuristic(_kernel_info.method, _d->info()->data_type());
 
     // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads
     if(_workspace.buffer() != nullptr)
@@ -458,57 +655,67 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run()
     // Prepare assembly kernel
     prepare();
 
-    TypeOutput *bias = nullptr;
     // Setup up matrix bias in the assembly kernel, it's just a pointer to matrix C.
+    TypeOutput *bias = nullptr;
     if(_c && _c->info()->data_type() != DataType::S32)
     {
         bias = reinterpret_cast<TypeOutput *>(_c->buffer() + _c->info()->offset_first_element_in_bytes());
     }
+
+    if(_gemm_info.method == AsmConvMethod::Indirect)
+    {
+        in0_ptr        = nullptr;
+        lda            = 0;
+        batch_stride_a = 0;
+        multi_stride_a = 0;
+    }
+
     // Set gemm parameters
     _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a,
                                  in1_ptr, ldb, multi_stride_b,
                                  out_ptr, ldd, batch_stride_d, multi_stride_d,
                                  bias, 0);
-    // Schedule assembly kernel
+    // Schedule
     NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint);
 }
 
 template <typename TypeInput, typename TypeOutput>
 void create_arm_gemm(std::unique_ptr<NEGEMMAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group,
-                     const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const GEMMInfo &gemm_info,
+                     const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const AsmGemmInfo &info,
                      IWeightsManager *weights_manager)
 {
-    INEGEMMWrapperKernel::Params p           = INEGEMMWrapperKernel::extract_parameters(a, b, d, gemm_info);
-    const CPUInfo               &ci          = NEScheduler::get().cpu_info();
-    unsigned int                 num_threads = NEScheduler::get().num_threads();
+    Params         p           = extract_parameters(a, b, d, info);
+    const CPUInfo &ci          = NEScheduler::get().cpu_info();
+    unsigned int   num_threads = NEScheduler::get().num_threads();
 
-    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, activation, num_threads);
+    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads);
 
     // Create arm_gemm fallback
     auto fallback = support::cpp14::make_unique<Fallback<TypeInput, TypeOutput>>();
-    fallback->configure(a, b, c, d, args, gemm_info, memory_group, weights_manager);
+    fallback->configure(a, b, c, d, args, info, memory_group, weights_manager);
     arm_gemm = std::move(fallback);
 }
 
 template <typename TypeInput, typename TypeOutput>
 void create_arm_gemm_quant(std::unique_ptr<NEGEMMAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group,
-                           const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const GEMMInfo &gemm_info,
+                           const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const AsmGemmInfo &info,
                            IWeightsManager *weights_manager)
 {
     ARM_COMPUTE_UNUSED(activation);
-    INEGEMMWrapperKernel::Params p           = INEGEMMWrapperKernel::extract_parameters(a, b, d, gemm_info);
-    const CPUInfo               &ci          = NEScheduler::get().cpu_info();
-    unsigned int                 num_threads = NEScheduler::get().num_threads();
+    Params         p           = extract_parameters(a, b, d, info);
+    const CPUInfo &ci          = NEScheduler::get().cpu_info();
+    unsigned int   num_threads = NEScheduler::get().num_threads();
 
-    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, activation, num_threads);
+    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads);
 
     // Create arm_gemm fallback
     auto fallback = support::cpp14::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>();
 
     // Configure requantization info
-    const int32_t                 a_offset = -a->info()->quantization_info().uniform().offset;
-    const int32_t                 b_offset = -b->info()->quantization_info().uniform().offset;
-    const GEMMLowpOutputStageInfo os_info  = gemm_info.gemmlowp_output_stage();
+    const int32_t                 negation = info.negated_offsets ? 1 : -1;
+    const int32_t                 a_offset = -a->info()->quantization_info().uniform().offset * negation;
+    const int32_t                 b_offset = -b->info()->quantization_info().uniform().offset * negation;
+    const GEMMLowpOutputStageInfo os_info  = info.output_stage;
 
     arm_gemm::Requantize32 gemm_requant_info{};
     if(os_info.gemmlowp_shifts.size() > 1)
@@ -530,7 +737,7 @@ void create_arm_gemm_quant(std::unique_ptr<NEGEMMAssemblyDispatch::IFallback> &a
     }
 
     // Configure fallback
-    fallback->configure(a, b, c, d, args, gemm_info, memory_group, weights_manager, gemm_requant_info);
+    fallback->configure(a, b, c, d, args, info, memory_group, weights_manager, gemm_requant_info);
     arm_gemm = std::move(fallback);
 }
 
@@ -541,14 +748,13 @@ NEGEMMAssemblyDispatch::NEGEMMAssemblyDispatch(std::shared_ptr<IMemoryManager> m
 {
 }
 
-Status NEGEMMAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const GEMMInfo &gemm_info)
+Status NEGEMMAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info)
 {
-    ARM_COMPUTE_UNUSED(c);
+    ARM_COMPUTE_UNUSED(c, info);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a);
 
-    ARM_COMPUTE_RETURN_ERROR_ON(!gemm_info.pretranpose_B());
 #ifndef __aarch64__
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->element_size() == 1, "8bit integer types only supported for aarch64");
 #endif /* __aarch64__ */
@@ -579,13 +785,13 @@ bool NEGEMMAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &
     return act.type != arm_gemm::Activation::Type::None;
 }
 
-void NEGEMMAssemblyDispatch::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, const GEMMInfo &gemm_info)
+void NEGEMMAssemblyDispatch::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, const AsmGemmInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
-    arm_gemm::Activation act = map_to_arm_gemm_activation(gemm_info.activation_info());
+    arm_gemm::Activation act = map_to_arm_gemm_activation(info.activation_info);
 
     //If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
-    if(!NEGEMMAssemblyDispatch::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, d->info(), gemm_info))
+    if(!NEGEMMAssemblyDispatch::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, d->info(), info))
     {
         return;
     }
@@ -593,40 +799,40 @@ void NEGEMMAssemblyDispatch::configure(const ITensor *a, const ITensor *b, const
     switch(a->info()->data_type())
     {
         case DataType::F32:
-            create_arm_gemm<float, float>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager);
+            create_arm_gemm<float, float>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
             break;
 #ifdef __aarch64__
         case DataType::U8:
         case DataType::QASYMM8:
             if(d->info()->data_type() == DataType::S32)
             {
-                create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager);
+                create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
             }
             else
             {
-                create_arm_gemm_quant<uint8_t, uint8_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager);
+                create_arm_gemm_quant<uint8_t, uint8_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
             }
             break;
         case DataType::S8:
         case DataType::QASYMM8_SIGNED:
             if(d->info()->data_type() == DataType::S32)
             {
-                create_arm_gemm<int8_t, int32_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager);
+                create_arm_gemm<int8_t, int32_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
             }
             else
             {
-                create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager);
+                create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
             }
             break;
 #endif /* __aarch64__ */
 #if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
         case DataType::BFLOAT16:
-            create_arm_gemm<bfloat16, float>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager);
+            create_arm_gemm<bfloat16, float>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
             break;
 #endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
-            create_arm_gemm<float16_t, float16_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager);
+            create_arm_gemm<float16_t, float16_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
             break;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         default:
diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp
new file mode 100644
index 0000000000..642b084fb4
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include <set>
+namespace arm_compute
+{
+namespace
+{
+/** Assemble the GEMMLowp output stage meta-data used by the quantized path
+ *
+ * @param[in] input   Input tensor info.
+ * @param[in] weights Weights tensor info.
+ * @param[in] output  Output tensor info; the input quantization is used while its total size is zero.
+ * @param[in] act     Activation; RELU, BOUNDED_RELU and LU_BOUNDED_RELU are merged into the output bounds.
+ *
+ * @return The populated output stage description.
+ */
+GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const ActivationLayerInfo &act)
+{
+    const DataType                data_type  = input->data_type();
+    const QuantizationInfo        in_qinfo   = input->quantization_info();
+    const QuantizationInfo        wei_qinfo  = weights->quantization_info();
+    const QuantizationInfo        out_qinfo  = (output->total_size() != 0) ? output->quantization_info() : in_qinfo;
+    const UniformQuantizationInfo out_uqinfo = out_qinfo.uniform();
+
+    // Start from the full range of the data type ...
+    PixelValue lowest{};
+    PixelValue highest{};
+    std::tie(lowest, highest) = get_min_max(data_type);
+    int32_t lower_bound = lowest.get<int32_t>();
+    int32_t upper_bound = highest.get<int32_t>();
+
+    // ... and narrow it when the activation can be merged with the output stage
+    const std::set<ActivationLayerInfo::ActivationFunction> mergeable_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+                                                                               ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+                                                                               ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+                                                                             };
+    if(mergeable_acts.find(act.activation()) != mergeable_acts.end())
+    {
+        std::tie(lower_bound, upper_bound) = get_quantized_activation_min_max(act, data_type, out_uqinfo);
+    }
+
+    GEMMLowpOutputStageInfo os_info;
+    os_info.type                     = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+    os_info.gemmlowp_offset          = out_uqinfo.offset;
+    os_info.gemmlowp_min_bound       = lower_bound;
+    os_info.gemmlowp_max_bound       = upper_bound;
+    os_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL);
+    quantization::calculate_quantized_multipliers(in_qinfo, wei_qinfo, out_qinfo, os_info);
+    return os_info;
+}
+/** Create the assembly GEMM meta-data for a convolution
+ *
+ * @param[in] info        Convolution descriptor; pad/stride and activation information are copied from it.
+ * @param[in] is_indirect Selects AsmConvMethod::Indirect when true, AsmConvMethod::Conv otherwise.
+ *
+ * @return The meta-data with 3D input/output handling enabled, a zero padding value and non-negated offsets.
+ */
+AsmGemmInfo init_assembly_metadata(const Conv2dInfo &info, bool is_indirect)
+{
+    const auto &conv_info = info.conv_info;
+
+    AsmGemmInfo asm_info;
+    if(is_indirect)
+    {
+        asm_info.method = AsmConvMethod::Indirect;
+    }
+    else
+    {
+        asm_info.method = AsmConvMethod::Conv;
+    }
+    asm_info.ps_info         = conv_info;
+    asm_info.activation_info = info.act_info;
+    // Input and output are both handled as 3D
+    asm_info.depth_output_gemm3d     = true;
+    asm_info.reinterpret_input_as_3d = true;
+    // Padding set-up
+    asm_info.padding_top     = conv_info.pad_top();
+    asm_info.padding_left    = conv_info.pad_left();
+    asm_info.padding_value   = 0.f;
+    asm_info.negated_offsets = false;
+    return asm_info;
+}
+} // namespace
+
+/** Construct the function; only the assembly dispatch receives the memory manager, all other members start empty/false */
+NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr<IMemoryManager> &memory_manager)
+    : _gemm_asm_func(memory_manager),
+      _activation_func(),
+      _weights_permute_func(),
+      _original_weights(nullptr),
+      _permuted_weights(),
+      _is_prepared(false),
+      _run_activation(false)
+{
+}
+/** Configure the convolution
+ *
+ * Validates the arguments, sets up the weights permutation, configures the assembly dispatch
+ * with AsmConvMethod::Conv meta-data and, when the dispatch reports the activation as
+ * unsupported, configures a separate in-place activation on the output.
+ *
+ * @param[in]  input   Source tensor.
+ * @param[in]  weights Weights tensor; permuted with PermutationVector{ 3, 0, 1, 2 } before use.
+ * @param[in]  biases  Biases tensor. Can be nullptr.
+ * @param[out] output  Destination tensor.
+ * @param[in]  info    Convolution descriptor.
+ */
+void NEGEMMConv2d::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_ERROR_THROW_ON(NEGEMMConv2d::validate(input->info(),
+                                                      weights->info(),
+                                                      biases != nullptr ? biases->info() : nullptr,
+                                                      output->info(),
+                                                      info));
+    // Keep the original weights so that prepare() can mark them as unused after the permutation has run
+    _original_weights = weights;
+    _weights_permute_func.configure(weights, &_permuted_weights, PermutationVector{ 3, 0, 1, 2 });
+
+    // Configure assembly dispatch
+    AsmGemmInfo asm_info = init_assembly_metadata(info, false);
+    if(is_data_type_quantized(input->info()->data_type()))
+    {
+        // The output stage is derived from the original (non-permuted) weights info
+        asm_info.output_stage = calculate_output_stage_metadata(input->info(), weights->info(), output->info(), info.act_info);
+    }
+    _gemm_asm_func.configure(input, &_permuted_weights, biases, output, asm_info);
+
+    // Configure activation
+    if(info.act_info.enabled() && !_gemm_asm_func.is_activation_supported(info.act_info))
+    {
+        // Runs in place on the output after the GEMM, see run()
+        _activation_func.configure(output, nullptr, info.act_info);
+        _run_activation = true;
+    }
+}
+/** Check whether the given arguments describe a configuration this function accepts
+ *
+ * @param[in] input   Source tensor info. Must be NHWC.
+ * @param[in] weights Weights tensor info. At most 4 dimensions; dimension 0 must match dimension 0 of the input.
+ * @param[in] biases  Biases tensor info. Can be nullptr.
+ * @param[in] output  Destination tensor info.
+ * @param[in] info    Convolution descriptor; num_groups greater than 1 is rejected.
+ *
+ * @return An error status for the first failing check, an empty Status otherwise.
+ */
+Status NEGEMMConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.num_groups > 1, "Grouping (num_groups != 1) is not supported on NEON");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() != DataLayout::NHWC, "Data layout supported is NHWC");
+    const DataType    data_type = input->data_type();
+    const TensorShape i_shape   = input->tensor_shape();
+    const TensorShape w_shape   = weights->tensor_shape();
+    // Dimension 0 of weights and input must agree
+    ARM_COMPUTE_RETURN_ERROR_ON(w_shape[0] != i_shape[0]);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+    // Validate biases
+    if(biases != nullptr)
+    {
+        // Expected bias type: S32 for asymmetric quantized input, F32 for BFLOAT16 input, the input type otherwise
+        if(is_data_type_quantized_asymmetric(data_type))
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+        }
+        else if(data_type == DataType::BFLOAT16)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        }
+        // One-dimensional biases with one entry per element of weights dimension 3
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+    }
+
+    // NOTE(review): the non-permuted weights info is validated here, whereas configure() hands the
+    // permuted weights to the assembly dispatch - confirm the dispatch validation is shape-agnostic
+    AsmGemmInfo asm_info = init_assembly_metadata(info, false);
+    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMAssemblyDispatch::validate(input, weights, biases, output, asm_info));
+    return Status{};
+}
+/** Execute the convolution: preparation, assembly GEMM, then the separate activation when one was configured */
+void NEGEMMConv2d::run()
+{
+    prepare();
+    _gemm_asm_func.run();
+
+    // Nothing left to do unless configure() set up a standalone activation
+    if(!_run_activation)
+    {
+        return;
+    }
+    _activation_func.run();
+}
+/** One-off preparation: allocate and fill the permuted weights, then release the original ones */
+void NEGEMMConv2d::prepare()
+{
+    if(_is_prepared)
+    {
+        return;
+    }
+
+    // Materialise the permuted weights; the original tensor is marked as unused afterwards
+    _permuted_weights.allocator()->allocate();
+    _weights_permute_func.run();
+    _original_weights->mark_as_unused();
+
+    _is_prepared = true;
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
deleted file mode 100644
index 09637dd2d6..0000000000
--- a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "support/MemorySupport.h"
-
-namespace arm_compute
-{
-NEGEMMLowpAssemblyMatrixMultiplyCore::~NEGEMMLowpAssemblyMatrixMultiplyCore() = default;
-
-NEGEMMLowpAssemblyMatrixMultiplyCore::NEGEMMLowpAssemblyMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b()
-{
-}
-
-void NEGEMMLowpAssemblyMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::S8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
-    ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(0) != (b)->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
-    ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A");
-    ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
-
-    bool run_optimised = false;
-    switch(a->info()->data_type())
-    {
-        case DataType::S8:
-        case DataType::QASYMM8:
-        case DataType::U8:
-        {
-            _asm_glue.configure(a, b, c, output, GEMMInfo(false, false, true));
-            run_optimised = _asm_glue.is_configured();
-            break;
-        }
-        default:
-        {
-            ARM_COMPUTE_ERROR("Datatype not supported");
-            break;
-        }
-    }
-    if(!run_optimised)
-    {
-        // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
-        TensorShape shape_tmp_a = a->info()->tensor_shape();
-        shape_tmp_a.set(0, a->info()->dimension(0) * 4);
-        shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
-
-        // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
-        TensorShape shape_tmp_b = b->info()->tensor_shape();
-        shape_tmp_b.set(0, b->info()->dimension(1) * 16);
-        shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));
-
-        TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
-        TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
-        _tmp_a.allocator()->init(info_a);
-        _tmp_b.allocator()->init(info_b);
-        _memory_group.manage(&_tmp_a);
-        _memory_group.manage(&_tmp_b);
-
-        // Configure interleave kernel
-        {
-            auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
-            k->configure(a, &_tmp_a);
-            _mtx_a_reshape_kernel = std::move(k);
-        }
-
-        // Configure transpose kernel
-        {
-            auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
-            k->configure(b, &_tmp_b);
-            _mtx_b_reshape_kernel = std::move(k);
-        }
-
-        // Configure matrix multiply kernel
-        {
-            auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
-            k->configure(&_tmp_a, &_tmp_b, output);
-            _mm_kernel = std::move(k);
-        }
-
-        // Allocate tensors
-        _tmp_a.allocator()->allocate();
-        _tmp_b.allocator()->allocate();
-    }
-}
-
-void NEGEMMLowpAssemblyMatrixMultiplyCore::run()
-{
-    MemoryGroupResourceScope scope_mg(_memory_group);
-    if(_mtx_a_reshape_kernel)
-    {
-        NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
-    }
-
-    if(_mtx_b_reshape_kernel)
-    {
-        NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
-    }
-
-    if(_asm_glue.is_configured())
-    {
-        _asm_glue.run();
-    }
-    else
-    {
-        NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
-    }
-}
-} // namespace arm_compute
-\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 9050427b34..df8eaacf47 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -47,6 +47,21 @@
 
 namespace arm_compute
 {
+namespace
+{
+AsmGemmInfo init_assembly_metadata(const GEMMInfo &info) // Builds the AsmGemmInfo passed to the assembly dispatch from a GEMMInfo
+{
+    AsmGemmInfo asm_info;
+    asm_info.method                  = AsmConvMethod::Im2Col; // Fixed value; not derived from info
+    asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
+    asm_info.depth_output_gemm3d     = info.depth_output_gemm3d();
+    asm_info.activation_info         = info.activation_info();
+    asm_info.output_stage            = info.gemmlowp_output_stage();
+
+    return asm_info; // Only the five fields assigned above are set by this helper
+}
+} // namespace
+
 using namespace arm_compute::misc::shape_calculator;
 
 NEGEMMLowpMatrixMultiplyCore::~NEGEMMLowpMatrixMultiplyCore() = default;
@@ -120,6 +135,8 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
         _mm_result_s32.allocator()->init(info_mm_result_s32);
     }
 
+    // Initialize assembly kernel meta-data
+    const AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
 #ifdef __aarch64__
     switch(a->info()->data_type())
     {
@@ -130,12 +147,12 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
         {
             if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
             {
-                _asm_glue.configure(a_to_use, b, c, output, gemm_info);
+                _asm_glue.configure(a_to_use, b, c, output, asm_info);
                 _fused_assembly_path = _asm_glue.is_configured();
             }
             else
             {
-                _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output, gemm_info);
+                _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output, asm_info);
             }
             _assembly_path = _asm_glue.is_configured();
             break;
@@ -346,17 +363,20 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
         matrix_a_info = &signed_a;
     }
 
+    // Initialize assembly kernel meta-data
+    const AsmGemmInfo asm_info = init_assembly_metadata(info);
+
     // Check if we need to run the optimized assembly kernel
     bool run_optimised             = false;
     bool run_optimised_requantized = false;
     if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
     {
-        run_optimised             = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info));
+        run_optimised             = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, asm_info));
         run_optimised_requantized = run_optimised;
     }
     else
     {
-        run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, gemm_info));
+        run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info));
     }
 
     if(run_optimised)
diff --git a/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp b/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp
deleted file mode 100644
index d165b2235c..0000000000
--- a/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/NEON/functions/NESimpleAssemblyFunction.h"
-
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-using namespace arm_compute;
-
-NESimpleAssemblyFunction::NESimpleAssemblyFunction() // NOLINT
-    : _kernel()
-{
-}
-
-void NESimpleAssemblyFunction::run()
-{
-    NEScheduler::get().schedule(_kernel.get(), Window::DimX);
-}
-
-void NESimpleAssemblyFunction::configure(std::unique_ptr<INEGEMMWrapperKernel> kernel)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(kernel.get());
-    _kernel = std::move(kernel);
-    ARM_COMPUTE_ERROR_ON_WINDOW_DIMENSIONS_GTE(_kernel->window(), 1);
-}
diff --git a/src/runtime/NEON/functions/NESimpleAssemblyFunction.h b/src/runtime/NEON/functions/NESimpleAssemblyFunction.h
deleted file mode 100644
index e9be54d35f..0000000000
--- a/src/runtime/NEON/functions/NESimpleAssemblyFunction.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESIMPLEASSEMBLYFUNCTION_H
-#define ARM_COMPUTE_NESIMPLEASSEMBLYFUNCTION_H
-
-#include "arm_compute/runtime/IFunction.h"
-#include "src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-/** Basic interface for functions which have a single NEON GEMM wrapper kernel to run */
-class NESimpleAssemblyFunction : public IFunction
-{
-public:
-    /** Constructor */
-    NESimpleAssemblyFunction();
-
-    /** Configure the function with the kernel to run
-     *
-     * @param[in] kernel GEMM Wrapper kernel configured and ready to run
-     *
-     * @note The kernel is expected to have a 1D window. The function will multi-thread this window across the X dimension.
-     */
-    void configure(std::unique_ptr<INEGEMMWrapperKernel> kernel);
-
-    // Inherited methods overridden:
-    void run() override final;
-
-protected:
-    std::unique_ptr<INEGEMMWrapperKernel> _kernel; /**< Kernel to run */
-};
-} //namespace arm_compute
-#endif /*ARM_COMPUTE_NESIMPLEASSEMBLYFUNCTION_H */
diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp
index 80615c5d57..112188fdfa 100644
--- a/tests/validation/NEON/ConvolutionLayer.cpp
+++ b/tests/validation/NEON/ConvolutionLayer.cpp
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
 #include "arm_compute/runtime/Tensor.h"
@@ -45,6 +46,20 @@ namespace test
 {
 namespace validation
 {
+namespace detail
+{
+template <> // Specialization for NEGEMMConv2d, which is configured through a Conv2dInfo instead of the individual arguments
+void configure_conv_function<NEGEMMConv2d, Tensor>(NEGEMMConv2d &func,
+                                                   Tensor *src, const Tensor *weights, const Tensor *bias, Tensor *dst,
+                                                   const PadStrideInfo &info, const WeightsInfo &weights_info,
+                                                   const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+{
+    ARM_COMPUTE_UNUSED(weights_info); // weights_info is not forwarded to NEGEMMConv2d
+
+    Conv2dInfo conv_info(info, dilation, act_info, false, num_groups); // NOTE(review): the literal false is presumably the fast-math flag -- confirm against the Conv2dInfo constructor
+    func.configure(src, weights, bias, dst, conv_info);
+}
+} // namespace detail
 namespace
 {
 const RelativeTolerance<float> rel_tolerance_f32(0.01f);              /**< Relative tolerance for FP32 types */
@@ -368,7 +383,7 @@ TEST_SUITE_END() // WinogradLayer
 
 TEST_SUITE(GEMMConvolutionLayer)
 template <typename T>
-using NEGEMMConvolutionLayerFixture = ConvolutionValidationFixture<Tensor, Accessor, NEGEMMConvolutionLayer, T>;
+using NEGEMMConvolutionLayerFixture = ConvolutionValidationFixture<Tensor, Accessor, NEConvolutionLayer, T>;
 
 TEST_SUITE(Float)
 #if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
@@ -413,10 +428,10 @@ TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
 template <typename T>
-using NEGEMMConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture<Tensor, Accessor, NEGEMMConvolutionLayer, T>;
+using NEGEMMConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture<Tensor, Accessor, NEConvolutionLayer, T>;
 
 template <typename T>
-using NEGEMMConvolutionLayerQuantizedPerChannelFixture = ConvolutionValidationQuantizedPerChannelFixture<Tensor, Accessor, NEGEMMConvolutionLayer, T, int8_t>;
+using NEGEMMConvolutionLayerQuantizedPerChannelFixture = ConvolutionValidationQuantizedPerChannelFixture<Tensor, Accessor, NEConvolutionLayer, T, int8_t>;
 
 const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
 {
@@ -480,6 +495,82 @@ TEST_SUITE_END() // QSYMM8_PER_CHANNEL
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // GEMMConvolutionLayer
+
+TEST_SUITE(DirectGEMMConv2d) // Runs the shared convolution validation fixtures with NEGEMMConv2d as the function under test
+template <typename T>
+using NEDirectGEMMConv2dLayerFixture = ConvolutionValidationFixture<Tensor, Accessor, NEGEMMConv2d, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectGEMMConv2dLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                                                                                                     framework::dataset::make("ReshapeWeights", { true })),
+                                                                                                                     framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NHWC })), // NHWC is the only layout exercised in this suite
+                                                                                                             ActivationFunctionsDataset))
+{
+    // Validate output against the reference using the FP32 relative and absolute tolerances
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
+}
+TEST_SUITE_END() // FP32
+TEST_SUITE_END() // Float
+
+template <typename T>
+using NEDirectGEMMConv2dLayerQuantizedFixture = ConvolutionValidationQuantizedFixture<Tensor, Accessor, NEGEMMConv2d, T>;
+
+template <typename T>
+using NEDirectGEMMConv2dLayerQuantizedPerChannelFixture = ConvolutionValidationQuantizedPerChannelFixture<Tensor, Accessor, NEGEMMConv2d, T, int8_t>;
+
+const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationInfo", // Activation variants combined into every quantized case of this suite
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)
+});
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectGEMMConv2dLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                                                                                                        framework::dataset::make("ReshapeWeights", { true })),
+                                                                                                                        framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                                                                                                                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                                                                                                        QuantizedActivationFunctionsDataset))
+{
+    // Validate output against the reference using the QASYMM8 tolerance
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectGEMMConv2dLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                                                                                                       framework::dataset::make("ReshapeWeights", { true })),
+                                                                                                                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.01f, -10) })),
+                                                                                                                       QuantizedActivationFunctionsDataset))
+{
+    // Validate output against the reference; the signed case reuses the QASYMM8 tolerance
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+
+TEST_SUITE(QSYMM8_PER_CHANNEL)
+FIXTURE_DATA_TEST_CASE(RunSmallSigned, NEDirectGEMMConv2dLayerQuantizedPerChannelFixture<int8_t>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                                                       framework::dataset::make("ReshapeWeights", { true })),
+                                                               framework::dataset::make("DataType", { DataType::QASYMM8_SIGNED })),
+                                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                                               QuantizationData),
+                                       QuantizedActivationFunctionsDataset),
+                               framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output against the reference; the per-channel case reuses the QASYMM8 tolerance
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QSYMM8_PER_CHANNEL
+TEST_SUITE_END() // Quantized
+
+TEST_SUITE_END() // DirectGEMMConv2d
+
 TEST_SUITE_END() // NEON
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp
index 9fe7e55de7..04282c2c3c 100644
--- a/tests/validation/NEON/GEMMLowp.cpp
+++ b/tests/validation/NEON/GEMMLowp.cpp
@@ -22,7 +22,6 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
 #include "arm_compute/runtime/Tensor.h"
@@ -53,28 +52,6 @@ const auto data_matrix_multiply = framework::dataset::make("M", 12, 20) * framew
 } // namespace
 
 TEST_SUITE(NEON)
-TEST_SUITE(ASSEMBLY_MATRIX_MULTIPLY)
-
-using NEGEMMAssemblyFixture_S8 = GEMMLowpAssemblyFixture<Tensor, Accessor, NEGEMMLowpAssemblyMatrixMultiplyCore, int8_t>;
-using NEGEMMAssemblyFixture_U8 = GEMMLowpAssemblyFixture<Tensor, Accessor, NEGEMMLowpAssemblyMatrixMultiplyCore, uint8_t>;
-
-TEST_SUITE(S8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMAssemblyFixture_S8, framework::DatasetMode::PRECOMMIT, data_matrix_multiply)
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END()
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMAssemblyFixture_U8, framework::DatasetMode::PRECOMMIT, data_matrix_multiply)
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END()
-TEST_SUITE_END()
-
 TEST_SUITE(GEMMLowp)
 TEST_SUITE(MatrixMultiplyCore)
 using NEGEMMLowpMatrixMultiplyCoreFixture = GEMMLowpMatrixMultiplyCoreValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>;
diff --git a/tests/validation/fixtures/ConvolutionLayerFixture.h b/tests/validation/fixtures/ConvolutionLayerFixture.h
index ec13e1d3e0..e1452f5dfc 100644
--- a/tests/validation/fixtures/ConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/ConvolutionLayerFixture.h
@@ -42,12 +42,22 @@
 
 namespace arm_compute
 {
-class NEConvolutionLayer;
-
 namespace test
 {
 namespace validation
 {
+namespace detail
+{
+template <typename ConvolutionFunction, typename TensorType> // Generic configuration hook called by the fixture; can be specialized for functions whose configure() takes different arguments
+void configure_conv_function(ConvolutionFunction &func,
+                             TensorType *src, const TensorType *weights, const TensorType *bias, TensorType *dst,
+                             const PadStrideInfo &info, const WeightsInfo &weights_info,
+                             const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+{
+    func.configure(src, weights, bias, dst, info, weights_info, dilation, act_info, num_groups); // Forwards every argument to configure() unchanged
+}
+} // namespace detail
+
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T, typename TW>
 class ConvolutionValidationGenericFixture : public framework::Fixture
 {
@@ -171,7 +181,7 @@ protected:
 
         // Create and configure function
         FunctionType conv;
-        conv.configure(&src, &weights, &bias, &dst, info, weights_info, dilation, act_info, num_groups);
+        detail::configure_conv_function(conv, &src, &weights, &bias, &dst, info, weights_info, dilation, act_info, num_groups);
 
         ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
         ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
author	Georgios Pinitas <georgios.pinitas@arm.com>	2020-11-02 01:37:17 +0000
committer	Georgios Pinitas <georgios.pinitas@arm.com>	2020-11-12 15:59:25 +0000
commit	c0b6f76561580414f08633a804fc548ccad65659 (patch)
tree	4d46b7f479de04f799e29095392948aeb370c029
parent	824061d9910ebb42cbe46b677c0b843db212c9a2 (diff)
download	ComputeLibrary-c0b6f76561580414f08633a804fc548ccad65659.tar.gz