author     Georgios Pinitas <georgios.pinitas@arm.com>    2020-07-02 20:02:20 +0100
committer  Georgios Pinitas <georgios.pinitas@arm.com>    2020-07-06 16:51:32 +0000
commit     5aa1a0b7ca5eed010e4b297a95b1c4851f741328 (patch)
tree       ba882de9e86589dfdd33937d538a89bbdf01c40e
parent     42550c039105597ff6acd4e5efc0ee3c7c20b08e (diff)
download   ComputeLibrary-5aa1a0b7ca5eed010e4b297a95b1c4851f741328.tar.gz
COMPID-3324: Clean GEMM kernels
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I170de1671e061a78740caee31fb4a1b8642c1369
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3505
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
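Beyond adding new kernels (X1-specific variants selected via the new CPUModel::X1, and SVE FP32 MMLA kernels gated by the new MMLA_FP32 define, with MMLA_INT8 replacing V8P6 for the int8 MMLA kernels), this patch tidies the arm_gemm interfaces: the UNUSED() macro is dropped in favour of unnamed parameters, the execute_1d() helpers are folded into the execute() overrides, and get_window_size() now lists only the populated ndrange_t dimensions. A minimal sketch of the recurring unused-parameter clean-up, mirroring the BufferManager stubs changed below:

    // Before: the parameter exists only to be silenced with a macro.
    void release(const int index) {
        UNUSED(index);
    }

    // After: leave the parameter unnamed; no macro is needed and the
    // compiler raises no unused-parameter warning.
    void release(const int) { }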
-rw-r--r--  Android.bp | 8
-rw-r--r--  SConstruct | 2
-rw-r--r--  arm_compute/core/CPP/CPPTypes.h | 9
-rw-r--r--  src/core/NEON/kernels/arm_gemm/barrier.hpp | 2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/bfloat.hpp | 3
-rw-r--r--  src/core/NEON/kernels/arm_gemm/bias_adder.hpp | 2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/buffer_manager.hpp | 20
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp | 2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp | 30
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp | 26
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp | 23
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp | 4
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_int16.cpp | 2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_int8.cpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp | 21
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp | 8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp | 9
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_native.hpp | 16
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp | 2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemv_batched.hpp | 10
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp | 20
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp | 20
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp | 4
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp | 3
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp | 347
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp | 4
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp | 4
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp | 3
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp | 348
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp | 5
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp | 350
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp | 5
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp | 17
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp | 17
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp | 1810
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp | 17
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp | 2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp | 21
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp | 21
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp | 2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp | 22
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp | 22
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8.hpp | 10
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp | 38
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp | 328
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp | 53
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp | 30
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp | 30
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_native_fp32_mla_16x4.hpp | 8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp | 5
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp | 354
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp | 4
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp | 4
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp | 5
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp | 17
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp | 17
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp | 17
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp | 17
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp | 17
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp | 17
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp | 89
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp | 3459
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp | 17
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp | 17
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp | 20
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp | 32
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp | 33
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp | 31
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8.hpp | 72
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp | 397
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp | 31
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp | 32
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp | 31
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp | 32
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_native_bf16fp32_dot_4VLx4.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_native_bf16fp32_dot_4VLx4/generic.cpp | 249
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp | 10
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp | 295
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp | 10
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp | 199
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp | 183
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp | 183
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp | 5
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp | 5
-rw-r--r--  src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp | 8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_12x8.hpp | 12
-rw-r--r--  src/core/NEON/kernels/arm_gemm/merges/a64_merge_s32_12x8.hpp | 12
-rw-r--r--  src/core/NEON/kernels/arm_gemm/merges/a64_merge_s32_4x4.hpp | 12
-rw-r--r--  src/core/NEON/kernels/arm_gemm/merges/a64_merge_u32_12x8.hpp | 12
-rw-r--r--  src/core/NEON/kernels/arm_gemm/merges/a64_merge_u32_4x4.hpp | 12
-rw-r--r--  src/core/NEON/kernels/arm_gemm/merges/list.hpp | 2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp16_3VLx8.hpp | 8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp | 8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp | 13
-rw-r--r--  src/core/NEON/kernels/arm_gemm/merges/sve_merge_u32_3VLx8.hpp | 13
-rw-r--r--  src/core/NEON/kernels/arm_gemm/misc.cpp | 3
-rw-r--r--  src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp | 7
-rw-r--r--  src/core/NEON/kernels/arm_gemm/quantized.cpp | 10
-rw-r--r--  src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp | 8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transform.hpp | 2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp | 3
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp | 12
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp | 3
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp | 3
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_block4_8bit.hpp | 5
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp | 3
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp | 2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp | 2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/list.hpp | 3
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp | 2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_32bit.hpp | 552
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_8bit.hpp | 2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/utils.hpp | 4
-rw-r--r--  src/core/NEON/kernels/assembly/arm_gemm.hpp | 106
-rw-r--r--  src/core/NEON/kernels/assembly/gemm_common.hpp | 150
-rw-r--r--  src/core/NEON/kernels/assembly/ndrange.hpp | 158
-rw-r--r--  src/runtime/CPUUtils.cpp | 5
-rw-r--r--  src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp | 4
145 files changed, 9423 insertions, 1598 deletions
diff --git a/Android.bp b/Android.bp
index 0c0f3b22ad..d19db113d5 100644
--- a/Android.bp
+++ b/Android.bp
@@ -776,21 +776,26 @@ cc_library_static {
"src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp",
@@ -799,6 +804,7 @@ cc_library_static {
"src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6/generic.cpp",
@@ -817,12 +823,14 @@ cc_library_static {
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp",
diff --git a/SConstruct b/SConstruct
index 668e9a73d9..2832b90afe 100644
--- a/SConstruct
+++ b/SConstruct
@@ -206,7 +206,7 @@ elif 'v8' in env['arch']:
env.Append(CXXFLAGS = ['-march=armv8-a'])
if 'v8.6-a' in env['arch']:
- env.Append(CPPDEFINES = ['V8P6', 'V8P6_BF', 'ARM_COMPUTE_FORCE_BF16'])
+ env.Append(CPPDEFINES = ['MMLA_INT8', 'MMLA_FP32', 'V8P6', 'V8P6_BF', 'ARM_COMPUTE_FORCE_BF16'])
elif 'x86' in env['arch']:
if env['estate'] == '32':
diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h
index d3f6fc944d..59aecd2176 100644
--- a/arm_compute/core/CPP/CPPTypes.h
+++ b/arm_compute/core/CPP/CPPTypes.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,7 +44,8 @@ enum class CPUModel
GENERIC_FP16_DOT,
A53,
A55r0,
- A55r1
+ A55r1,
+ X1
};
/** Global memory policy.
@@ -94,6 +95,10 @@ inline std::string cpu_model_to_string(CPUModel val)
{
return std::string("A55r1");
}
+ case CPUModel::X1:
+ {
+ return std::string("X1");
+ }
default:
{
ARM_COMPUTE_ERROR("Invalid CPUModel.");
diff --git a/src/core/NEON/kernels/arm_gemm/barrier.hpp b/src/core/NEON/kernels/arm_gemm/barrier.hpp
index cfd1079f74..8fbcddfef8 100644
--- a/src/core/NEON/kernels/arm_gemm/barrier.hpp
+++ b/src/core/NEON/kernels/arm_gemm/barrier.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/NEON/kernels/arm_gemm/bfloat.hpp b/src/core/NEON/kernels/arm_gemm/bfloat.hpp
index 547c668157..e585e59890 100644
--- a/src/core/NEON/kernels/arm_gemm/bfloat.hpp
+++ b/src/core/NEON/kernels/arm_gemm/bfloat.hpp
@@ -29,5 +29,4 @@ namespace arm_gemm {
using bfloat16 = arm_compute::bfloat16;
-} // namespace arm_gemm
-
+} // namespace arm_gemm
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/bias_adder.hpp b/src/core/NEON/kernels/arm_gemm/bias_adder.hpp
index 745d00563b..5d363fd68b 100644
--- a/src/core/NEON/kernels/arm_gemm/bias_adder.hpp
+++ b/src/core/NEON/kernels/arm_gemm/bias_adder.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp b/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp
index 001cab7f09..268b9ba6c7 100644
--- a/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp
+++ b/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp
@@ -303,32 +303,22 @@ public:
BufferManager(BufferManager &) = delete;
BufferManager & operator=(BufferManager &) = delete;
- BufferManager(const int maxthreads, const size_t buffersize, void *storage) : _storage(storage) {
- UNUSED(maxthreads);
- UNUSED(buffersize);
- }
+ BufferManager(const int, const size_t, void *storage) : _storage(storage) { }
~BufferManager() { }
// Say how much storage is needed.
- static inline size_t get_storage_requirement(const int maxthreads, const size_t buffersize) {
- UNUSED(maxthreads);
+ static inline size_t get_storage_requirement(const int, const size_t buffersize) {
return buffersize;
}
template <typename T>
- void try_populate(const int index, T func) {
- UNUSED(index);
- UNUSED(func);
- }
+ void try_populate(const int, T) { }
- void release(const int index) {
- UNUSED(index);
- }
+ void release(const int) { }
template <typename T>
- void *get(const int index, T func) {
- UNUSED(index);
+ void *get(const int, T func) {
func(_storage);
return _storage;
}
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
index 10fee472f4..fad0e84bbb 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_gemm.hpp"
+#include "bfloat.hpp"
#include "gemm_common.hpp"
#include "gemm_hybrid.hpp"
#include "gemm_implementation.hpp"
@@ -43,11 +44,8 @@
#include "kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp"
#include "kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp"
-#include "bfloat.hpp"
-
namespace arm_gemm {
-
static const GemmImplementation<bfloat16, float> gemm_bf16_methods[] =
{
#ifdef V8P6_BF
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
index b6671e8c85..6867a5f4b9 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
index 8bef2b7bae..1d5b97b41a 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,7 +44,9 @@
#include "kernels/a64_sgemv_trans.hpp"
#include "kernels/sve_hybrid_fp32_mla_4VLx4.hpp"
+#include "kernels/sve_hybrid_fp32_mmla_4VLx4.hpp"
#include "kernels/sve_interleaved_fp32_mla_3VLx8.hpp"
+#include "kernels/sve_interleaved_fp32_mmla_3VLx8.hpp"
#include "kernels/sve_native_fp32_mla_4VLx4.hpp"
#include "kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp"
@@ -75,6 +77,23 @@ static const GemmImplementation<float, float> gemm_fp32_methods[] =
[](const GemmArgs &args) { return new GemvNativeTransposed<sgemv_trans, float, float>(args); }
},
+#if defined(__ARM_FEATURE_SVE) && defined(MMLA_FP32)
+{
+ GemmMethod::GEMM_HYBRID,
+ "hybrid_fp32_mmla_4VLx4",
+ [](const GemmArgs &args) { return (args._Ksize >= 4) && !args._trA && args._pretransposed_hint; },
+ [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+ [](const GemmArgs &args) { return new GemmHybrid<hybrid_fp32_mmla_4VLx4, float, float>(args); }
+},
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "interleaved_fp32_mmla_3VLx8",
+ [](const GemmArgs &args) { return (args._Ksize>4); },
+ nullptr,
+ [](const GemmArgs &args) { return new GemmInterleaved<interleaved_fp32_mmla_3VLx8, float, float>(args); }
+},
+#endif // __ARM_FEATURE_SVE && MMLA_FP32
+
#ifdef __ARM_FEATURE_SVE
// SVE smallk / native / hybrid methods
{
@@ -124,7 +143,7 @@ static const GemmImplementation<float, float> gemm_fp32_methods[] =
},
{
GemmMethod::GEMM_HYBRID,
- "hybrid_fp32_mla_16x4_normal",
+ "hybrid_fp32_mla_16x4",
[](const GemmArgs &args) { return (args._Ksize >= 4) && !args._trA && args._pretransposed_hint; },
[](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || (args._Msize < 16) || (args._nmulti > 1); },
[](const GemmArgs &args) { return new GemmHybrid<hybrid_fp32_mla_16x4, float, float>(args); }
@@ -146,7 +165,7 @@ static const GemmImplementation<float, float> gemm_fp32_methods[] =
[](const GemmArgs &args) { return new GemmInterleaved<interleaved_fp32_mla_3VLx8, float, float>(args); }
},
#endif // __ARM_FEATURE_SVE
-//Pretranpose, 2D split
+// Pretranposed, 2D split
{
GemmMethod::GEMM_INTERLEAVED_2D,
"sgemm_12x8_pretranspose_2d",
@@ -154,7 +173,7 @@ static const GemmImplementation<float, float> gemm_fp32_methods[] =
[](const GemmArgs &args) { return args._maxthreads >= 8; },
[](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<sgemm_12x8, float, float>(args); }
},
-//Tranpose, 2D split, no blockmanager
+// Non-pretransposed, 2D split (no buffer manager)
{
GemmMethod::GEMM_INTERLEAVED_2D,
"sgemm_12x8_2d",
@@ -162,7 +181,7 @@ static const GemmImplementation<float, float> gemm_fp32_methods[] =
[](const GemmArgs &args) { return (!args._pretransposed_hint) && (args._maxthreads >= 8); },
[](const GemmArgs &args) { return new GemmInterleaved2d<sgemm_12x8, float, float>(args); }
},
-//Tranpose, 1D split, with blockmanager
+// 1D split (with pretransposed or not)
{
GemmMethod::GEMM_INTERLEAVED,
"sgemm_12x8_1d",
@@ -170,7 +189,6 @@ static const GemmImplementation<float, float> gemm_fp32_methods[] =
nullptr,
[](const GemmArgs &args) { return new GemmInterleaved<sgemm_12x8, float, float>(args); }
},
-
#endif // __aarch64__
#ifdef __arm__
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
index 0ce323e09d..2c666b63c2 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,9 +29,8 @@
#include "arm_gemm.hpp"
#include "bias_adder.hpp"
-#include "utils.hpp"
-
#include "ndrange.hpp"
+#include "utils.hpp"
#include "mergeresults.hpp"
#include "transform.hpp"
@@ -144,7 +143,7 @@ public:
// Interface implementation - Compulsory functions
ndrange_t get_window_size() const override {
- return { _window_range.total_size(), 1u, 1u, 1u, 1u, 1u };
+ return { _window_range.total_size() };
}
// This kernel can always be dynamically scheduled.
@@ -152,8 +151,8 @@ public:
return true;
}
- void execute_1d(unsigned int start, unsigned int end, int threadid) {
- UNUSED(threadid);
+ // Execute
+ void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override {
#ifdef CYCLE_PROFILING
profiler prof;
#endif
@@ -174,7 +173,7 @@ public:
const bool first_pass = (k0 == 0);
const bool last_pass = (kmax == _Ksize);
- auto p = _window_range.iterator(start, end);
+ auto p = _window_range.iterator(work_range.get_position(0), work_range.get_position_end(0));
if (p.done()) {
return;
@@ -194,7 +193,7 @@ public:
(n0 * kern_k);
#ifdef CYCLE_PROFILING
- auto p = prof.ScopedProfiler(PROFILE_KERNEL, (m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width()));
+ auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width()));
#endif
strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda) + k0, this->_lda,
@@ -215,17 +214,6 @@ public:
}
}
- // Execute
- void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
- UNUSED(thread_locator);
-
- const auto start = work_range.get_position(0);
- const auto size = work_range.get_size(0);
- const auto stop = start + size;
-
- execute_1d(start, stop, threadid);
- }
-
// Interface implementation - pretransposed
bool B_is_pretransposed() const override {
return true;
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
index 2b936d0b8f..36545c16ba 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,9 +28,8 @@
#include <algorithm>
#include "arm_gemm.hpp"
-#include "utils.hpp"
-
#include "ndrange.hpp"
+#include "utils.hpp"
#include "mergeresults.hpp"
#include "transform.hpp"
@@ -151,7 +150,7 @@ public:
// Interface implementation - Compulsory functions
ndrange_t get_window_size() const override {
- return { _window_range.total_size(), 1u, 1u, 1u, 1u, 1u };
+ return { _window_range.total_size() };
}
// This kernel can always be dynamically scheduled.
@@ -159,7 +158,8 @@ public:
return true;
}
- void execute_1d(unsigned int start, unsigned int end, int threadid) {
+ // Execute
+ void execute(const ndcoord_t &work_range, const ndcoord_t &, int threadid) override {
#ifdef CYCLE_PROFILING
profiler prof;
#endif
@@ -180,7 +180,7 @@ public:
unsigned int kmax = std::min(k0 + _k_block, _Ksize);
unsigned int kern_k = roundup(kmax-k0, strategy::k_unroll());
- auto p = _window_range.iterator(start, end);
+ auto p = _window_range.iterator(work_range.get_position(0), work_range.get_position_end(0));
if (p.done()) {
return;
@@ -234,17 +234,6 @@ public:
}
}
- // Execute
- void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
- UNUSED(thread_locator);
-
- const auto start = work_range.get_position(0);
- const auto size = work_range.get_size(0);
- const auto stop = start + size;
-
- execute_1d(start, stop, threadid);
- }
-
// Working space needed for intermediate result buffers.
size_t get_working_size() const override {
return (_nthreads * strategy::out_height() * _Nsize * sizeof(Tri));
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
index 569d1f44ca..c726d7b0aa 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
-#include <arm_gemm.hpp>
+#include "arm_gemm.hpp"
#include <functional>
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
index e3b4416f68..da682330a0 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
index f7d8f65aea..8dd0df5603 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -47,7 +47,7 @@ namespace arm_gemm {
static const GemmImplementation<int8_t, int32_t> gemm_s8_methods[] = {
#ifdef __ARM_FEATURE_SVE
-#ifdef V8P6
+#ifdef MMLA_INT8
{
GemmMethod::GEMM_INTERLEAVED,
"interleaved_s8s32_mmla_3VLx8",
@@ -85,7 +85,7 @@ static const GemmImplementation<int8_t, int32_t> gemm_s8_methods[] = {
[](const GemmArgs &args) { return new GemmInterleaved<interleaved_s8s32_dot_3VLx8, int8_t, int32_t>(args); }
},
#endif
-#ifdef V8P6
+#ifdef MMLA_INT8
{
GemmMethod::GEMM_INTERLEAVED,
"interleaved_s8s32_mmla_12x8",
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index 4897bedf47..f572f7940b 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -386,8 +386,8 @@ public:
// not multi for now (as this would cause problems with the buffer
// manager).
ndrange_t get_window_size() const override {
- auto m_win_size = (_Mround / strategy::out_height()) * _nbatches;
- return { m_win_size, 1u, 1u, 1u, 1u, 1u };
+ // _Mround is a multiple of out_height by definition.
+ return { (_Mround / strategy::out_height()) * _nbatches };
}
// set_nthreads: pass on to buffer manager to avoid it waiting for non-existant threads.
@@ -399,7 +399,10 @@ public:
}
// Execute
- void execute_1d(unsigned int start, unsigned int end, int threadid) {
+ void execute(const ndcoord_t &work_range, const ndcoord_t &, int threadid) override {
+ const auto start = work_range.get_position(0);
+ const auto end = work_range.get_position_end(0);
+
if (_pretransposed) {
execute_internal<true>(start, end, threadid);
} else {
@@ -407,16 +410,6 @@ public:
}
}
- //Execute
- void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
- UNUSED(thread_locator);
-
- const auto start = work_range.get_position(0);
- const auto stop = work_range.get_position_end(0);
-
- execute_1d(start, stop, threadid);
- }
-
// Interface implementation - working space
size_t get_working_size() const override {
// In all cases, we need one A buffer plus a C buffer per thread.
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp
index 53f8e6c938..376d19cc65 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp
@@ -170,9 +170,7 @@ class GemmInterleaved2d : public GemmCommon<To, Tr> {
return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
}
- void execute_transpose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int mthreadid, int nthreadid) {
- UNUSED(mthreadid);
-
+ void execute_transpose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int, int nthreadid) {
strategy strat(_ci);
/* Translate 'start' and 'end' into a position within the batches and rows. */
@@ -382,7 +380,7 @@ public:
unsigned m = (_Mround / strategy::out_height()) * _nbatches;
unsigned n = _Nround_div;
- return { m, n, 1u, 1u, 1u, 1u };
+ return { m, n };
}
// set_nthreads: pass on to buffer manager to avoid it waiting for non-existant threads.
@@ -395,8 +393,6 @@ public:
* This particular GEMM implementation can only be broken up over the M & N
* dimensions, we inform the frame work of this limitation via the get_window_size function
*/
- assert(ndrange_popcount(work_range) <= 2);
-
const auto m_start = work_range.get_position(0);
const auto n_start = work_range.get_position(1);
const auto m_size = work_range.get_size(0);
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
index eff4877198..38fb26370c 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
@@ -173,16 +173,13 @@ class GemmInterleavedPretransposed2d : public GemmCommon<To, Tr> {
// Internal execute function.
// This supports both the "pretransposed" and "standard" interfaces via the template parameter.
- void execute_pretranspose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int mthreadid, int nthreadid) {
+ void execute_pretranspose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int, int) {
/* Make sure we've been set up correctly. */
assert(_B_transposed);
assert(_working_space);
assert(this->_Aptr);
assert(this->_Cptr);
- UNUSED(mthreadid);
- UNUSED(nthreadid);
-
#ifdef CYCLE_PROFILING
profiler prof;
#endif
@@ -389,7 +386,7 @@ public:
unsigned m = (_Mround / strategy::out_height()) * _nbatches;
unsigned n = _Nround_div;
- return { m, n, 1u, 1u, 1u, 1u };
+ return { m, n };
}
// set_nthreads: pass on to buffer manager to avoid it waiting for non-existant threads.
@@ -401,8 +398,6 @@ public:
/* This particular GEMM implementation can only be broken up over the M & N
* dimensions, we inform the frame work of this limitation via the get_window_size function
*/
- assert(ndrange_popcount(work_range) <= 2);
-
const auto m_start = work_range.get_position(0);
const auto n_start = work_range.get_position(1);
const auto m_size = work_range.get_size(0);
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
index c2f742b5cf..cddbd51e32 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
@@ -88,7 +88,7 @@ public:
// Window is amount per multi multiplied by total number of multis.
ndrange_t get_window_size() const override {
- return { _window_range.total_size(), 1u, 1u, 1u, 1u, 1u };
+ return { _window_range.total_size() };
}
// Native GEMMs can always be dynamically scheduled (whether requested or not)
@@ -97,7 +97,7 @@ public:
}
// Actually execute the GEMM.
- void execute_1d(unsigned int start, unsigned int end, int) {
+ void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override {
#ifdef CYCLE_PROFILING
profiler prof;
#endif
@@ -106,7 +106,7 @@ public:
static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same.");
static_assert(std::is_same<Tr, Tri>::value, "gemm_native: Result types must be the same.");
- auto p = _window_range.iterator(start, end);
+ auto p = _window_range.iterator(work_range.get_position(0), work_range.get_position_end(0));
if (p.done()) {
return;
@@ -139,16 +139,6 @@ public:
}
} while (p.next_dim1());
}
-
- //Execute
- void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
- UNUSED(thread_locator);
-
- const auto start = work_range.get_position(0);
- const auto stop = work_range.get_position_end(0);
-
- execute_1d(start, stop, threadid);
- }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
index 85a8a6720a..5e06443e19 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
index 430d35e06d..d74f335e38 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -47,7 +47,7 @@ namespace arm_gemm {
static const GemmImplementation<uint8_t, uint32_t> gemm_u8_methods[] = {
#ifdef __ARM_FEATURE_SVE
-#ifdef V8P6
+#ifdef MMLA_INT8
{
GemmMethod::GEMM_INTERLEAVED,
"interleaved_u8u32_mmla_3VLx8",
@@ -85,7 +85,7 @@ static const GemmImplementation<uint8_t, uint32_t> gemm_u8_methods[] = {
[](const GemmArgs &args) { return new GemmInterleaved<interleaved_u8u32_dot_3VLx8, uint8_t, uint32_t>(args); }
},
#endif
-#ifdef V8P6
+#ifdef MMLA_INT8
{
GemmMethod::GEMM_INTERLEAVED,
"interleaved_u8u32_mmla_12x8",
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
index 939788ed8d..12216009d2 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,17 +45,15 @@ public:
_subgemm = gemm<To,Tr>(newargs);
}
- void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
+ void set_arrays(const To *A, const int, const int A_batch_stride, const int A_multi_stride,
const To *B, const int ldb, const int B_multi_stride,
- Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
+ Tr *C, const int, const int C_batch_stride, const int C_multi_stride,
const Tr *bias, const int bias_multi_stride) override {
/* A and C's batch stride becomes their new row stride. New batch stride is 0 as nbatches for subgemm is always 1. */
_subgemm->set_arrays(A, A_batch_stride, 0, A_multi_stride,
B, ldb, B_multi_stride,
C, C_batch_stride, 0, C_multi_stride,
bias, bias_multi_stride);
- UNUSED(lda);
- UNUSED(ldc);
}
ndrange_t get_window_size() const override {
@@ -66,7 +64,7 @@ public:
_subgemm->set_nthreads(nthreads);
}
- void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
+ void execute(const ndcoord_t &work_range, const ndcoord_t &thread_locator, int threadid) override {
_subgemm->execute(work_range, thread_locator, threadid);
}
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
index 190f4aa643..9209d48bd9 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -73,16 +73,19 @@ public:
// Window is number of out_width blocks times number of multis.
ndrange_t get_window_size() const override {
- return { iceildiv(_Nsize, strategy::out_width()) * _nmultis, 1u, 1u, 1u, 1u, 1u };
+ return { iceildiv(_Nsize, strategy::out_width()) * _nmultis };
}
// Actually execute the GEMV.
- void execute_1d(unsigned int start, unsigned int end, int) {
+ void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override {
#ifdef CYCLE_PROFILING
profiler prof;
#endif
strategy strat(_ci);
+ const auto start = work_range.get_position(0);
+ const auto end = work_range.get_position_end(0);
+
const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width());
const unsigned int multi_0 = start / window_per_multi;
const unsigned int multi_end = end / window_per_multi;
@@ -127,17 +130,6 @@ public:
}
}
}
-
- // Execute
- void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
- UNUSED(thread_locator);
-
- const auto start = work_range.get_position(0);
- const auto size = work_range.get_size(0);
- const auto stop = start + size;
-
- execute_1d(start, stop, threadid);
- }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
index 7f52ac5a14..945e363839 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,16 +87,19 @@ public:
// Window is number of out_width blocks, times number of multis.
ndrange_t get_window_size() const override {
- return { iceildiv(_Nsize, strategy::out_width()) * _nmultis, 1u, 1u, 1u, 1u, 1u };
+ return { iceildiv(_Nsize, strategy::out_width()) * _nmultis };
}
// Actually execute the GEMV.
- void execute_1d(unsigned int start, unsigned int end, int) {
+ void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override {
#ifdef CYCLE_PROFILING
profiler prof;
#endif
strategy strat(_ci);
+ const auto start = work_range.get_position(0);
+ const auto end = work_range.get_position_end(0);
+
/* Break the window values down into multis of interest... */
const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width());
const unsigned int multi_0 = start / window_per_multi;
@@ -145,17 +148,6 @@ public:
}
}
- // Execute
- void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
- UNUSED(thread_locator);
-
- const auto start = work_range.get_position(0);
- const auto size = work_range.get_size(0);
- const auto stop = start + size;
-
- execute_1d(start, stop, threadid);
- }
-
/* Pretransposed interface implementation */
bool B_is_pretransposed() const override {
return true;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
index 8700c42f5d..0f0e5a7ed4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -65,7 +65,7 @@ public:
kern_type kernel = a64_gemm_s16_asimd_12x8;
- gemm_s16_12x8(const CPUInfo *ci) { UNUSED(ci); }
+ gemm_s16_12x8(const CPUInfo *) { }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
index cc6c583b33..e5b295b640 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
@@ -34,6 +34,7 @@ namespace arm_gemm {
// Load the actual kernel
void a64_gemm_s8_12x8(const int8_t *, const int8_t *, int32_t *, int, int, int);
void a64_gemm_s8_12x8_a55r1(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void a64_gemm_s8_12x8_x1(const int8_t *, const int8_t *, int32_t *, int, int, int);
class gemm_s8_12x8 {
public:
@@ -65,6 +66,8 @@ public:
if (mod == CPUModel::A55r1) {
kernel = a64_gemm_s8_12x8_a55r1;
+ } else if (mod == CPUModel::X1) {
+ kernel = a64_gemm_s8_12x8_x1;
}
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp
new file mode 100644
index 0000000000..446fcf8707
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm {
+
+void a64_gemm_s8_12x8_x1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
+ const int8_t *a_ptr = Apanel;
+ int32_t *c_ptr = Cpanel;
+ // We divide K by 4 because the sdot instruction processes 4 elements at a time.
+ const int W = K/4;
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ const int oddk = (W & 1);
+ const int init_value_k = ((W+1)/2) - 1;
+ for (int yb=0; yb<ablocks; yb++) {
+ const int8_t *a_ptr0 = a_ptr;
+ const int8_t *b_ptr = Bpanel;
+ for (int xb=0; xb<bblocks; xb++) {
+ a_ptr = a_ptr0;
+ int k = init_value_k;
+ register uint8x16_t a0 asm("v0");
+ register uint8x16_t a1 asm("v1");
+ register uint8x16_t b0 asm("v2");
+ register uint8x16_t b1 asm("v3");
+ register uint8x16_t b2 asm("v4");
+
+ __asm __volatile (
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #64]")
+ "movi v13.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #64]")
+ "movi v14.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #128]")
+ "movi v15.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]")
+ "movi v16.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #192]")
+ "movi v17.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #256]")
+ "movi v18.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #192]")
+ "movi v19.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #320]")
+ "movi v20.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #256]")
+ "movi v21.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ // Loop proper
+ "1:\n"
+ ".word 0x4f80e048 // sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ ".word 0x4fa0e049 // sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ ".word 0x4f80e84a // sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ ".word 0x4fa0e84b // sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ ".word 0x4f81e04c // sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ ".word 0x4fa1e04d // sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ ".word 0x4f81e84e // sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ ".word 0x4fa1e84f // sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ ".word 0x4f80e070 // sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ ".word 0x4fa0e071 // sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ ASM_PREFETCH("[%[a_ptr], #320]")
+ ".word 0x4f80e872 // sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ ".word 0x4fa0e873 // sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ ".word 0x4f81e074 // sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ ".word 0x4fa1e075 // sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ ".word 0x4f81e876 // sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ ".word 0x4fa1e877 // sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ ".word 0x4f80e098 // sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ ".word 0x4fa0e099 // sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #448]")
+ ".word 0x4f80e89a // sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ ".word 0x4fa0e89b // sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "ldr %q[a0], [%[a_ptr], #32]\n"
+ ".word 0x4f81e09c // sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ ".word 0x4fa1e09d // sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ ".word 0x4f81e89e // sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ ".word 0x4fa1e89f // sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "ldr %q[a1], [%[a_ptr], #48]\n"
+
+ ".word 0x4f80e048 // sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ ".word 0x4fa0e049 // sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+ ".word 0x4f80e84a // sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ ".word 0x4fa0e84b // sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ ".word 0x4f81e04c // sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ ".word 0x4fa1e04d // sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ ".word 0x4f81e84e // sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ ".word 0x4fa1e84f // sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #96]\n"
+
+ ".word 0x4f80e070 // sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ ".word 0x4fa0e071 // sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #512]")
+ ".word 0x4f80e872 // sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ ".word 0x4fa0e873 // sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "subs %w[k], %w[k], #1\n"
+ ".word 0x4f81e074 // sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ ".word 0x4fa1e075 // sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ ".word 0x4f81e876 // sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ ".word 0x4fa1e877 // sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #112]\n"
+
+ ".word 0x4f80e098 // sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ ".word 0x4fa0e099 // sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ ".word 0x4f80e89a // sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ ".word 0x4fa0e89b // sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ ".word 0x4f81e09c // sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ ".word 0x4fa1e09d // sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ ".word 0x4f81e89e // sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ ".word 0x4fa1e89f // sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "bne 1b\n"
+
+ // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
+ "4:\n"
+
+ // Branch to alternative tail for odd K
+ "cbnz %w[oddk], 2f\n"
+
+ // Detached final iteration (even K)
+ ".word 0x4f80e048 // sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ ".word 0x4fa0e049 // sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ ".word 0x4f80e84a // sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ ".word 0x4fa0e84b // sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ ".word 0x4f81e04c // sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ ".word 0x4fa1e04d // sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ ".word 0x4f81e84e // sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ ".word 0x4fa1e84f // sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ ".word 0x4f80e070 // sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ ".word 0x4fa0e071 // sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ ".word 0x4f80e872 // sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ ".word 0x4fa0e873 // sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ ".word 0x4f81e074 // sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ ".word 0x4fa1e075 // sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ ".word 0x4f81e876 // sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ ".word 0x4fa1e877 // sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ ".word 0x4f80e098 // sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ ".word 0x4fa0e099 // sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ ".word 0x4f80e89a // sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ ".word 0x4fa0e89b // sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "ldr %q[a0], [%[a_ptr], #-32]\n"
+ ".word 0x4f81e09c // sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ ".word 0x4fa1e09d // sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ ".word 0x4f81e89e // sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ ".word 0x4fa1e89f // sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "ldr %q[a1], [%[a_ptr], #-16]\n"
+
+ ".word 0x4f80e048 // sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+
+ ".word 0x4f80e070 // sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ ".word 0x4fa0e049 // sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ ".word 0x4fa0e071 // sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ ".word 0x4f80e098 // sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "str q24, [%[c_ptr], #32]\n"
+
+ ".word 0x4fa0e099 // sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ ".word 0x4f80e84a // sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ ".word 0x4f80e872 // sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ ".word 0x4f80e89a // sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ ".word 0x4fa0e84b // sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ ".word 0x4fa0e873 // sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ ".word 0x4fa0e89b // sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ ".word 0x4f81e04c // sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ ".word 0x4f81e074 // sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ ".word 0x4f81e09c // sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ ".word 0x4fa1e04d // sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ ".word 0x4fa1e075 // sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ ".word 0x4fa1e09d // sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ ".word 0x4f81e84e // sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ ".word 0x4f81e876 // sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ ".word 0x4f81e89e // sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ ".word 0x4fa1e84f // sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ ".word 0x4fa1e877 // sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ ".word 0x4fa1e89f // sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+ "b 3f\n"
+
+ // Detached final iteration (odd K)
+ "2:\n"
+ ".word 0x4f80e048 // sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ ".word 0x4f80e070 // sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ ".word 0x4fa0e049 // sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ ".word 0x4fa0e071 // sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ ".word 0x4f80e098 // sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "str q24, [%[c_ptr], #32]\n"
+ ".word 0x4fa0e099 // sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+
+ ".word 0x4f80e84a // sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ ".word 0x4f80e872 // sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ ".word 0x4f80e89a // sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ ".word 0x4fa0e84b // sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ ".word 0x4fa0e873 // sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ ".word 0x4fa0e89b // sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ ".word 0x4f81e04c // sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ ".word 0x4f81e074 // sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ ".word 0x4f81e09c // sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ ".word 0x4fa1e04d // sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ ".word 0x4fa1e075 // sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ ".word 0x4fa1e09d // sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ ".word 0x4f81e84e // sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ ".word 0x4f81e876 // sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ ".word 0x4f81e89e // sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ ".word 0x4fa1e84f // sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ ".word 0x4fa1e877 // sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ ".word 0x4fa1e89f // sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+
+ // Common tail
+ "3:\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+ :
+ [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+ [a0] "+w" (a0), [a1] "+w" (a1),
+ [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+ : [oddk] "r" (oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
+ );
+
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
index fb21bfc863..256acc4c65 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -59,7 +59,7 @@ public:
kern_type kernel=a64_gemm_s8_4x4;
- gemm_s8_4x4(const CPUInfo *ci) { UNUSED(ci); }
+ gemm_s8_4x4(const CPUInfo *) { }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
index 971b027c44..b86204043c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -65,7 +65,7 @@ public:
kern_type kernel = a64_gemm_u16_asimd_12x8;
- gemm_u16_12x8(const CPUInfo *ci) { UNUSED(ci); }
+ gemm_u16_12x8(const CPUInfo *) { }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
index a67e2d6c84..52ce5d26d9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
@@ -32,6 +32,7 @@ namespace arm_gemm {
// Load the actual kernel
void a64_gemm_u8_12x8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
void a64_gemm_u8_12x8_a55r1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void a64_gemm_u8_12x8_x1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
class gemm_u8_12x8 {
public:
@@ -73,6 +74,8 @@ public:
if (mod == CPUModel::A55r1) {
kernel = a64_gemm_u8_12x8_a55r1;
+ } else if (mod == CPUModel::X1) {
+ kernel = a64_gemm_u8_12x8_x1;
}
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp
new file mode 100644
index 0000000000..7fac67354f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp
@@ -0,0 +1,348 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm {
+
+void a64_gemm_u8_12x8_x1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+ const uint8_t *a_ptr = Apanel;
+ uint32_t *c_ptr = Cpanel;
+ // We divide K by 4 because the udot instruction processes 4 elements at a time.
+ const int W = K/4;
+    // Fix up for odd lengths - set a flag if the number of 4-element blocks (W) is odd,
+    // but make sure we round up the iteration count.
+ const int oddk = (W & 1);
+ const int init_value_k = ((W+1)/2) - 1;
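+    // Worked example of the counts above: K=20 gives W=5, oddk=1, init_value_k=2, so the
+    // main loop runs twice (two 4-element blocks per pass) and the odd-K tail consumes
+    // the fifth block; K=16 gives W=4, oddk=0, init_value_k=1, with the even-K tail
+    // consuming the last two blocks.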
+ for (int yb=0; yb<ablocks; yb++) {
+ const uint8_t *a_ptr0 = a_ptr;
+ const uint8_t *b_ptr = Bpanel;
+ for (int xb=0; xb<bblocks; xb++) {
+ a_ptr = a_ptr0;
+ int k = init_value_k;
+ register uint8x16_t a0 asm("v0");
+ register uint8x16_t a1 asm("v1");
+ register uint8x16_t b0 asm("v2");
+ register uint8x16_t b1 asm("v3");
+ register uint8x16_t b2 asm("v4");
+
+ __asm __volatile (
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #64]")
+ "movi v13.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #64]")
+ "movi v14.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #128]")
+ "movi v15.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]")
+ "movi v16.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #192]")
+ "movi v17.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #256]")
+ "movi v18.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #192]")
+ "movi v19.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #320]")
+ "movi v20.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #256]")
+ "movi v21.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ // Loop proper
+ "1:\n"
+ ".word 0x6f80e048 // udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ ".word 0x6fa0e049 // udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ ".word 0x6f80e84a // udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ ".word 0x6fa0e84b // udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ ".word 0x6f81e04c // udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ ".word 0x6fa1e04d // udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ ".word 0x6f81e84e // udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ ".word 0x6fa1e84f // udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ ".word 0x6f80e070 // udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ ".word 0x6fa0e071 // udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ ASM_PREFETCH("[%[a_ptr], #320]")
+ ".word 0x6f80e872 // udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ ".word 0x6fa0e873 // udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ ".word 0x6f81e074 // udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ ".word 0x6fa1e075 // udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ ".word 0x6f81e876 // udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ ".word 0x6fa1e877 // udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ ".word 0x6f80e098 // udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ ".word 0x6fa0e099 // udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #448]")
+ ".word 0x6f80e89a // udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ ".word 0x6fa0e89b // udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "ldr %q[a0], [%[a_ptr], #32]\n"
+ ".word 0x6f81e09c // udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ ".word 0x6fa1e09d // udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ ".word 0x6f81e89e // udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ ".word 0x6fa1e89f // udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "ldr %q[a1], [%[a_ptr], #48]\n"
+
+ ".word 0x6f80e048 // udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ ".word 0x6fa0e049 // udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+ ".word 0x6f80e84a // udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ ".word 0x6fa0e84b // udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ ".word 0x6f81e04c // udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ ".word 0x6fa1e04d // udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ ".word 0x6f81e84e // udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ ".word 0x6fa1e84f // udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #96]\n"
+
+ ".word 0x6f80e070 // udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ ".word 0x6fa0e071 // udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #512]")
+ ".word 0x6f80e872 // udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ ".word 0x6fa0e873 // udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "subs %w[k], %w[k], #1\n"
+ ".word 0x6f81e074 // udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ ".word 0x6fa1e075 // udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ ".word 0x6f81e876 // udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ ".word 0x6fa1e877 // udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #112]\n"
+
+ ".word 0x6f80e098 // udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ ".word 0x6fa0e099 // udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ ".word 0x6f80e89a // udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ ".word 0x6fa0e89b // udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ ".word 0x6f81e09c // udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ ".word 0x6fa1e09d // udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ ".word 0x6f81e89e // udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ ".word 0x6fa1e89f // udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "bne 1b\n"
+
+            // Target to use when W is 1 or 2 (i.e. zero iterations of the main loop)
+ "4:\n"
+
+ // Branch to alternative tail for odd K
+ "cbnz %w[oddk], 2f\n"
+
+ // Detached final iteration (even K)
+ ".word 0x6f80e048 // udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ ".word 0x6fa0e049 // udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ ".word 0x6f80e84a // udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ ".word 0x6fa0e84b // udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ ".word 0x6f81e04c // udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ ".word 0x6fa1e04d // udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ ".word 0x6f81e84e // udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ ".word 0x6fa1e84f // udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ ".word 0x6f80e070 // udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ ".word 0x6fa0e071 // udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ ".word 0x6f80e872 // udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ ".word 0x6fa0e873 // udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ ".word 0x6f81e074 // udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ ".word 0x6fa1e075 // udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ ".word 0x6f81e876 // udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ ".word 0x6fa1e877 // udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ ".word 0x6f80e098 // udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ ".word 0x6fa0e099 // udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ ".word 0x6f80e89a // udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ ".word 0x6fa0e89b // udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "ldr %q[a0], [%[a_ptr], #-32]\n"
+ ".word 0x6f81e09c // udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ ".word 0x6fa1e09d // udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ ".word 0x6f81e89e // udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ ".word 0x6fa1e89f // udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "ldr %q[a1], [%[a_ptr], #-16]\n"
+
+ ".word 0x6f80e048 // udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+
+ ".word 0x6f80e070 // udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ ".word 0x6fa0e049 // udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ ".word 0x6fa0e071 // udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ ".word 0x6f80e098 // udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "str q24, [%[c_ptr], #32]\n"
+
+ ".word 0x6fa0e099 // udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ ".word 0x6f80e84a // udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ ".word 0x6f80e872 // udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ ".word 0x6f80e89a // udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ ".word 0x6fa0e84b // udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ ".word 0x6fa0e873 // udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ ".word 0x6fa0e89b // udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ ".word 0x6f81e04c // udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ ".word 0x6f81e074 // udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ ".word 0x6f81e09c // udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ ".word 0x6fa1e04d // udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ ".word 0x6fa1e075 // udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ ".word 0x6fa1e09d // udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ ".word 0x6f81e84e // udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ ".word 0x6f81e876 // udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ ".word 0x6f81e89e // udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ ".word 0x6fa1e84f // udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ ".word 0x6fa1e877 // udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ ".word 0x6fa1e89f // udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+ "b 3f\n"
+
+ // Detached final iteration (odd K)
+ "2:\n"
+ ".word 0x6f80e048 // udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ ".word 0x6f80e070 // udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ ".word 0x6fa0e049 // udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ ".word 0x6fa0e071 // udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ ".word 0x6f80e098 // udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "str q24, [%[c_ptr], #32]\n"
+ ".word 0x6fa0e099 // udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+
+ ".word 0x6f80e84a // udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ ".word 0x6f80e872 // udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ ".word 0x6f80e89a // udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ ".word 0x6fa0e84b // udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ ".word 0x6fa0e873 // udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ ".word 0x6fa0e89b // udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ ".word 0x6f81e04c // udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ ".word 0x6f81e074 // udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ ".word 0x6f81e09c // udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ ".word 0x6fa1e04d // udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ ".word 0x6fa1e075 // udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ ".word 0x6fa1e09d // udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ ".word 0x6f81e84e // udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ ".word 0x6f81e876 // udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ ".word 0x6f81e89e // udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ ".word 0x6fa1e84f // udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ ".word 0x6fa1e877 // udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ ".word 0x6fa1e89f // udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+
+ // Common tail
+ "3:\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+
+ :
+ [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+ [a0] "+w" (a0), [a1] "+w" (a1),
+ [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+ : [oddk] "r" (oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
+ );
+
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
index 8bde3a6943..134007b74c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -67,9 +67,7 @@ public:
kern_type kernel = a64_gemm_u8_4x4;
- gemm_u8_4x4(const CPUInfo *ci) {
- UNUSED(ci);
- }
+ gemm_u8_4x4(const CPUInfo *) { }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
index 8e17aa6663..79cae6002a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,6 +32,7 @@ namespace arm_gemm {
// Actual kernel implementations
void a64_hgemm_asimd_24x8(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
void a64_hgemm_asimd_24x8_a55r1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+void a64_hgemm_asimd_24x8_x1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
// 24x8 HGEMM "strategy" class. Describes the kernel properties.
//
@@ -68,6 +69,8 @@ public:
if (model == CPUModel::A55r1) {
kernel = a64_hgemm_asimd_24x8_a55r1;
+ } else if (model == CPUModel::X1) {
+ kernel = a64_hgemm_asimd_24x8_x1;
}
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp
new file mode 100644
index 0000000000..3bb8334126
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp
@@ -0,0 +1,350 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// Build on AArch64 where either FP16_KERNELS is set or FP16 is explicitly supported.
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 24xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 24x8), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
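+//
+// Register layout: each of the 8 output rows needs three 8-lane fp16 accumulators to
+// cover the 24 columns (b0/b1/b2 each hold one 8-wide strip of B), so v8-v31 are all
+// used as accumulators, three per row.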
+
+namespace arm_gemm {
+
+void a64_hgemm_asimd_24x8_x1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
+ const __fp16 *a_ptr = Apanel;
+ __fp16 *c_ptr = Cpanel;
+
+ for (int yb=0; yb<ablocks; yb++) {
+ const __fp16 *a_ptr0 = a_ptr;
+ const __fp16 *b_ptr = Bpanel;
+
+ for (int xb=0; xb<bblocks; xb++) {
+ a_ptr = a_ptr0;
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ int oddk = (K & 1);
+ int k = ((K+1)/2) - 1;
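+            // Worked example of the counts above: K=7 gives oddk=1 and k=3, so the main
+            // loop runs three times (two columns of A per pass) and the odd tail handles
+            // the seventh; K=8 gives oddk=0 and k=3, with the even tail handling the
+            // last two columns.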
+
+ register float16x8_t a0 asm("v0");
+ register float16x8_t a0a asm("v1");
+ register float16x8_t b0 asm("v2");
+ register float16x8_t b1 asm("v3");
+ register float16x8_t b2 asm("v4");
+
+ __asm __volatile (
+ // Enable FP16 instruction support (but only if it's not already on).
+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ ".arch armv8.2-a+fp16\n"
+#endif
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.8h, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.8h, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.8h, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v11.8h, #0x0\n"
+ "movi v12.8h, #0x0\n"
+ "movi v13.8h, #0x0\n"
+ "movi v14.8h, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #64]")
+ "movi v15.8h, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #128]")
+ "movi v16.8h, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #64]")
+ "movi v17.8h, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #192]")
+ "movi v18.8h, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #256]")
+ "movi v19.8h, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #320]")
+ "movi v20.8h, #0x0\n"
+ "movi v21.8h, #0x0\n"
+ "movi v22.8h, #0x0\n"
+ "movi v23.8h, #0x0\n"
+ "movi v24.8h, #0x0\n"
+ "movi v25.8h, #0x0\n"
+ "movi v26.8h, #0x0\n"
+ "movi v27.8h, #0x0\n"
+ "movi v28.8h, #0x0\n"
+ "movi v29.8h, #0x0\n"
+ "movi v30.8h, #0x0\n"
+ "movi v31.8h, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ "1:\n"
+ "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
+ "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
+ "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
+ "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
+ "fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
+ "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
+ "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
+ "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
+ ASM_PREFETCH("[%[a_ptr], #128]")
+ "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
+ "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
+ "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
+ "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
+ "ldr %q[b1], [%[b_ptr], #-32]\n"
+
+ "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
+ "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #288]")
+ "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #16]\n"
+ "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
+ "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
+ "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
+ "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
+ "ldr %q[b2], [%[b_ptr], #-16]\n"
+
+ "fmla v8.8h , %[b0].8h, %[a0a].h[0]\n"
+ "fmla v9.8h , %[b0].8h, %[a0a].h[1]\n"
+ "fmla v10.8h, %[b0].8h, %[a0a].h[2]\n"
+ "fmla v11.8h, %[b0].8h, %[a0a].h[3]\n"
+ "fmla v12.8h, %[b0].8h, %[a0a].h[4]\n"
+ "fmla v13.8h, %[b0].8h, %[a0a].h[5]\n"
+ "fmla v14.8h, %[b0].8h, %[a0a].h[6]\n"
+ "fmla v15.8h, %[b0].8h, %[a0a].h[7]\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+
+ "fmla v16.8h, %[b1].8h, %[a0a].h[0]\n"
+ "fmla v17.8h, %[b1].8h, %[a0a].h[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #352]")
+ "fmla v18.8h, %[b1].8h, %[a0a].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0a].h[3]\n"
+ "fmla v20.8h, %[b1].8h, %[a0a].h[4]\n"
+ "fmla v21.8h, %[b1].8h, %[a0a].h[5]\n"
+ "fmla v22.8h, %[b1].8h, %[a0a].h[6]\n"
+ "fmla v23.8h, %[b1].8h, %[a0a].h[7]\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+
+ "fmla v24.8h, %[b2].8h, %[a0a].h[0]\n"
+ "fmla v25.8h, %[b2].8h, %[a0a].h[1]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v26.8h, %[b2].8h, %[a0a].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0a].h[3]\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "fmla v28.8h, %[b2].8h, %[a0a].h[4]\n"
+ "fmla v29.8h, %[b2].8h, %[a0a].h[5]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v30.8h, %[b2].8h, %[a0a].h[6]\n"
+ "fmla v31.8h, %[b2].8h, %[a0a].h[7]\n"
+
+ "bne 1b\n"
+ "4:\n"
+
+ // Jump to odd tail if necessary.
+ "cbnz %w[oddk], 2f\n"
+
+ // Even tail.
+ "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
+ "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
+ "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
+ "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
+ "fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
+ "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
+ "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
+ "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
+ "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
+ "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
+ "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
+ "ldr %q[b1], [%[b_ptr], #-32]\n"
+
+ "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
+ "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
+ "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #-16]\n"
+ "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
+ "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
+ "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
+ "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
+ "ldr %q[b2], [%[b_ptr], #-16]\n"
+
+ "fmla v8.8h , %[b0].8h, %[a0a].h[0]\n"
+ "fmla v16.8h, %[b1].8h, %[a0a].h[0]\n"
+ "str q8, [%[c_ptr]]\n"
+ "fmla v24.8h, %[b2].8h, %[a0a].h[0]\n"
+ "str q16, [%[c_ptr], #16]\n"
+
+ "fmla v9.8h , %[b0].8h, %[a0a].h[1]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "fmla v17.8h, %[b1].8h, %[a0a].h[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "fmla v25.8h, %[b2].8h, %[a0a].h[1]\n"
+ "str q17, [%[c_ptr], #64]\n"
+
+ "fmla v10.8h, %[b0].8h, %[a0a].h[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "fmla v18.8h, %[b1].8h, %[a0a].h[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "fmla v26.8h, %[b2].8h, %[a0a].h[2]\n"
+ "str q18, [%[c_ptr], #112]\n"
+
+ "fmla v11.8h, %[b0].8h, %[a0a].h[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "fmla v19.8h, %[b1].8h, %[a0a].h[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "fmla v27.8h, %[b2].8h, %[a0a].h[3]\n"
+ "str q19, [%[c_ptr], #160]\n"
+
+ "fmla v12.8h, %[b0].8h, %[a0a].h[4]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "fmla v20.8h, %[b1].8h, %[a0a].h[4]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "fmla v28.8h, %[b2].8h, %[a0a].h[4]\n"
+ "str q20, [%[c_ptr], #208]\n"
+
+ "fmla v13.8h, %[b0].8h, %[a0a].h[5]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "fmla v21.8h, %[b1].8h, %[a0a].h[5]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "fmla v29.8h, %[b2].8h, %[a0a].h[5]\n"
+ "str q21, [%[c_ptr], #256]\n"
+
+ "fmla v14.8h, %[b0].8h, %[a0a].h[6]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "fmla v22.8h, %[b1].8h, %[a0a].h[6]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "fmla v30.8h, %[b2].8h, %[a0a].h[6]\n"
+ "str q22, [%[c_ptr], #304]\n"
+
+ "fmla v15.8h, %[b0].8h, %[a0a].h[7]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "fmla v23.8h, %[b1].8h, %[a0a].h[7]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "fmla v31.8h, %[b2].8h, %[a0a].h[7]\n"
+ "b 3f\n"
+
+ // Odd tail
+ "2:\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
+ "add %[a_ptr], %[a_ptr], #16\n"
+ "str q8, [%[c_ptr]]\n"
+ "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
+ "str q16, [%[c_ptr], #16]\n"
+
+ "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
+ "str q17, [%[c_ptr], #64]\n"
+
+ "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
+ "str q18, [%[c_ptr], #112]\n"
+
+ "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
+ "str q19, [%[c_ptr], #160]\n"
+
+ "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
+ "str q20, [%[c_ptr], #208]\n"
+
+ "fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
+ "str q21, [%[c_ptr], #256]\n"
+
+ "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
+ "str q22, [%[c_ptr], #304]\n"
+
+ "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
+
+ "3:\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+ :
+ [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+ [a0] "+w" (a0), [a0a] "+w" (a0a),
+ [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+ : [oddk] "r" (oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
+ );
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__ && (FP16_KERNELS || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp
index 1ce934d413..5e5b6bd4c8 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,6 +34,7 @@ namespace arm_gemm
// Actual kernel implementations
void a64_hybrid_fp32_mla_16x4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
void a64_hybrid_fp32_mla_16x4_a55(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+void a64_hybrid_fp32_mla_16x4_x1(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
class hybrid_fp32_mla_16x4
{
@@ -83,6 +84,8 @@ public:
{
if (ci->get_cpu_model() == CPUModel::A55r1) {
kernel = a64_hybrid_fp32_mla_16x4_a55;
+ } else if (ci->get_cpu_model() == CPUModel::X1) {
+ kernel = a64_hybrid_fp32_mla_16x4_x1;
}
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp
index 5bce632bc4..1b828ee503 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,12 +61,23 @@ void a64_hybrid_fp32_mla_16x4_a55(const float *A, int lda, const float *B, float
break;
}
- for (int y=0; y<M; y+=4) {
+ int rows_to_compute;
+
+ for (int y=0; y<M; y+=rows_to_compute) {
const float * const a_ptr0_base = A + (y * lda);
const unsigned long ldab = lda * sizeof(float);
float *c_ptr0 = C + (y * ldc);
+ rows_to_compute = M-y;
+ if (rows_to_compute > 4) {
+ if (rows_to_compute % 4) {
+ rows_to_compute = 4 - 1;
+ } else {
+ rows_to_compute = 4;
+ }
+ }
+
for (int x0=0; x0<N; x0+=16ul) {
const long width = std::min((unsigned long)N-x0, 16ul);
long loops = loops_count;
@@ -90,7 +101,7 @@ void a64_hybrid_fp32_mla_16x4_a55(const float *A, int lda, const float *B, float
}
const float *biasptr = bias ? bias+x0 : nullbias;
- switch(M-y) {
+ switch(rows_to_compute) {
case 1:
__asm __volatile (
"temploadreg0 .req X0\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp
index 03f65889ea..43ff3a98dc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,12 +61,23 @@ void a64_hybrid_fp32_mla_16x4(const float *A, int lda, const float *B, float *C,
break;
}
- for (int y=0; y<M; y+=4) {
+ int rows_to_compute;
+
+ for (int y=0; y<M; y+=rows_to_compute) {
const float * const a_ptr0_base = A + (y * lda);
const unsigned long ldab = lda * sizeof(float);
float *c_ptr0 = C + (y * ldc);
+ rows_to_compute = M-y;
+ if (rows_to_compute > 4) {
+ if (rows_to_compute % 4) {
+ rows_to_compute = 4 - 1;
+ } else {
+ rows_to_compute = 4;
+ }
+ }
+
for (int x0=0; x0<N; x0+=16ul) {
const long width = std::min((unsigned long)N-x0, 16ul);
long loops = loops_count;
@@ -90,7 +101,7 @@ void a64_hybrid_fp32_mla_16x4(const float *A, int lda, const float *B, float *C,
}
const float *biasptr = bias ? bias+x0 : nullbias;
- switch(M-y) {
+ switch(rows_to_compute) {
case 1:
__asm __volatile (
"cbnz %[append], 1f\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp
new file mode 100644
index 0000000000..f4fba227d6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp
@@ -0,0 +1,1810 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <algorithm>
+
+#include "arm_gemm.hpp"
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void a64_hybrid_fp32_mla_16x4_x1(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool append) {
+ const int K_stride = K;
+ const long loops_count = ((K + 4) / 8) - 1;
+ K -= loops_count * 8;
+ const long regs_count = (K / 4) - 1;
+ K -= (regs_count + 1) * 4;
+ const long blocks_count = K / 1;
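+    // After the subtractions above, the original K satisfies
+    // K = 8*loops_count + 4*(regs_count + 1) + blocks_count.
+    // For example K=19 gives loops_count=1, regs_count=1, blocks_count=3 (8 + 8 + 3): the
+    // main loop consumes 8 values of K per pass, the regs-controlled tail consumes 8 (or
+    // 4 when regs_count is zero), and each "blocks" iteration consumes one.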
+ float nullbias[16];
+ if (!append && !bias) {
+ memset(nullbias, 0, (16 * sizeof(float)));
+ }
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ const float * const minptr = &minval;
+ const float * const maxptr = &maxval;
+
+ switch(act.type)
+ {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ minval = 0.0f;
+ break;
+ }
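+    // Resulting clamp bounds: None -> (-inf, +inf), ReLU -> (0, +inf),
+    // BoundedReLU -> (0, act.param1); the fall-through above is deliberate, and the asm
+    // applies the bounds with fmax/fmin loaded via minptr/maxptr.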
+
+ int rows_to_compute;
+
+ for (int y=0; y<M; y+=rows_to_compute) {
+ const float * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(float);
+
+ float *c_ptr0 = C + (y * ldc);
+
+ rows_to_compute = M-y;
+ if (rows_to_compute > 4) {
+ if (rows_to_compute % 4) {
+ rows_to_compute = 4 - 1;
+ } else {
+ rows_to_compute = 4;
+ }
+ }
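+        // When more than 4 rows remain and the remaining count is not a multiple of 4,
+        // a 3-row pass is taken instead of a 4-row one, which stops the trailing pass
+        // degenerating to a single row: e.g. M=9 runs as 3+3+3 rather than 4+4+1, and
+        // M=5 as 3+2 rather than 4+1.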
+
+ for (int x0=0; x0<N; x0+=16ul) {
+ const long width = std::min((unsigned long)N-x0, 16ul);
+ long loops = loops_count;
+ long regs = regs_count;
+ long blocks = blocks_count;
+ const float *a_ptr0 = a_ptr0_base;
+ const float *b_ptr0 = B + (K_stride * x0);
+ const bool use_result_buffer = (width < 16);
+ float result_buffer[64];
+ const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(float);
+ float *c_ptr_real = c_ptr0;
+ if (use_result_buffer && append) {
+ for(int cy=0; cy<std::min(M-y, 4); cy++) {
+ for(unsigned int cx=0; cx<width; cx++) {
+ result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
+ }
+ }
+ }
+ if (use_result_buffer) {
+ c_ptr0 = result_buffer;
+ }
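+            // When fewer than 16 output columns remain, the asm below still stores four
+            // full q registers per row, so it is pointed at the local 4x16 result_buffer
+            // (preloaded from C when appending) rather than at C itself; c_ptr_real keeps
+            // the true destination so the valid "width" columns can be copied back once
+            // the asm block finishes.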
+ const float *biasptr = bias ? bias+x0 : nullbias;
+
+ switch(rows_to_compute) {
+ case 1:
+ __asm __volatile (
+ "cbnz %[append], 1f\n"
+ "ldr q16, [%[biasptr]]\n"
+ "ldr q17, [%[biasptr], #0x10]\n"
+ "ldr q18, [%[biasptr], #0x20]\n"
+ "ldr q19, [%[biasptr], #0x30]\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr q11, [%[b_ptr0], #-0x10]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v16.4s, v8.4s, v0.s[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v17.4s, v9.4s, v0.s[1]\n"
+ "ldr q8, [%[b_ptr0], #0x40]\n"
+ "ldr q9, [%[b_ptr0], #0x50]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "fmla v18.4s, v10.4s, v0.s[1]\n"
+ "ldr q10, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[1]\n"
+ "ldr q11, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla v16.4s, v8.4s, v0.s[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v9.4s, v0.s[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v10.4s, v0.s[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[3]\n"
+ "ldr q11, [%[b_ptr0], #-0x10]\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v8.4s, v4.s[1]\n"
+ "ldr q8, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v9.4s, v4.s[1]\n"
+ "ldr q9, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v10.4s, v4.s[1]\n"
+ "ldr q10, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v11.4s, v4.s[1]\n"
+ "ldr q11, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v8.4s, v4.s[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v9.4s, v4.s[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v10.4s, v4.s[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v11.4s, v4.s[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q11, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "cbz %[regs], 4f\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "fmla v16.4s, v8.4s, v0.s[1]\n"
+ "ldr q8, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v9.4s, v0.s[1]\n"
+ "ldr q9, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v10.4s, v0.s[1]\n"
+ "ldr q10, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[1]\n"
+ "ldr q11, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v8.4s, v0.s[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v9.4s, v0.s[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v10.4s, v0.s[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[3]\n"
+ "ldr q11, [%[b_ptr0], #-0x10]\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v8.4s, v4.s[1]\n"
+ "ldr q8, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v9.4s, v4.s[1]\n"
+ "ldr q9, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v10.4s, v4.s[1]\n"
+ "ldr q10, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v11.4s, v4.s[1]\n"
+ "ldr q11, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "add %[b_ptr0], %[b_ptr0], #-0x40\n"
+ "fmla v16.4s, v8.4s, v4.s[3]\n"
+ "fmla v17.4s, v9.4s, v4.s[3]\n"
+ "fmla v18.4s, v10.4s, v4.s[3]\n"
+ "fmla v19.4s, v11.4s, v4.s[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v8.4s, v0.s[1]\n"
+ "ldr q8, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v9.4s, v0.s[1]\n"
+ "ldr q9, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v10.4s, v0.s[1]\n"
+ "ldr q10, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[1]\n"
+ "ldr q11, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "add %[b_ptr0], %[b_ptr0], #-0x40\n"
+ "fmla v16.4s, v8.4s, v0.s[3]\n"
+ "fmla v17.4s, v9.4s, v0.s[3]\n"
+ "fmla v18.4s, v10.4s, v0.s[3]\n"
+ "fmla v19.4s, v11.4s, v0.s[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "ld1r {v14.4s}, [%[minptr]]\n"
+ "ld1r {v15.4s}, [%[maxptr]]\n"
+ "fmax v16.4s, v16.4s, v14.4s\n"
+ "fmax v17.4s, v17.4s, v14.4s\n"
+ "fmax v18.4s, v18.4s, v14.4s\n"
+ "fmax v19.4s, v19.4s, v14.4s\n"
+ "fmin v16.4s, v16.4s, v15.4s\n"
+ "fmin v17.4s, v17.4s, v15.4s\n"
+ "fmin v18.4s, v18.4s, v15.4s\n"
+ "fmin v19.4s, v19.4s, v15.4s\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+ : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "cbnz %[append], 1f\n"
+ "ldr q16, [%[biasptr]]\n"
+ "ldr q17, [%[biasptr], #0x10]\n"
+ "ldr q18, [%[biasptr], #0x20]\n"
+ "ldr q19, [%[biasptr], #0x30]\n"
+ "mov v20.16b, v16.16b\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mov v21.16b, v17.16b\n"
+ "ldr q1, [a_ptr1]\n"
+ "mov v22.16b, v18.16b\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mov v23.16b, v19.16b\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "ldr q20, [c_ptr1]\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q1, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr q11, [%[b_ptr0], #-0x10]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v8.4s, v0.s[1]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "fmla v20.4s, v8.4s, v1.s[1]\n"
+ "ldr q8, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v9.4s, v0.s[1]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla v21.4s, v9.4s, v1.s[1]\n"
+ "ldr q9, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v10.4s, v0.s[1]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla v22.4s, v10.4s, v1.s[1]\n"
+ "ldr q10, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[1]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "fmla v23.4s, v11.4s, v1.s[1]\n"
+ "ldr q11, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v8.4s, v0.s[3]\n"
+ "fmla v20.4s, v8.4s, v1.s[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v9.4s, v0.s[3]\n"
+ "fmla v21.4s, v9.4s, v1.s[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v10.4s, v0.s[3]\n"
+ "fmla v22.4s, v10.4s, v1.s[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ "fmla v23.4s, v11.4s, v1.s[3]\n"
+ "ldr q11, [%[b_ptr0], #-0x10]\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v8.4s, v4.s[1]\n"
+ "fmla v20.4s, v8.4s, v5.s[1]\n"
+ "ldr q8, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v9.4s, v4.s[1]\n"
+ "fmla v21.4s, v9.4s, v5.s[1]\n"
+ "ldr q9, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v10.4s, v4.s[1]\n"
+ "fmla v22.4s, v10.4s, v5.s[1]\n"
+ "ldr q10, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v11.4s, v4.s[1]\n"
+ "fmla v23.4s, v11.4s, v5.s[1]\n"
+ "ldr q11, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v8.4s, v4.s[3]\n"
+ "fmla v20.4s, v8.4s, v5.s[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v9.4s, v4.s[3]\n"
+ "fmla v21.4s, v9.4s, v5.s[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v10.4s, v4.s[3]\n"
+ "fmla v22.4s, v10.4s, v5.s[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v11.4s, v4.s[3]\n"
+ "fmla v23.4s, v11.4s, v5.s[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q11, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "cbz %[regs], 4f\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v8.4s, v0.s[1]\n"
+ "fmla v20.4s, v8.4s, v1.s[1]\n"
+ "ldr q8, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v9.4s, v0.s[1]\n"
+ "fmla v21.4s, v9.4s, v1.s[1]\n"
+ "ldr q9, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v10.4s, v0.s[1]\n"
+ "fmla v22.4s, v10.4s, v1.s[1]\n"
+ "ldr q10, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[1]\n"
+ "fmla v23.4s, v11.4s, v1.s[1]\n"
+ "ldr q11, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v8.4s, v0.s[3]\n"
+ "fmla v20.4s, v8.4s, v1.s[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v9.4s, v0.s[3]\n"
+ "fmla v21.4s, v9.4s, v1.s[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v10.4s, v0.s[3]\n"
+ "fmla v22.4s, v10.4s, v1.s[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[3]\n"
+ "fmla v23.4s, v11.4s, v1.s[3]\n"
+ "ldr q11, [%[b_ptr0], #-0x10]\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v8.4s, v4.s[1]\n"
+ "fmla v20.4s, v8.4s, v5.s[1]\n"
+ "ldr q8, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v9.4s, v4.s[1]\n"
+ "fmla v21.4s, v9.4s, v5.s[1]\n"
+ "ldr q9, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v10.4s, v4.s[1]\n"
+ "fmla v22.4s, v10.4s, v5.s[1]\n"
+ "ldr q10, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v11.4s, v4.s[1]\n"
+ "fmla v23.4s, v11.4s, v5.s[1]\n"
+ "ldr q11, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v8.4s, v4.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #-0x40\n"
+ "fmla v20.4s, v8.4s, v5.s[3]\n"
+ "fmla v17.4s, v9.4s, v4.s[3]\n"
+ "fmla v21.4s, v9.4s, v5.s[3]\n"
+ "fmla v18.4s, v10.4s, v4.s[3]\n"
+ "fmla v22.4s, v10.4s, v5.s[3]\n"
+ "fmla v19.4s, v11.4s, v4.s[3]\n"
+ "fmla v23.4s, v11.4s, v5.s[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v8.4s, v0.s[1]\n"
+ "fmla v20.4s, v8.4s, v1.s[1]\n"
+ "ldr q8, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v9.4s, v0.s[1]\n"
+ "fmla v21.4s, v9.4s, v1.s[1]\n"
+ "ldr q9, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v10.4s, v0.s[1]\n"
+ "fmla v22.4s, v10.4s, v1.s[1]\n"
+ "ldr q10, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[1]\n"
+ "fmla v23.4s, v11.4s, v1.s[1]\n"
+ "ldr q11, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v8.4s, v0.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #-0x40\n"
+ "fmla v20.4s, v8.4s, v1.s[3]\n"
+ "fmla v17.4s, v9.4s, v0.s[3]\n"
+ "fmla v21.4s, v9.4s, v1.s[3]\n"
+ "fmla v18.4s, v10.4s, v0.s[3]\n"
+ "fmla v22.4s, v10.4s, v1.s[3]\n"
+ "fmla v19.4s, v11.4s, v0.s[3]\n"
+ "fmla v23.4s, v11.4s, v1.s[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "ld1r {v14.4s}, [%[minptr]]\n"
+ "ld1r {v15.4s}, [%[maxptr]]\n"
+ "fmax v16.4s, v16.4s, v14.4s\n"
+ "fmax v17.4s, v17.4s, v14.4s\n"
+ "fmax v18.4s, v18.4s, v14.4s\n"
+ "fmax v19.4s, v19.4s, v14.4s\n"
+ "fmin v16.4s, v16.4s, v15.4s\n"
+ "fmin v17.4s, v17.4s, v15.4s\n"
+ "fmin v18.4s, v18.4s, v15.4s\n"
+ "fmin v19.4s, v19.4s, v15.4s\n"
+ "str q16, [%[c_ptr0]]\n"
+ "fmax v20.4s, v20.4s, v14.4s\n"
+ "fmax v21.4s, v21.4s, v14.4s\n"
+ "fmax v22.4s, v22.4s, v14.4s\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "fmax v23.4s, v23.4s, v14.4s\n"
+ "fmin v20.4s, v20.4s, v15.4s\n"
+ "fmin v21.4s, v21.4s, v15.4s\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "fmin v22.4s, v22.4s, v15.4s\n"
+ "fmin v23.4s, v23.4s, v15.4s\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+ : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
+ );
+ break;
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "cbnz %[append], 1f\n"
+ "ldr q16, [%[biasptr]]\n"
+ "ldr q17, [%[biasptr], #0x10]\n"
+ "ldr q18, [%[biasptr], #0x20]\n"
+ "ldr q19, [%[biasptr], #0x30]\n"
+ "mov v20.16b, v16.16b\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mov v21.16b, v17.16b\n"
+ "ldr q1, [a_ptr1]\n"
+ "mov v22.16b, v18.16b\n"
+ "ldr q2, [a_ptr2]\n"
+ "mov v23.16b, v19.16b\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mov v24.16b, v16.16b\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "mov v25.16b, v17.16b\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "mov v26.16b, v18.16b\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov v27.16b, v19.16b\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "ldr q20, [c_ptr1]\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "ldr q24, [c_ptr2]\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q1, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q2, [a_ptr2]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr q11, [%[b_ptr0], #-0x10]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v8.4s, v0.s[1]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "fmla v20.4s, v8.4s, v1.s[1]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "fmla v24.4s, v8.4s, v2.s[1]\n"
+ "ldr q8, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v9.4s, v0.s[1]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ "fmla v21.4s, v9.4s, v1.s[1]\n"
+ "fmla v25.4s, v9.4s, v2.s[1]\n"
+ "ldr q9, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v10.4s, v0.s[1]\n"
+ "fmla v22.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v2.s[1]\n"
+ "ldr q10, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[1]\n"
+ "fmla v23.4s, v11.4s, v1.s[1]\n"
+ "fmla v27.4s, v11.4s, v2.s[1]\n"
+ "ldr q11, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v8.4s, v0.s[3]\n"
+ "fmla v20.4s, v8.4s, v1.s[3]\n"
+ "fmla v24.4s, v8.4s, v2.s[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v9.4s, v0.s[3]\n"
+ "fmla v21.4s, v9.4s, v1.s[3]\n"
+ "fmla v25.4s, v9.4s, v2.s[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v10.4s, v0.s[3]\n"
+ "fmla v22.4s, v10.4s, v1.s[3]\n"
+ "fmla v26.4s, v10.4s, v2.s[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ "fmla v23.4s, v11.4s, v1.s[3]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ "fmla v27.4s, v11.4s, v2.s[3]\n"
+ "ldr q11, [%[b_ptr0], #-0x10]\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ldr q2, [a_ptr2, #-0x10]\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "fmla v24.4s, v8.4s, v6.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "fmla v26.4s, v10.4s, v6.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "fmla v27.4s, v11.4s, v6.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v8.4s, v4.s[1]\n"
+ "fmla v20.4s, v8.4s, v5.s[1]\n"
+ "fmla v24.4s, v8.4s, v6.s[1]\n"
+ "ldr q8, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v9.4s, v4.s[1]\n"
+ "fmla v21.4s, v9.4s, v5.s[1]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "ldr q9, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v10.4s, v4.s[1]\n"
+ "fmla v22.4s, v10.4s, v5.s[1]\n"
+ "fmla v26.4s, v10.4s, v6.s[1]\n"
+ "ldr q10, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v11.4s, v4.s[1]\n"
+ "fmla v23.4s, v11.4s, v5.s[1]\n"
+ "fmla v27.4s, v11.4s, v6.s[1]\n"
+ "ldr q11, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "fmla v24.4s, v8.4s, v6.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "fmla v25.4s, v9.4s, v6.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "fmla v26.4s, v10.4s, v6.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "fmla v27.4s, v11.4s, v6.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v8.4s, v4.s[3]\n"
+ "fmla v20.4s, v8.4s, v5.s[3]\n"
+ "fmla v24.4s, v8.4s, v6.s[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v9.4s, v4.s[3]\n"
+ "fmla v21.4s, v9.4s, v5.s[3]\n"
+ "fmla v25.4s, v9.4s, v6.s[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v10.4s, v4.s[3]\n"
+ "fmla v22.4s, v10.4s, v5.s[3]\n"
+ "fmla v26.4s, v10.4s, v6.s[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v11.4s, v4.s[3]\n"
+ "fmla v23.4s, v11.4s, v5.s[3]\n"
+ "fmla v27.4s, v11.4s, v6.s[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q11, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "cbz %[regs], 4f\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v8.4s, v0.s[1]\n"
+ "fmla v20.4s, v8.4s, v1.s[1]\n"
+ "fmla v24.4s, v8.4s, v2.s[1]\n"
+ "ldr q8, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v9.4s, v0.s[1]\n"
+ "fmla v21.4s, v9.4s, v1.s[1]\n"
+ "fmla v25.4s, v9.4s, v2.s[1]\n"
+ "ldr q9, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v10.4s, v0.s[1]\n"
+ "fmla v22.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v2.s[1]\n"
+ "ldr q10, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[1]\n"
+ "fmla v23.4s, v11.4s, v1.s[1]\n"
+ "fmla v27.4s, v11.4s, v2.s[1]\n"
+ "ldr q11, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v8.4s, v0.s[3]\n"
+ "fmla v20.4s, v8.4s, v1.s[3]\n"
+ "fmla v24.4s, v8.4s, v2.s[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v9.4s, v0.s[3]\n"
+ "fmla v21.4s, v9.4s, v1.s[3]\n"
+ "fmla v25.4s, v9.4s, v2.s[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v10.4s, v0.s[3]\n"
+ "fmla v22.4s, v10.4s, v1.s[3]\n"
+ "fmla v26.4s, v10.4s, v2.s[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[3]\n"
+ "fmla v23.4s, v11.4s, v1.s[3]\n"
+ "fmla v27.4s, v11.4s, v2.s[3]\n"
+ "ldr q11, [%[b_ptr0], #-0x10]\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "fmla v24.4s, v8.4s, v6.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "fmla v26.4s, v10.4s, v6.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "fmla v27.4s, v11.4s, v6.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v8.4s, v4.s[1]\n"
+ "fmla v20.4s, v8.4s, v5.s[1]\n"
+ "fmla v24.4s, v8.4s, v6.s[1]\n"
+ "ldr q8, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v9.4s, v4.s[1]\n"
+ "fmla v21.4s, v9.4s, v5.s[1]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "ldr q9, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v10.4s, v4.s[1]\n"
+ "fmla v22.4s, v10.4s, v5.s[1]\n"
+ "fmla v26.4s, v10.4s, v6.s[1]\n"
+ "ldr q10, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v11.4s, v4.s[1]\n"
+ "fmla v23.4s, v11.4s, v5.s[1]\n"
+ "fmla v27.4s, v11.4s, v6.s[1]\n"
+ "ldr q11, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "fmla v24.4s, v8.4s, v6.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "fmla v25.4s, v9.4s, v6.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "fmla v26.4s, v10.4s, v6.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "fmla v27.4s, v11.4s, v6.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v8.4s, v4.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #-0x40\n"
+ "fmla v20.4s, v8.4s, v5.s[3]\n"
+ "fmla v24.4s, v8.4s, v6.s[3]\n"
+ "fmla v17.4s, v9.4s, v4.s[3]\n"
+ "fmla v21.4s, v9.4s, v5.s[3]\n"
+ "fmla v25.4s, v9.4s, v6.s[3]\n"
+ "fmla v18.4s, v10.4s, v4.s[3]\n"
+ "fmla v22.4s, v10.4s, v5.s[3]\n"
+ "fmla v26.4s, v10.4s, v6.s[3]\n"
+ "fmla v19.4s, v11.4s, v4.s[3]\n"
+ "fmla v23.4s, v11.4s, v5.s[3]\n"
+ "fmla v27.4s, v11.4s, v6.s[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v8.4s, v0.s[1]\n"
+ "fmla v20.4s, v8.4s, v1.s[1]\n"
+ "fmla v24.4s, v8.4s, v2.s[1]\n"
+ "ldr q8, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v9.4s, v0.s[1]\n"
+ "fmla v21.4s, v9.4s, v1.s[1]\n"
+ "fmla v25.4s, v9.4s, v2.s[1]\n"
+ "ldr q9, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v10.4s, v0.s[1]\n"
+ "fmla v22.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v2.s[1]\n"
+ "ldr q10, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[1]\n"
+ "fmla v23.4s, v11.4s, v1.s[1]\n"
+ "fmla v27.4s, v11.4s, v2.s[1]\n"
+ "ldr q11, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v8.4s, v0.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #-0x40\n"
+ "fmla v20.4s, v8.4s, v1.s[3]\n"
+ "fmla v24.4s, v8.4s, v2.s[3]\n"
+ "fmla v17.4s, v9.4s, v0.s[3]\n"
+ "fmla v21.4s, v9.4s, v1.s[3]\n"
+ "fmla v25.4s, v9.4s, v2.s[3]\n"
+ "fmla v18.4s, v10.4s, v0.s[3]\n"
+ "fmla v22.4s, v10.4s, v1.s[3]\n"
+ "fmla v26.4s, v10.4s, v2.s[3]\n"
+ "fmla v19.4s, v11.4s, v0.s[3]\n"
+ "fmla v23.4s, v11.4s, v1.s[3]\n"
+ "fmla v27.4s, v11.4s, v2.s[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr s2, [a_ptr2]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x4\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "ld1r {v14.4s}, [%[minptr]]\n"
+ "ld1r {v15.4s}, [%[maxptr]]\n"
+ "fmax v16.4s, v16.4s, v14.4s\n"
+ "fmax v17.4s, v17.4s, v14.4s\n"
+ "fmax v18.4s, v18.4s, v14.4s\n"
+ "fmax v19.4s, v19.4s, v14.4s\n"
+ "fmin v16.4s, v16.4s, v15.4s\n"
+ "fmin v17.4s, v17.4s, v15.4s\n"
+ "fmin v18.4s, v18.4s, v15.4s\n"
+ "fmin v19.4s, v19.4s, v15.4s\n"
+ "str q16, [%[c_ptr0]]\n"
+ "fmax v20.4s, v20.4s, v14.4s\n"
+ "fmax v21.4s, v21.4s, v14.4s\n"
+ "fmax v22.4s, v22.4s, v14.4s\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "fmax v23.4s, v23.4s, v14.4s\n"
+ "fmin v20.4s, v20.4s, v15.4s\n"
+ "fmin v21.4s, v21.4s, v15.4s\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "fmin v22.4s, v22.4s, v15.4s\n"
+ "fmin v23.4s, v23.4s, v15.4s\n"
+ "fmax v24.4s, v24.4s, v14.4s\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "fmax v25.4s, v25.4s, v14.4s\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "fmax v26.4s, v26.4s, v14.4s\n"
+ "str q20, [c_ptr1]\n"
+ "fmin v24.4s, v24.4s, v15.4s\n"
+ "fmin v25.4s, v25.4s, v15.4s\n"
+ "fmax v27.4s, v27.4s, v14.4s\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "fmin v26.4s, v26.4s, v15.4s\n"
+ "fmin v27.4s, v27.4s, v15.4s\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+ : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ default:
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbnz %[append], 1f\n"
+ "ldr q16, [%[biasptr]]\n"
+ "ldr q17, [%[biasptr], #0x10]\n"
+ "ldr q18, [%[biasptr], #0x20]\n"
+ "ldr q19, [%[biasptr], #0x30]\n"
+ "mov v20.16b, v16.16b\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mov v21.16b, v17.16b\n"
+ "ldr q1, [a_ptr1]\n"
+ "mov v22.16b, v18.16b\n"
+ "ldr q2, [a_ptr2]\n"
+ "mov v23.16b, v19.16b\n"
+ "ldr q3, [a_ptr3]\n"
+ "mov v24.16b, v16.16b\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mov v25.16b, v17.16b\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "mov v26.16b, v18.16b\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "mov v27.16b, v19.16b\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov v28.16b, v16.16b\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "mov v29.16b, v17.16b\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "mov v30.16b, v18.16b\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "mov v31.16b, v19.16b\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "ldr q20, [c_ptr1]\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "ldr q24, [c_ptr2]\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "ldr q28, [c_ptr3]\n"
+ "ldr q29, [c_ptr3, #0x10]\n"
+ "ldr q30, [c_ptr3, #0x20]\n"
+ "ldr q31, [c_ptr3, #0x30]\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q1, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q2, [a_ptr2]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q3, [a_ptr3]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr q11, [%[b_ptr0], #-0x10]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ "fmla v28.4s, v8.4s, v3.s[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr q7, [a_ptr3]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla v29.4s, v9.4s, v3.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla v30.4s, v10.4s, v3.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "fmla v31.4s, v11.4s, v3.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v8.4s, v0.s[1]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ "fmla v20.4s, v8.4s, v1.s[1]\n"
+ "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+ "fmla v24.4s, v8.4s, v2.s[1]\n"
+ "fmla v28.4s, v8.4s, v3.s[1]\n"
+ "ldr q8, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v9.4s, v0.s[1]\n"
+ "fmla v21.4s, v9.4s, v1.s[1]\n"
+ "fmla v25.4s, v9.4s, v2.s[1]\n"
+ "fmla v29.4s, v9.4s, v3.s[1]\n"
+ "ldr q9, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v10.4s, v0.s[1]\n"
+ "fmla v22.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v2.s[1]\n"
+ "fmla v30.4s, v10.4s, v3.s[1]\n"
+ "ldr q10, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[1]\n"
+ "fmla v23.4s, v11.4s, v1.s[1]\n"
+ "fmla v27.4s, v11.4s, v2.s[1]\n"
+ "fmla v31.4s, v11.4s, v3.s[1]\n"
+ "ldr q11, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "fmla v28.4s, v8.4s, v3.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "fmla v29.4s, v9.4s, v3.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v30.4s, v10.4s, v3.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "fmla v31.4s, v11.4s, v3.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v8.4s, v0.s[3]\n"
+ "fmla v20.4s, v8.4s, v1.s[3]\n"
+ "fmla v24.4s, v8.4s, v2.s[3]\n"
+ "fmla v28.4s, v8.4s, v3.s[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v9.4s, v0.s[3]\n"
+ "fmla v21.4s, v9.4s, v1.s[3]\n"
+ "fmla v25.4s, v9.4s, v2.s[3]\n"
+ "fmla v29.4s, v9.4s, v3.s[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v10.4s, v0.s[3]\n"
+ "fmla v22.4s, v10.4s, v1.s[3]\n"
+ "fmla v26.4s, v10.4s, v2.s[3]\n"
+ "fmla v30.4s, v10.4s, v3.s[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ "fmla v23.4s, v11.4s, v1.s[3]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ "fmla v27.4s, v11.4s, v2.s[3]\n"
+ "ldr q2, [a_ptr2, #-0x10]\n"
+ "fmla v31.4s, v11.4s, v3.s[3]\n"
+ "ldr q11, [%[b_ptr0], #-0x10]\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ldr q3, [a_ptr3, #-0x10]\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "fmla v24.4s, v8.4s, v6.s[0]\n"
+ "fmla v28.4s, v8.4s, v7.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[0]\n"
+ "fmla v29.4s, v9.4s, v7.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "fmla v26.4s, v10.4s, v6.s[0]\n"
+ "fmla v30.4s, v10.4s, v7.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "fmla v27.4s, v11.4s, v6.s[0]\n"
+ "fmla v31.4s, v11.4s, v7.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v8.4s, v4.s[1]\n"
+ "fmla v20.4s, v8.4s, v5.s[1]\n"
+ "fmla v24.4s, v8.4s, v6.s[1]\n"
+ "fmla v28.4s, v8.4s, v7.s[1]\n"
+ "ldr q8, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v9.4s, v4.s[1]\n"
+ "fmla v21.4s, v9.4s, v5.s[1]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "fmla v29.4s, v9.4s, v7.s[1]\n"
+ "ldr q9, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v10.4s, v4.s[1]\n"
+ "fmla v22.4s, v10.4s, v5.s[1]\n"
+ "fmla v26.4s, v10.4s, v6.s[1]\n"
+ "fmla v30.4s, v10.4s, v7.s[1]\n"
+ "ldr q10, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v11.4s, v4.s[1]\n"
+ "fmla v23.4s, v11.4s, v5.s[1]\n"
+ "fmla v27.4s, v11.4s, v6.s[1]\n"
+ "fmla v31.4s, v11.4s, v7.s[1]\n"
+ "ldr q11, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "fmla v24.4s, v8.4s, v6.s[2]\n"
+ "fmla v28.4s, v8.4s, v7.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "fmla v25.4s, v9.4s, v6.s[2]\n"
+ "fmla v29.4s, v9.4s, v7.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "fmla v26.4s, v10.4s, v6.s[2]\n"
+ "fmla v30.4s, v10.4s, v7.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "fmla v27.4s, v11.4s, v6.s[2]\n"
+ "fmla v31.4s, v11.4s, v7.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v8.4s, v4.s[3]\n"
+ "fmla v20.4s, v8.4s, v5.s[3]\n"
+ "fmla v24.4s, v8.4s, v6.s[3]\n"
+ "fmla v28.4s, v8.4s, v7.s[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v9.4s, v4.s[3]\n"
+ "fmla v21.4s, v9.4s, v5.s[3]\n"
+ "fmla v25.4s, v9.4s, v6.s[3]\n"
+ "fmla v29.4s, v9.4s, v7.s[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v10.4s, v4.s[3]\n"
+ "fmla v22.4s, v10.4s, v5.s[3]\n"
+ "fmla v26.4s, v10.4s, v6.s[3]\n"
+ "fmla v30.4s, v10.4s, v7.s[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v11.4s, v4.s[3]\n"
+ "fmla v23.4s, v11.4s, v5.s[3]\n"
+ "fmla v27.4s, v11.4s, v6.s[3]\n"
+ "fmla v31.4s, v11.4s, v7.s[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q11, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "prfm PSTL1KEEP, [c_ptr3]\n"
+ "cbz %[regs], 4f\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ "fmla v28.4s, v8.4s, v3.s[0]\n"
+ "ldr q7, [a_ptr3]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "fmla v29.4s, v9.4s, v3.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "fmla v30.4s, v10.4s, v3.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "fmla v31.4s, v11.4s, v3.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v8.4s, v0.s[1]\n"
+ "fmla v20.4s, v8.4s, v1.s[1]\n"
+ "fmla v24.4s, v8.4s, v2.s[1]\n"
+ "fmla v28.4s, v8.4s, v3.s[1]\n"
+ "ldr q8, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v9.4s, v0.s[1]\n"
+ "fmla v21.4s, v9.4s, v1.s[1]\n"
+ "fmla v25.4s, v9.4s, v2.s[1]\n"
+ "fmla v29.4s, v9.4s, v3.s[1]\n"
+ "ldr q9, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v10.4s, v0.s[1]\n"
+ "fmla v22.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v2.s[1]\n"
+ "fmla v30.4s, v10.4s, v3.s[1]\n"
+ "ldr q10, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[1]\n"
+ "fmla v23.4s, v11.4s, v1.s[1]\n"
+ "fmla v27.4s, v11.4s, v2.s[1]\n"
+ "fmla v31.4s, v11.4s, v3.s[1]\n"
+ "ldr q11, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "fmla v28.4s, v8.4s, v3.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "fmla v29.4s, v9.4s, v3.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v30.4s, v10.4s, v3.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "fmla v31.4s, v11.4s, v3.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v8.4s, v0.s[3]\n"
+ "fmla v20.4s, v8.4s, v1.s[3]\n"
+ "fmla v24.4s, v8.4s, v2.s[3]\n"
+ "fmla v28.4s, v8.4s, v3.s[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v9.4s, v0.s[3]\n"
+ "fmla v21.4s, v9.4s, v1.s[3]\n"
+ "fmla v25.4s, v9.4s, v2.s[3]\n"
+ "fmla v29.4s, v9.4s, v3.s[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v10.4s, v0.s[3]\n"
+ "fmla v22.4s, v10.4s, v1.s[3]\n"
+ "fmla v26.4s, v10.4s, v2.s[3]\n"
+ "fmla v30.4s, v10.4s, v3.s[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[3]\n"
+ "fmla v23.4s, v11.4s, v1.s[3]\n"
+ "fmla v27.4s, v11.4s, v2.s[3]\n"
+ "fmla v31.4s, v11.4s, v3.s[3]\n"
+ "ldr q11, [%[b_ptr0], #-0x10]\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "fmla v24.4s, v8.4s, v6.s[0]\n"
+ "fmla v28.4s, v8.4s, v7.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[0]\n"
+ "fmla v29.4s, v9.4s, v7.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "fmla v26.4s, v10.4s, v6.s[0]\n"
+ "fmla v30.4s, v10.4s, v7.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "fmla v27.4s, v11.4s, v6.s[0]\n"
+ "fmla v31.4s, v11.4s, v7.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v8.4s, v4.s[1]\n"
+ "fmla v20.4s, v8.4s, v5.s[1]\n"
+ "fmla v24.4s, v8.4s, v6.s[1]\n"
+ "fmla v28.4s, v8.4s, v7.s[1]\n"
+ "ldr q8, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v9.4s, v4.s[1]\n"
+ "fmla v21.4s, v9.4s, v5.s[1]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "fmla v29.4s, v9.4s, v7.s[1]\n"
+ "ldr q9, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v10.4s, v4.s[1]\n"
+ "fmla v22.4s, v10.4s, v5.s[1]\n"
+ "fmla v26.4s, v10.4s, v6.s[1]\n"
+ "fmla v30.4s, v10.4s, v7.s[1]\n"
+ "ldr q10, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v11.4s, v4.s[1]\n"
+ "fmla v23.4s, v11.4s, v5.s[1]\n"
+ "fmla v27.4s, v11.4s, v6.s[1]\n"
+ "fmla v31.4s, v11.4s, v7.s[1]\n"
+ "ldr q11, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "fmla v24.4s, v8.4s, v6.s[2]\n"
+ "fmla v28.4s, v8.4s, v7.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "fmla v25.4s, v9.4s, v6.s[2]\n"
+ "fmla v29.4s, v9.4s, v7.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "fmla v26.4s, v10.4s, v6.s[2]\n"
+ "fmla v30.4s, v10.4s, v7.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "fmla v27.4s, v11.4s, v6.s[2]\n"
+ "fmla v31.4s, v11.4s, v7.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v8.4s, v4.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #-0x40\n"
+ "fmla v20.4s, v8.4s, v5.s[3]\n"
+ "fmla v24.4s, v8.4s, v6.s[3]\n"
+ "fmla v28.4s, v8.4s, v7.s[3]\n"
+ "fmla v17.4s, v9.4s, v4.s[3]\n"
+ "fmla v21.4s, v9.4s, v5.s[3]\n"
+ "fmla v25.4s, v9.4s, v6.s[3]\n"
+ "fmla v29.4s, v9.4s, v7.s[3]\n"
+ "fmla v18.4s, v10.4s, v4.s[3]\n"
+ "fmla v22.4s, v10.4s, v5.s[3]\n"
+ "fmla v26.4s, v10.4s, v6.s[3]\n"
+ "fmla v30.4s, v10.4s, v7.s[3]\n"
+ "fmla v19.4s, v11.4s, v4.s[3]\n"
+ "fmla v23.4s, v11.4s, v5.s[3]\n"
+ "fmla v27.4s, v11.4s, v6.s[3]\n"
+ "fmla v31.4s, v11.4s, v7.s[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "fmla v28.4s, v8.4s, v3.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "fmla v29.4s, v9.4s, v3.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "fmla v30.4s, v10.4s, v3.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "fmla v31.4s, v11.4s, v3.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v8.4s, v0.s[1]\n"
+ "fmla v20.4s, v8.4s, v1.s[1]\n"
+ "fmla v24.4s, v8.4s, v2.s[1]\n"
+ "fmla v28.4s, v8.4s, v3.s[1]\n"
+ "ldr q8, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v9.4s, v0.s[1]\n"
+ "fmla v21.4s, v9.4s, v1.s[1]\n"
+ "fmla v25.4s, v9.4s, v2.s[1]\n"
+ "fmla v29.4s, v9.4s, v3.s[1]\n"
+ "ldr q9, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v10.4s, v0.s[1]\n"
+ "fmla v22.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v2.s[1]\n"
+ "fmla v30.4s, v10.4s, v3.s[1]\n"
+ "ldr q10, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[1]\n"
+ "fmla v23.4s, v11.4s, v1.s[1]\n"
+ "fmla v27.4s, v11.4s, v2.s[1]\n"
+ "fmla v31.4s, v11.4s, v3.s[1]\n"
+ "ldr q11, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "fmla v28.4s, v8.4s, v3.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "fmla v29.4s, v9.4s, v3.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v30.4s, v10.4s, v3.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "fmla v31.4s, v11.4s, v3.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v8.4s, v0.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #-0x40\n"
+ "fmla v20.4s, v8.4s, v1.s[3]\n"
+ "fmla v24.4s, v8.4s, v2.s[3]\n"
+ "fmla v28.4s, v8.4s, v3.s[3]\n"
+ "fmla v17.4s, v9.4s, v0.s[3]\n"
+ "fmla v21.4s, v9.4s, v1.s[3]\n"
+ "fmla v25.4s, v9.4s, v2.s[3]\n"
+ "fmla v29.4s, v9.4s, v3.s[3]\n"
+ "fmla v18.4s, v10.4s, v0.s[3]\n"
+ "fmla v22.4s, v10.4s, v1.s[3]\n"
+ "fmla v26.4s, v10.4s, v2.s[3]\n"
+ "fmla v30.4s, v10.4s, v3.s[3]\n"
+ "fmla v19.4s, v11.4s, v0.s[3]\n"
+ "fmla v23.4s, v11.4s, v1.s[3]\n"
+ "fmla v27.4s, v11.4s, v2.s[3]\n"
+ "fmla v31.4s, v11.4s, v3.s[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr s2, [a_ptr2]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x4\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr s3, [a_ptr3]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "add a_ptr3, a_ptr3, #0x4\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "fmla v28.4s, v8.4s, v3.s[0]\n"
+ "fmla v29.4s, v9.4s, v3.s[0]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "fmla v30.4s, v10.4s, v3.s[0]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "fmla v31.4s, v11.4s, v3.s[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "ld1r {v14.4s}, [%[minptr]]\n"
+ "ld1r {v15.4s}, [%[maxptr]]\n"
+ "fmax v16.4s, v16.4s, v14.4s\n"
+ "fmax v17.4s, v17.4s, v14.4s\n"
+ "fmax v18.4s, v18.4s, v14.4s\n"
+ "fmax v19.4s, v19.4s, v14.4s\n"
+ "fmin v16.4s, v16.4s, v15.4s\n"
+ "fmin v17.4s, v17.4s, v15.4s\n"
+ "fmin v18.4s, v18.4s, v15.4s\n"
+ "fmin v19.4s, v19.4s, v15.4s\n"
+ "str q16, [%[c_ptr0]]\n"
+ "fmax v20.4s, v20.4s, v14.4s\n"
+ "fmax v21.4s, v21.4s, v14.4s\n"
+ "fmax v22.4s, v22.4s, v14.4s\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "fmax v23.4s, v23.4s, v14.4s\n"
+ "fmin v20.4s, v20.4s, v15.4s\n"
+ "fmin v21.4s, v21.4s, v15.4s\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "fmin v22.4s, v22.4s, v15.4s\n"
+ "fmin v23.4s, v23.4s, v15.4s\n"
+ "fmax v24.4s, v24.4s, v14.4s\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "fmax v25.4s, v25.4s, v14.4s\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "fmax v26.4s, v26.4s, v14.4s\n"
+ "str q20, [c_ptr1]\n"
+ "fmin v24.4s, v24.4s, v15.4s\n"
+ "fmin v25.4s, v25.4s, v15.4s\n"
+ "fmax v27.4s, v27.4s, v14.4s\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "fmin v26.4s, v26.4s, v15.4s\n"
+ "fmax v28.4s, v28.4s, v14.4s\n"
+ "fmax v29.4s, v29.4s, v14.4s\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "fmin v27.4s, v27.4s, v15.4s\n"
+ "fmax v30.4s, v30.4s, v14.4s\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "fmax v31.4s, v31.4s, v14.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "str q24, [c_ptr2]\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ "str q28, [c_ptr3]\n"
+ "str q29, [c_ptr3, #0x10]\n"
+ "str q30, [c_ptr3, #0x20]\n"
+ "str q31, [c_ptr3, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+ : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ }
+ if (use_result_buffer) {
+ for(int cy=0; cy<std::min(M-y, 4); cy++) {
+ for(unsigned int cx=0; cx<width; cx++) {
+ c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
+ }
+ }
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
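
The x1.cpp kernel above always stores a full 4x16 accumulator tile, so when fewer than 16 output columns remain the stores are redirected to a scratch result buffer and only the valid columns are copied back to C afterwards (the copy loop at the end of the file). A minimal standalone sketch of that copy-back, assuming the same result_buffer/width/ldc/c_ptr_real roles as in the kernel:

    // Hypothetical standalone version of the copy-back shown above:
    // move the valid (rows x width) sub-tile out of a 4x16 scratch buffer.
    static void copy_valid_columns(float *dst, const float *scratch,
                                   int rows, unsigned int width, int ldc) {
        for (int cy = 0; cy < rows; cy++) {
            for (unsigned int cx = 0; cx < width; cx++) {
                dst[cy * ldc + cx] = scratch[cy * 16 + cx];
            }
        }
    }
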
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8.hpp
index da5beef48c..d11a945d27 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -78,9 +78,9 @@ public:
// Default to the generic kernel
kern_type kernel=a64_hybrid_fp32_mla_4x8;
- hybrid_fp32_mla_4x8(const CPUInfo *ci)
+ hybrid_fp32_mla_4x8(const CPUInfo *)
{
- UNUSED(ci);
+
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp
index db7eb83160..731230364d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,12 +61,23 @@ void a64_hybrid_fp32_mla_4x8(const float *A, int lda, const float *B, float *C,
break;
}
- for (int y=0; y<M; y+=8) {
+ int rows_to_compute;
+
+ for (int y=0; y<M; y+=rows_to_compute) {
const float * const a_ptr0_base = A + (y * lda);
const unsigned long ldab = lda * sizeof(float);
float *c_ptr0 = C + (y * ldc);
+ rows_to_compute = M-y;
+ if (rows_to_compute > 8) {
+ if (rows_to_compute % 8) {
+ rows_to_compute = 8 - 1;
+ } else {
+ rows_to_compute = 8;
+ }
+ }
+
for (int x0=0; x0<N; x0+=4ul) {
const long width = std::min((unsigned long)N-x0, 4ul);
long loops = loops_count;
@@ -90,7 +101,7 @@ void a64_hybrid_fp32_mla_4x8(const float *A, int lda, const float *B, float *C,
}
const float *biasptr = bias ? bias+x0 : nullbias;
- switch(M-y) {
+ switch(rows_to_compute) {
case 1:
__asm __volatile (
"ldr q24, [%[biasptr]]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp
index bdc62ea181..4a9f7985b7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
index 7c08aa2165..6c7e89559c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,9 +32,7 @@
namespace arm_gemm {
-void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *bias, Activation act, bool append) {
- UNUSED(bias);
- UNUSED(act);
+void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation , bool append) {
const int K_stride = ((K + 3) / 4) * 4;
const long loops_count = ((K + 16) / 32) - 1;
K -= loops_count * 32;
@@ -43,12 +41,23 @@ void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *A, int lda, const int8_t *B, in
const long blocks_count = K / 4;
const long odds_count = K - (blocks_count * 4);
- for (int y=0; y<M; y+=4) {
+ int rows_to_compute;
+
+ for (int y=0; y<M; y+=rows_to_compute) {
const int8_t * const a_ptr0_base = A + (y * lda);
const unsigned long ldab = lda * sizeof(int8_t);
int32_t *c_ptr0 = C + (y * ldc);
+ rows_to_compute = M-y;
+ if (rows_to_compute > 4) {
+ if (rows_to_compute % 4) {
+ rows_to_compute = 4 - 1;
+ } else {
+ rows_to_compute = 4;
+ }
+ }
+
for (int x0=0; x0<N; x0+=16ul) {
const long width = std::min((unsigned long)N-x0, 16ul);
long loops = loops_count;
@@ -72,7 +81,7 @@ void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *A, int lda, const int8_t *B, in
c_ptr0 = result_buffer;
}
- switch(M-y) {
+ switch(rows_to_compute) {
case 1:
__asm __volatile (
"temploadreg0 .req X0\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
index 9f06a48ff5..797ab74498 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,9 +32,7 @@
namespace arm_gemm {
-void a64_hybrid_s8s32_dot_16x4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *bias, Activation act, bool append) {
- UNUSED(bias);
- UNUSED(act);
+void a64_hybrid_s8s32_dot_16x4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation , bool append) {
const int K_stride = ((K + 3) / 4) * 4;
const long loops_count = ((K + 16) / 32) - 1;
K -= loops_count * 32;
@@ -43,12 +41,23 @@ void a64_hybrid_s8s32_dot_16x4(const int8_t *A, int lda, const int8_t *B, int32_
const long blocks_count = K / 4;
const long odds_count = K - (blocks_count * 4);
- for (int y=0; y<M; y+=4) {
+ int rows_to_compute;
+
+ for (int y=0; y<M; y+=rows_to_compute) {
const int8_t * const a_ptr0_base = A + (y * lda);
const unsigned long ldab = lda * sizeof(int8_t);
int32_t *c_ptr0 = C + (y * ldc);
+ rows_to_compute = M-y;
+ if (rows_to_compute > 4) {
+ if (rows_to_compute % 4) {
+ rows_to_compute = 4 - 1;
+ } else {
+ rows_to_compute = 4;
+ }
+ }
+
for (int x0=0; x0<N; x0+=16ul) {
const long width = std::min((unsigned long)N-x0, 16ul);
long loops = loops_count;
@@ -72,7 +81,7 @@ void a64_hybrid_s8s32_dot_16x4(const int8_t *A, int lda, const int8_t *B, int32_
c_ptr0 = result_buffer;
}
- switch(M-y) {
+ switch(rows_to_compute) {
case 1:
__asm __volatile (
"cbnz %[append], 1f\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp
index 5295650e7b..cdeb5e8b36 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
index e8ed0c311e..91870e2e54 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,10 +32,7 @@
namespace arm_gemm {
-void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *bias, Activation act, bool append) {
- UNUSED(bias);
- UNUSED(act);
-
+void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool append) {
const int K_stride = ((K + 3) / 4) * 4;
const long loops_count = ((K + 16) / 32) - 1;
K -= loops_count * 32;
@@ -44,12 +41,23 @@ void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *A, int lda, const uint8_t *B,
const long blocks_count = K / 4;
const long odds_count = K - (blocks_count * 4);
- for (int y=0; y<M; y+=4) {
+ int rows_to_compute;
+
+ for (int y=0; y<M; y+=rows_to_compute) {
const uint8_t * const a_ptr0_base = A + (y * lda);
const unsigned long ldab = lda * sizeof(uint8_t);
uint32_t *c_ptr0 = C + (y * ldc);
+ rows_to_compute = M-y;
+ if (rows_to_compute > 4) {
+ if (rows_to_compute % 4) {
+ rows_to_compute = 4 - 1;
+ } else {
+ rows_to_compute = 4;
+ }
+ }
+
for (int x0=0; x0<N; x0+=16ul) {
const long width = std::min((unsigned long)N-x0, 16ul);
long loops = loops_count;
@@ -73,7 +81,7 @@ void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *A, int lda, const uint8_t *B,
c_ptr0 = result_buffer;
}
- switch(M-y) {
+ switch(rows_to_compute) {
case 1:
__asm __volatile (
"temploadreg0 .req X0\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
index 23d919a64c..0436547af0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,10 +32,7 @@
namespace arm_gemm {
-void a64_hybrid_u8u32_dot_16x4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *bias, Activation act, bool append) {
- UNUSED(bias);
- UNUSED(act);
-
+void a64_hybrid_u8u32_dot_16x4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool append) {
const int K_stride = ((K + 3) / 4) * 4;
const long loops_count = ((K + 16) / 32) - 1;
K -= loops_count * 32;
@@ -44,12 +41,23 @@ void a64_hybrid_u8u32_dot_16x4(const uint8_t *A, int lda, const uint8_t *B, uint
const long blocks_count = K / 4;
const long odds_count = K - (blocks_count * 4);
- for (int y=0; y<M; y+=4) {
+ int rows_to_compute;
+
+ for (int y=0; y<M; y+=rows_to_compute) {
const uint8_t * const a_ptr0_base = A + (y * lda);
const unsigned long ldab = lda * sizeof(uint8_t);
uint32_t *c_ptr0 = C + (y * ldc);
+ rows_to_compute = M-y;
+ if (rows_to_compute > 4) {
+ if (rows_to_compute % 4) {
+ rows_to_compute = 4 - 1;
+ } else {
+ rows_to_compute = 4;
+ }
+ }
+
for (int x0=0; x0<N; x0+=16ul) {
const long width = std::min((unsigned long)N-x0, 16ul);
long loops = loops_count;
@@ -73,7 +81,7 @@ void a64_hybrid_u8u32_dot_16x4(const uint8_t *A, int lda, const uint8_t *B, uint
c_ptr0 = result_buffer;
}
- switch(M-y) {
+ switch(rows_to_compute) {
case 1:
__asm __volatile (
"cbnz %[append], 1f\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8.hpp
index 0f6c34500d..95fed86c2f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,6 +32,7 @@ namespace arm_gemm {
// Actual kernel implementations
void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+void a64_interleaved_bf16fp32_dot_12x8_x1(const bfloat16 *, const bfloat16 *, float *, int, int, int);
class interleaved_bf16fp32_dot_12x8 {
public:
@@ -61,7 +62,12 @@ public:
kern_type kernel=a64_interleaved_bf16fp32_dot_12x8;
- interleaved_bf16fp32_dot_12x8(const CPUInfo *ci) { UNUSED(ci); }
+ interleaved_bf16fp32_dot_12x8(const CPUInfo *ci)
+ {
+ if (ci->get_cpu_model() == CPUModel::X1) {
+ kernel = a64_interleaved_bf16fp32_dot_12x8_x1;
+ }
+ }
};
} // namespace arm_gemm
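
The constructor change above is the dispatch point for the new Cortex-X1 tuned bfdot kernel: the generic kernel stays the default and the function pointer is swapped only when the detected CPU model is X1. A condensed, hypothetical free-function view of that selection, using the kernel declarations and CPUInfo/CPUModel accessors visible in this header (assumes the surrounding arm_gemm headers are included):

    // Hypothetical standalone mirror of the constructor's selection logic.
    using bf16_kern = void (*)(const bfloat16 *, const bfloat16 *, float *, int, int, int);

    static bf16_kern select_bf16fp32_dot_12x8(const CPUInfo *ci) {
        if (ci->get_cpu_model() == CPUModel::X1) {
            return a64_interleaved_bf16fp32_dot_12x8_x1;  // X1-tuned variant
        }
        return a64_interleaved_bf16fp32_dot_12x8;         // generic default
    }
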
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp
index 8ce6a601fd..7ffae524dc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -57,13 +57,11 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B
"movi v12.4s, #0\n"
"ldr q2, [%[a_ptr], #0x20]\n"
"movi v13.4s, #0\n"
- "ldr q6, [%[b_ptr], #0x20]\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
"movi v14.4s, #0\n"
- "ldr q3, [%[a_ptr], #0x30]\n"
+ "add %[b_ptr], %[b_ptr], #0x30\n"
"movi v15.4s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
"movi v16.4s, #0\n"
- "add %[b_ptr], %[b_ptr], #0x30\n"
"movi v17.4s, #0\n"
"movi v18.4s, #0\n"
"movi v19.4s, #0\n"
@@ -82,9 +80,11 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B
"cbz %[loops], 1f\n"
"2:\n"
".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
- "subs %[loops], %[loops], #0x1\n"
+ "ldr q6, [%[b_ptr], #-0x10]\n"
".inst 0x4f60f089 // bfdot v9.4s, v4.8h, v0.h[1]\n"
+ "ldr q3, [%[a_ptr], #-0x10]\n"
".inst 0x4f40f88a // bfdot v10.4s, v4.8h, v0.h[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
".inst 0x4f60f88b // bfdot v11.4s, v4.8h, v0.h[3]\n"
".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
".inst 0x4f61f095 // bfdot v21.4s, v4.8h, v1.h[1]\n"
@@ -140,13 +140,13 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B
".inst 0x4f63f0dd // bfdot v29.4s, v6.8h, v3.h[1]\n"
".inst 0x4f43f8de // bfdot v30.4s, v6.8h, v3.h[2]\n"
".inst 0x4f63f8df // bfdot v31.4s, v6.8h, v3.h[3]\n"
- "ldr q6, [%[b_ptr], #-0x10]\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
"b.ne 2b\n"
"1:\n"
"cbz %[tails], 3f\n"
".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
+ "ldr q6, [%[b_ptr], #-0x10]\n"
".inst 0x4f60f089 // bfdot v9.4s, v4.8h, v0.h[1]\n"
+ "ldr q3, [%[a_ptr], #-0x10]\n"
".inst 0x4f40f88a // bfdot v10.4s, v4.8h, v0.h[2]\n"
".inst 0x4f60f88b // bfdot v11.4s, v4.8h, v0.h[3]\n"
".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
@@ -178,12 +178,13 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B
".inst 0x4f62f089 // bfdot v9.4s, v4.8h, v2.h[1]\n"
"add %[a_ptr], %[a_ptr], #0x20\n"
".inst 0x4f42f88a // bfdot v10.4s, v4.8h, v2.h[2]\n"
+ "add %[b_ptr], %[b_ptr], #0x60\n"
".inst 0x4f62f88b // bfdot v11.4s, v4.8h, v2.h[3]\n"
".inst 0x4f43f094 // bfdot v20.4s, v4.8h, v3.h[0]\n"
".inst 0x4f63f095 // bfdot v21.4s, v4.8h, v3.h[1]\n"
".inst 0x4f43f896 // bfdot v22.4s, v4.8h, v3.h[2]\n"
".inst 0x4f63f897 // bfdot v23.4s, v4.8h, v3.h[3]\n"
- "ldr q4, [%[b_ptr], #0x30]\n"
+ "ldr q4, [%[b_ptr], #-0x30]\n"
".inst 0x4f42f0ac // bfdot v12.4s, v5.8h, v2.h[0]\n"
".inst 0x4f62f0ad // bfdot v13.4s, v5.8h, v2.h[1]\n"
".inst 0x4f42f8ae // bfdot v14.4s, v5.8h, v2.h[2]\n"
@@ -192,7 +193,7 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B
".inst 0x4f63f0b9 // bfdot v25.4s, v5.8h, v3.h[1]\n"
".inst 0x4f43f8ba // bfdot v26.4s, v5.8h, v3.h[2]\n"
".inst 0x4f63f8bb // bfdot v27.4s, v5.8h, v3.h[3]\n"
- "ldr q5, [%[b_ptr], #0x40]\n"
+ "ldr q5, [%[b_ptr], #-0x20]\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
".inst 0x4f62f0d1 // bfdot v17.4s, v6.8h, v2.h[1]\n"
".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
@@ -201,13 +202,12 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B
".inst 0x4f63f0dd // bfdot v29.4s, v6.8h, v3.h[1]\n"
".inst 0x4f43f8de // bfdot v30.4s, v6.8h, v3.h[2]\n"
".inst 0x4f63f8df // bfdot v31.4s, v6.8h, v3.h[3]\n"
- "ldr q6, [%[b_ptr], #0x50]\n"
+ "ldr q6, [%[b_ptr], #-0x10]\n"
".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
- "add %[b_ptr], %[b_ptr], #0x60\n"
".inst 0x4f60f089 // bfdot v9.4s, v4.8h, v0.h[1]\n"
".inst 0x4f40f88a // bfdot v10.4s, v4.8h, v0.h[2]\n"
- "str q8, [%[c_ptr]]\n"
".inst 0x4f60f88b // bfdot v11.4s, v4.8h, v0.h[3]\n"
+ "str q8, [%[c_ptr]]\n"
".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
".inst 0x4f61f095 // bfdot v21.4s, v4.8h, v1.h[1]\n"
".inst 0x4f41f896 // bfdot v22.4s, v4.8h, v1.h[2]\n"
@@ -234,14 +234,17 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B
"b 4f\n"
"3:\n"
".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
+ "ldr q6, [%[b_ptr], #-0x10]\n"
".inst 0x4f60f089 // bfdot v9.4s, v4.8h, v0.h[1]\n"
+ "ldr q3, [%[a_ptr], #-0x10]\n"
".inst 0x4f40f88a // bfdot v10.4s, v4.8h, v0.h[2]\n"
+ "add %[b_ptr], %[b_ptr], #0x30\n"
".inst 0x4f60f88b // bfdot v11.4s, v4.8h, v0.h[3]\n"
".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
".inst 0x4f61f095 // bfdot v21.4s, v4.8h, v1.h[1]\n"
".inst 0x4f41f896 // bfdot v22.4s, v4.8h, v1.h[2]\n"
".inst 0x4f61f897 // bfdot v23.4s, v4.8h, v1.h[3]\n"
- "ldr q4, [%[b_ptr]]\n"
+ "ldr q4, [%[b_ptr], #-0x30]\n"
".inst 0x4f40f0ac // bfdot v12.4s, v5.8h, v0.h[0]\n"
".inst 0x4f60f0ad // bfdot v13.4s, v5.8h, v0.h[1]\n"
".inst 0x4f40f8ae // bfdot v14.4s, v5.8h, v0.h[2]\n"
@@ -250,7 +253,7 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B
".inst 0x4f61f0b9 // bfdot v25.4s, v5.8h, v1.h[1]\n"
".inst 0x4f41f8ba // bfdot v26.4s, v5.8h, v1.h[2]\n"
".inst 0x4f61f8bb // bfdot v27.4s, v5.8h, v1.h[3]\n"
- "ldr q5, [%[b_ptr], #0x10]\n"
+ "ldr q5, [%[b_ptr], #-0x20]\n"
".inst 0x4f40f0d0 // bfdot v16.4s, v6.8h, v0.h[0]\n"
".inst 0x4f60f0d1 // bfdot v17.4s, v6.8h, v0.h[1]\n"
".inst 0x4f40f8d2 // bfdot v18.4s, v6.8h, v0.h[2]\n"
@@ -259,13 +262,12 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B
".inst 0x4f61f0dd // bfdot v29.4s, v6.8h, v1.h[1]\n"
".inst 0x4f41f8de // bfdot v30.4s, v6.8h, v1.h[2]\n"
".inst 0x4f61f8df // bfdot v31.4s, v6.8h, v1.h[3]\n"
- "ldr q6, [%[b_ptr], #0x20]\n"
+ "ldr q6, [%[b_ptr], #-0x10]\n"
".inst 0x4f42f088 // bfdot v8.4s, v4.8h, v2.h[0]\n"
- "add %[b_ptr], %[b_ptr], #0x30\n"
".inst 0x4f62f089 // bfdot v9.4s, v4.8h, v2.h[1]\n"
".inst 0x4f42f88a // bfdot v10.4s, v4.8h, v2.h[2]\n"
- "str q8, [%[c_ptr]]\n"
".inst 0x4f62f88b // bfdot v11.4s, v4.8h, v2.h[3]\n"
+ "str q8, [%[c_ptr]]\n"
".inst 0x4f43f094 // bfdot v20.4s, v4.8h, v3.h[0]\n"
".inst 0x4f63f095 // bfdot v21.4s, v4.8h, v3.h[1]\n"
".inst 0x4f43f896 // bfdot v22.4s, v4.8h, v3.h[2]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp
new file mode 100644
index 0000000000..58a51432fd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "../../bfloat.hpp"
+#include "../../asmlib.hpp"
+
+namespace arm_gemm {
+
+void a64_interleaved_bf16fp32_dot_12x8_x1(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+ const bfloat16 *a_ptr = Apanel;
+ float *c_ptr = Cpanel;
+
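+  // The asm main loop handles two K steps per iteration; the final iteration is
+  // peeled off (hence the -1 in loops_count) and tails_count flags an odd
+  // trailing step that goes through the tail path.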
+ K /= 2;
+ const long loops_count = (K / 2) - 1;
+ const long tails_count = K % 2;
+
+ for (int yb=0; yb<ablocks; yb++) {
+ const bfloat16 *a_ptr0 = a_ptr;
+ const bfloat16 *b_ptr = Bpanel;
+
+ for (int xb=0; xb<bblocks; xb++) {
+ a_ptr = a_ptr0;
+ long loops = loops_count;
+ long tails = tails_count;
+
+ __asm __volatile (
+ "movi v8.4s, #0\n"
+ "ldr q0, [%[a_ptr]]\n"
+ "movi v9.4s, #0\n"
+ "ldr q2, [%[b_ptr]]\n"
+ "movi v10.4s, #0\n"
+ "ldr q1, [%[a_ptr], #0x10]\n"
+ "movi v11.4s, #0\n"
+ "ldr q3, [%[b_ptr], #0x10]\n"
+ "movi v12.4s, #0\n"
+ "ldr q4, [%[b_ptr], #0x20]\n"
+ "movi v13.4s, #0\n"
+ "add %[a_ptr], %[a_ptr], #0x20\n"
+ "movi v14.4s, #0\n"
+ "add %[b_ptr], %[b_ptr], #0x30\n"
+ "movi v15.4s, #0\n"
+ "movi v16.4s, #0\n"
+ "movi v17.4s, #0\n"
+ "movi v18.4s, #0\n"
+ "movi v19.4s, #0\n"
+ "movi v20.4s, #0\n"
+ "movi v21.4s, #0\n"
+ "movi v22.4s, #0\n"
+ "movi v23.4s, #0\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
+ ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
+ ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
+ ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
+ ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
+ ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
+ ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
+ "ldr q2, [%[b_ptr]]\n"
+ ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
+ ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
+ ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
+ ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
+ ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
+ ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
+ ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
+ ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
+ "ldr q3, [%[b_ptr], #0x10]\n"
+ ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
+ ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
+ ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
+ ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
+ "ldr q0, [%[a_ptr]]\n"
+ ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
+ ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
+ ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
+ ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
+ "ldr q4, [%[b_ptr], #0x20]\n"
+ ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
+ "ldr q1, [%[a_ptr], #0x10]\n"
+ ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
+ ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
+ "add %[b_ptr], %[b_ptr], #0x60\n"
+ ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
+ ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
+ ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
+ ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
+ ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
+ "ldr q2, [%[b_ptr], #-0x30]\n"
+ ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
+ ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
+ ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
+ ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
+ ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
+ ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
+ ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
+ ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
+ "ldr q3, [%[b_ptr], #-0x20]\n"
+ ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
+ ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
+ ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
+ ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
+ "ldr q0, [%[a_ptr], #-0x20]\n"
+ ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
+ ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
+ ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
+ ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
+ "ldr q4, [%[b_ptr], #-0x10]\n"
+ "ldr q1, [%[a_ptr], #-0x10]\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[tails], 3f\n"
+ ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
+ ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
+ ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
+ ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
+ ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
+ ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
+ ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
+ ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
+ "ldr q2, [%[b_ptr]]\n"
+ ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
+ ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
+ ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
+ ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
+ ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
+ ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
+ ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
+ ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
+ "ldr q3, [%[b_ptr], #0x10]\n"
+ ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
+ ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
+ ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
+ ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
+ "ldr q0, [%[a_ptr]]\n"
+ ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
+ ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
+ ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
+ ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
+ "ldr q4, [%[b_ptr], #0x20]\n"
+ ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
+ "ldr q1, [%[a_ptr], #0x10]\n"
+ ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
+ ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
+ "add %[b_ptr], %[b_ptr], #0x60\n"
+ ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
+ ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
+ ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
+ ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
+ ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
+ "ldr q2, [%[b_ptr], #-0x30]\n"
+ ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
+ ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
+ ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
+ ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
+ ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
+ ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
+ ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
+ ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
+ "ldr q3, [%[b_ptr], #-0x20]\n"
+ ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
+ ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
+ ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
+ ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
+ "ldr q0, [%[a_ptr], #-0x20]\n"
+ ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
+ ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
+ ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
+ ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
+ "ldr q4, [%[b_ptr], #-0x10]\n"
+ ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
+ "ldr q1, [%[a_ptr], #-0x10]\n"
+ ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
+ ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
+ ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
+ "str q8, [%[c_ptr]]\n"
+ ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
+ ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
+ ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
+ ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
+ ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
+ ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
+ ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
+ ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
+ "str q12, [%[c_ptr], #0x10]\n"
+ ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
+ ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
+ ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
+ ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
+ ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
+ ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
+ ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
+ ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
+ "str q16, [%[c_ptr], #0x20]\n"
+ ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
+ ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
+ ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
+ "str q9, [%[c_ptr], #0x30]\n"
+ ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
+ "b 4f\n"
+ "3:\n"
+ ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
+ "add %[a_ptr], %[a_ptr], #0x20\n"
+ ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
+ "add %[b_ptr], %[b_ptr], #0x30\n"
+ ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
+ ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
+ ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
+ ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
+ ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
+ ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
+ "ldr q2, [%[b_ptr], #-0x30]\n"
+ ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
+ ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
+ ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
+ ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
+ ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
+ ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
+ ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
+ ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
+ "ldr q3, [%[b_ptr], #-0x20]\n"
+ ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
+ ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
+ ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
+ ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
+ "ldr q0, [%[a_ptr], #-0x20]\n"
+ ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
+ ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
+ ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
+ ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
+ "ldr q4, [%[b_ptr], #-0x10]\n"
+ ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
+ "ldr q1, [%[a_ptr], #-0x10]\n"
+ ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
+ ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
+ ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
+ "str q8, [%[c_ptr]]\n"
+ ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
+ ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
+ ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
+ ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
+ ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
+ ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
+ ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
+ ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
+ "str q12, [%[c_ptr], #0x10]\n"
+ ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
+ ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
+ ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
+ ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
+ ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
+ ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
+ ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
+ ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
+ "str q16, [%[c_ptr], #0x20]\n"
+ ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
+ ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
+ ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
+ "str q9, [%[c_ptr], #0x30]\n"
+ ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
+ "4:\n"
+ "str q13, [%[c_ptr], #0x40]\n"
+ "str q17, [%[c_ptr], #0x50]\n"
+ "str q10, [%[c_ptr], #0x60]\n"
+ "str q14, [%[c_ptr], #0x70]\n"
+ "str q18, [%[c_ptr], #0x80]\n"
+ "str q11, [%[c_ptr], #0x90]\n"
+ "str q15, [%[c_ptr], #0xa0]\n"
+ "str q19, [%[c_ptr], #0xb0]\n"
+ "str q20, [%[c_ptr], #0xc0]\n"
+ "str q24, [%[c_ptr], #0xd0]\n"
+ "str q28, [%[c_ptr], #0xe0]\n"
+ "str q21, [%[c_ptr], #0xf0]\n"
+ "str q25, [%[c_ptr], #0x100]\n"
+ "str q29, [%[c_ptr], #0x110]\n"
+ "str q22, [%[c_ptr], #0x120]\n"
+ "str q26, [%[c_ptr], #0x130]\n"
+ "str q30, [%[c_ptr], #0x140]\n"
+ "str q23, [%[c_ptr], #0x150]\n"
+ "str q27, [%[c_ptr], #0x160]\n"
+ "str q31, [%[c_ptr], #0x170]\n"
+ "add %[c_ptr], %[c_ptr], #0x180\n"
+ : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+ [loops] "+r" (loops), [tails] "+r" (tails)
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+ );
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp
index 7f928fa727..7fac59947e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,7 +61,10 @@ public:
kern_type kernel=a64_interleaved_bf16fp32_mmla_12x8;
- interleaved_bf16fp32_mmla_12x8(const CPUInfo *ci) { UNUSED(ci); }
+ interleaved_bf16fp32_mmla_12x8(const CPUInfo *)
+ {
+
+ }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp
index be87f442ea..7f0eff29af 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -59,42 +59,65 @@ void a64_interleaved_bf16fp32_mmla_12x8(const bfloat16 *Apanel, const bfloat16 *
"movi v13.4s, #0\n"
"ldr q6, [%[b_ptr], #0x20]\n"
"movi v14.4s, #0\n"
- "ldr q3, [%[a_ptr], #0x30]\n"
+ "prfm PLDL1KEEP, [%[a_ptr], #0x40]\n"
"movi v15.4s, #0\n"
- "ldr q7, [%[b_ptr], #0x30]\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x40]\n"
"movi v16.4s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
+ "prfm PLDL1KEEP, [%[a_ptr], #0x80]\n"
"movi v17.4s, #0\n"
- "add %[b_ptr], %[b_ptr], #0x40\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x80]\n"
"movi v18.4s, #0\n"
+ "prfm PLDL1KEEP, [%[a_ptr], #0xc0]\n"
"movi v19.4s, #0\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0xc0]\n"
"movi v20.4s, #0\n"
+ "prfm PLDL1KEEP, [%[a_ptr], #0x100]\n"
"movi v21.4s, #0\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x100]\n"
"movi v22.4s, #0\n"
+ "prfm PLDL1KEEP, [%[a_ptr], #0x140]\n"
"movi v23.4s, #0\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x140]\n"
"movi v24.4s, #0\n"
+ "prfm PLDL1KEEP, [%[a_ptr], #0x180]\n"
"movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x180]\n"
"movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [%[a_ptr], #0x1c0]\n"
"movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x1c0]\n"
"movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x200]\n"
"movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x240]\n"
"movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x280]\n"
"movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x2c0]\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
+ "add %[b_ptr], %[b_ptr], #0x40\n"
"cbz %[loops], 1f\n"
"2:\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- "subs %[loops], %[loops], #0x1\n"
+ "ldr q7, [%[b_ptr], #-0x10]\n"
".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
+ "ldr q3, [%[a_ptr], #-0x10]\n"
".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n"
+ "prfm PLDL1KEEP, [%[a_ptr], #0x1c0]\n"
".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
"ldr q4, [%[b_ptr]]\n"
- ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n"
".inst 0x6e45ec2f // bfmmla v15.4s, v1.8h, v5.8h\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x2c0]\n"
".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n"
+ "prfm PLDL1KEEP, [%[a_ptr], #0x200]\n"
".inst 0x6e45ec7b // bfmmla v27.4s, v3.8h, v5.8h\n"
"ldr q5, [%[b_ptr], #0x10]\n"
".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x300]\n"
".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x340]\n"
".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n"
"ldr q6, [%[b_ptr], #0x20]\n"
@@ -151,18 +174,18 @@ void a64_interleaved_bf16fp32_mmla_12x8(const bfloat16 *Apanel, const bfloat16 *
".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n"
"ldr q2, [%[a_ptr], #-0x20]\n"
".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n"
- "ldr q7, [%[b_ptr], #-0x10]\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
"b.ne 2b\n"
"1:\n"
"cbz %[tails], 3f\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "ldr q7, [%[b_ptr], #-0x10]\n"
".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
+ "ldr q3, [%[a_ptr], #-0x10]\n"
".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
- ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
- "ldr q4, [%[b_ptr]]\n"
".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n"
".inst 0x6e45ec2f // bfmmla v15.4s, v1.8h, v5.8h\n"
+ ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
+ "ldr q4, [%[b_ptr]]\n"
".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n"
".inst 0x6e45ec7b // bfmmla v27.4s, v3.8h, v5.8h\n"
"ldr q5, [%[b_ptr], #0x10]\n"
@@ -268,13 +291,15 @@ void a64_interleaved_bf16fp32_mmla_12x8(const bfloat16 *Apanel, const bfloat16 *
"b 4f\n"
"3:\n"
".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
+ "ldr q7, [%[b_ptr], #-0x10]\n"
".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
- "add %[b_ptr], %[b_ptr], #0x80\n"
+ "ldr q3, [%[a_ptr], #-0x10]\n"
".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
+ ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n"
+ "add %[b_ptr], %[b_ptr], #0x80\n"
".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
"ldr q4, [%[b_ptr], #-0x80]\n"
- ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n"
".inst 0x6e45ec2f // bfmmla v15.4s, v1.8h, v5.8h\n"
".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n"
".inst 0x6e45ec7b // bfmmla v27.4s, v3.8h, v5.8h\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp
index f669b870c6..7bfb2291a9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,9 +61,9 @@ public:
kern_type kernel=a64_interleaved_s8s32_mmla_12x8;
- interleaved_s8s32_mmla_12x8(const CPUInfo *ci)
+ interleaved_s8s32_mmla_12x8(const CPUInfo *)
{
- UNUSED(ci);
+
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp
index 49dbdb866e..7953510aa7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -59,13 +59,11 @@ void a64_interleaved_s8s32_mmla_12x8(const int8_t *Apanel, const int8_t *Bpanel,
"movi v13.4s, #0\n"
"ldr q6, [%[b_ptr], #0x20]\n"
"movi v14.4s, #0\n"
- "ldr q3, [%[a_ptr], #0x30]\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
"movi v15.4s, #0\n"
- "ldr q7, [%[b_ptr], #0x30]\n"
+ "add %[b_ptr], %[b_ptr], #0x40\n"
"movi v16.4s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
"movi v17.4s, #0\n"
- "add %[b_ptr], %[b_ptr], #0x40\n"
"movi v18.4s, #0\n"
"movi v19.4s, #0\n"
"movi v20.4s, #0\n"
@@ -83,12 +81,14 @@ void a64_interleaved_s8s32_mmla_12x8(const int8_t *Apanel, const int8_t *Bpanel,
"cbz %[loops], 1f\n"
"2:\n"
".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
- "subs %[loops], %[loops], #0x1\n"
+ "ldr q7, [%[b_ptr], #-0x10]\n"
".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
+ "ldr q3, [%[a_ptr], #-0x10]\n"
".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".inst 0x4e85a409 // smmla v9.4s, v0.16b, v5.16b\n"
".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
"ldr q4, [%[b_ptr]]\n"
- ".inst 0x4e85a409 // smmla v9.4s, v0.16b, v5.16b\n"
".inst 0x4e85a42f // smmla v15.4s, v1.16b, v5.16b\n"
".inst 0x4e85a455 // smmla v21.4s, v2.16b, v5.16b\n"
".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n"
@@ -151,18 +151,18 @@ void a64_interleaved_s8s32_mmla_12x8(const int8_t *Apanel, const int8_t *Bpanel,
".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
"ldr q2, [%[a_ptr], #-0x20]\n"
".inst 0x4e87a47f // smmla v31.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #-0x10]\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
"b.ne 2b\n"
"1:\n"
"cbz %[tails], 3f\n"
".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
+ "ldr q7, [%[b_ptr], #-0x10]\n"
".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
+ "ldr q3, [%[a_ptr], #-0x10]\n"
".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
- ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr]]\n"
".inst 0x4e85a409 // smmla v9.4s, v0.16b, v5.16b\n"
".inst 0x4e85a42f // smmla v15.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
+ "ldr q4, [%[b_ptr]]\n"
".inst 0x4e85a455 // smmla v21.4s, v2.16b, v5.16b\n"
".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n"
"ldr q5, [%[b_ptr], #0x10]\n"
@@ -268,13 +268,15 @@ void a64_interleaved_s8s32_mmla_12x8(const int8_t *Apanel, const int8_t *Bpanel,
"b 4f\n"
"3:\n"
".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
+ "ldr q7, [%[b_ptr], #-0x10]\n"
".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
- "add %[b_ptr], %[b_ptr], #0x80\n"
+ "ldr q3, [%[a_ptr], #-0x10]\n"
".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
+ ".inst 0x4e85a409 // smmla v9.4s, v0.16b, v5.16b\n"
+ "add %[b_ptr], %[b_ptr], #0x80\n"
".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
"ldr q4, [%[b_ptr], #-0x80]\n"
- ".inst 0x4e85a409 // smmla v9.4s, v0.16b, v5.16b\n"
".inst 0x4e85a42f // smmla v15.4s, v1.16b, v5.16b\n"
".inst 0x4e85a455 // smmla v21.4s, v2.16b, v5.16b\n"
".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8.hpp
index d66edd832a..d493517cf1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,9 +61,9 @@ public:
kern_type kernel=a64_interleaved_u8u32_mmla_12x8;
- interleaved_u8u32_mmla_12x8(const CPUInfo *ci)
+ interleaved_u8u32_mmla_12x8(const CPUInfo *)
{
- UNUSED(ci);
+
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp
index e182a425f4..dcd15f0345 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -59,13 +59,11 @@ void a64_interleaved_u8u32_mmla_12x8(const uint8_t *Apanel, const uint8_t *Bpane
"movi v13.4s, #0\n"
"ldr q6, [%[b_ptr], #0x20]\n"
"movi v14.4s, #0\n"
- "ldr q3, [%[a_ptr], #0x30]\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
"movi v15.4s, #0\n"
- "ldr q7, [%[b_ptr], #0x30]\n"
+ "add %[b_ptr], %[b_ptr], #0x40\n"
"movi v16.4s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
"movi v17.4s, #0\n"
- "add %[b_ptr], %[b_ptr], #0x40\n"
"movi v18.4s, #0\n"
"movi v19.4s, #0\n"
"movi v20.4s, #0\n"
@@ -83,12 +81,14 @@ void a64_interleaved_u8u32_mmla_12x8(const uint8_t *Apanel, const uint8_t *Bpane
"cbz %[loops], 1f\n"
"2:\n"
".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
- "subs %[loops], %[loops], #0x1\n"
+ "ldr q7, [%[b_ptr], #-0x10]\n"
".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
+ "ldr q3, [%[a_ptr], #-0x10]\n"
".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n"
".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
"ldr q4, [%[b_ptr]]\n"
- ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n"
".inst 0x6e85a42f // ummla v15.4s, v1.16b, v5.16b\n"
".inst 0x6e85a455 // ummla v21.4s, v2.16b, v5.16b\n"
".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
@@ -151,18 +151,18 @@ void a64_interleaved_u8u32_mmla_12x8(const uint8_t *Apanel, const uint8_t *Bpane
".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
"ldr q2, [%[a_ptr], #-0x20]\n"
".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #-0x10]\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
"b.ne 2b\n"
"1:\n"
"cbz %[tails], 3f\n"
".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
+ "ldr q7, [%[b_ptr], #-0x10]\n"
".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
+ "ldr q3, [%[a_ptr], #-0x10]\n"
".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
- ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr]]\n"
".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n"
".inst 0x6e85a42f // ummla v15.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
+ "ldr q4, [%[b_ptr]]\n"
".inst 0x6e85a455 // ummla v21.4s, v2.16b, v5.16b\n"
".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
"ldr q5, [%[b_ptr], #0x10]\n"
@@ -268,13 +268,15 @@ void a64_interleaved_u8u32_mmla_12x8(const uint8_t *Apanel, const uint8_t *Bpane
"b 4f\n"
"3:\n"
".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
+ "ldr q7, [%[b_ptr], #-0x10]\n"
".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
- "add %[b_ptr], %[b_ptr], #0x80\n"
+ "ldr q3, [%[a_ptr], #-0x10]\n"
".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
+ ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n"
+ "add %[b_ptr], %[b_ptr], #0x80\n"
".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
"ldr q4, [%[b_ptr], #-0x80]\n"
- ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n"
".inst 0x6e85a42f // ummla v15.4s, v1.16b, v5.16b\n"
".inst 0x6e85a455 // ummla v21.4s, v2.16b, v5.16b\n"
".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_native_fp32_mla_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_native_fp32_mla_16x4.hpp
index a86e8ec068..d7bf43deca 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_native_fp32_mla_16x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_native_fp32_mla_16x4.hpp
@@ -25,6 +25,9 @@
#ifdef __aarch64__
+
+
+
namespace arm_gemm
{
@@ -75,7 +78,10 @@ public:
// Default to the generic kernel
kern_type kernel=a64_native_fp32_mla_16x4;
- native_fp32_mla_16x4(const CPUInfo *ci) { UNUSED(ci); }
+ native_fp32_mla_16x4(const CPUInfo *)
+ {
+
+ }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
index ddc97b47f4..3eff767d6c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
@@ -34,6 +34,7 @@ void a64_sgemm_asimd_12x8(const float *, const float *, float *, int, int, int);
void a64_sgemm_asimd_12x8_a53(const float *, const float *, float *, int, int, int);
void a64_sgemm_asimd_12x8_a55(const float *, const float *, float *, int, int, int);
void a64_sgemm_asimd_12x8_a55r1(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_12x8_x1(const float *, const float *, float *, int, int, int);
// 12x8 SGEMM "strategy" class.
//
@@ -83,6 +84,10 @@ public:
kernel = a64_sgemm_asimd_12x8_a55r1;
break;
+ case CPUModel::X1:
+ kernel = a64_sgemm_asimd_12x8_x1;
+ break;
+
default:
/* Generic kernel is initialized by default. */
break;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp
new file mode 100644
index 0000000000..63fdf4df9f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp
@@ -0,0 +1,354 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 12x8), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
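+//
+// As an illustrative sketch only (not part of the kernel itself), the walk over
+// the output implied by this layout is:
+//
+//   float *c = Cpanel;
+//   for (int yb = 0; yb < ablocks; yb++) {     // one 8-row block of A
+//       for (int xb = 0; xb < bblocks; xb++) { // one 12-column block of B
+//           /* a 12x8 block of C is produced here */
+//           c += 12 * 8;                       // 96 floats = 384 bytes, matching the asm advance
+//       }
+//   }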
+
+namespace arm_gemm {
+
+void a64_sgemm_asimd_12x8_x1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+ const float *a_ptr = Apanel;
+ float *c_ptr = Cpanel;
+
+ for (int yb=0; yb<ablocks; yb++) {
+ const float *a_ptr0 = a_ptr;
+ const float *b_ptr = Bpanel;
+
+ for (int xb=0; xb<bblocks; xb++) {
+ a_ptr = a_ptr0;
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ int oddk = (K & 1);
+ int k = ((K+1)/2) - 1;
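+            // e.g. K==5: oddk=1, k=2 (two main-loop passes covering 4 K steps,
+            // plus the odd-K tail); K==2: oddk=0, k=0 (main loop skipped).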
+
+ register float32x4_t a0 asm("v0");
+ register float32x4_t a1 asm("v1");
+ register float32x4_t b0 asm("v2");
+ register float32x4_t b1 asm("v3");
+ register float32x4_t b2 asm("v4");
+
+ __asm __volatile (
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #64]")
+ "movi v13.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #64]")
+ "movi v14.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #128]")
+ "movi v15.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]")
+ "movi v16.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #192]")
+ "movi v17.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #256]")
+ "movi v18.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #192]")
+ "movi v19.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #320]")
+ "movi v20.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #256]")
+ "movi v21.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ // Loop proper
+ "1:\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ ASM_PREFETCH("[%[a_ptr], #320]")
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #448]")
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "ldr %q[a0], [%[a_ptr], #32]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "ldr %q[a1], [%[a_ptr], #48]\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "ldr %q[b0], [%[b_ptr], #96]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ ASM_PREFETCH("[%[b_ptr], #512]")
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "ldr %q[b1], [%[b_ptr], #112]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "bne 1b\n"
+
+ // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
+ "4:\n"
+
+ // Branch to alternative tail for odd K
+ "cbnz %w[oddk], 2f\n"
+
+ // Detached final iteration (even K)
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "ldr %q[a0], [%[a_ptr], #-32]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "ldr %q[a1], [%[a_ptr], #-16]\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "ldr %q[b2], [%[b_ptr], #-16]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "str q24, [%[c_ptr], #32]\n"
+
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+ "b 3f\n"
+
+ // Detached final iteration (odd K)
+ "2:\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+ // Common tail
+ "3:\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+ :
+ [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+ [a0] "+w" (a0), [a1] "+w" (a1),
+ [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+ : [oddk] "r" (oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
+ );
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp
index a7162c9f5b..d24bf5fa10 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017,2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,7 +69,7 @@ public:
kern_type kernel = a64_sgemv_pretransposed;
- sgemv_pretransposed(const CPUInfo *ci) { UNUSED(ci); }
+ sgemv_pretransposed(const CPUInfo *) { }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
index 36f84d89fc..7592798b0d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017,2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,7 +49,7 @@ public:
kern_type kernel=a64_sgemv_trans;
- sgemv_trans(const CPUInfo *ci) { UNUSED(ci); }
+ sgemv_trans(const CPUInfo *) { }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp
index 352a147282..477f3005e6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp
@@ -75,7 +75,10 @@ public:
// Default to the generic kernel
kern_type kernel=a64_smallK_hybrid_fp32_mla_4x6;
- smallK_hybrid_fp32_mla_4x6(const CPUInfo *ci) { UNUSED(ci); }
+ smallK_hybrid_fp32_mla_4x6(const CPUInfo *)
+ {
+
+ }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp
index c5d39cbc87..1a0358b787 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp
@@ -25,8 +25,6 @@
#ifdef __aarch64__
-
-
namespace arm_gemm
{
@@ -77,7 +75,10 @@ public:
// Default to the generic kernel
kern_type kernel=a64_smallK_hybrid_fp32_mla_4x8;
- smallK_hybrid_fp32_mla_4x8(const CPUInfo *ci) { UNUSED(ci); }
+ smallK_hybrid_fp32_mla_4x8(const CPUInfo *)
+ {
+
+ }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp
index ac9a8d257c..efc109fb34 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -78,7 +78,10 @@ public:
// Default to the generic kernel
kern_type kernel=sve_hybrid_bf16fp32_dot_4VLx4;
- hybrid_bf16fp32_dot_4VLx4(const CPUInfo *ci) { UNUSED(ci); }
+ hybrid_bf16fp32_dot_4VLx4(const CPUInfo *)
+ {
+
+ }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp
index 1ee7b1cf55..f16f452739 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -62,12 +62,23 @@ void sve_hybrid_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B
break;
}
- for (int y=0; y<M; y+=4) {
+ int rows_to_compute;
+
+ for (int y=0; y<M; y+=rows_to_compute) {
const bfloat16 * const a_ptr0_base = A + (y * lda);
const unsigned long ldab = lda * sizeof(bfloat16);
float *c_ptr0 = C + (y * ldc);
+ rows_to_compute = M-y;
+ if (rows_to_compute > 4) {
+ if (rows_to_compute % 4) {
+ rows_to_compute = 4 - 1;
+ } else {
+ rows_to_compute = 4;
+ }
+ }
+
for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) {
const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>()));
long loops = loops_count;
@@ -79,7 +90,7 @@ void sve_hybrid_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B
const unsigned long ldcb = ldc * sizeof(float);
const float *biasptr = bias ? bias+x0 : nullbias;
- switch(M-y) {
+ switch(rows_to_compute) {
case 1:
__asm __volatile (
"whilelt p6.h, %[temp], %[leftovers]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp
index d889f99f8f..551c6f3a8c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -78,7 +78,10 @@ public:
// Default to the generic kernel
kern_type kernel=sve_hybrid_bf16fp32_mmla_4VLx4;
- hybrid_bf16fp32_mmla_4VLx4(const CPUInfo *ci) { UNUSED(ci); }
+ hybrid_bf16fp32_mmla_4VLx4(const CPUInfo *)
+ {
+
+ }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp
index e3debe508d..4b67d747e2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -62,12 +62,23 @@ void sve_hybrid_bf16fp32_mmla_4VLx4(const bfloat16 *A, int lda, const bfloat16 *
break;
}
- for (int y=0; y<M; y+=8) {
+ int rows_to_compute;
+
+ for (int y=0; y<M; y+=rows_to_compute) {
const bfloat16 * const a_ptr0_base = A + (y * lda);
const unsigned long ldab = lda * sizeof(bfloat16);
float *c_ptr0 = C + (y * ldc);
+ rows_to_compute = M-y;
+ if (rows_to_compute > 8) {
+ if (rows_to_compute % 8) {
+ rows_to_compute = 8 - 1;
+ } else {
+ rows_to_compute = 8;
+ }
+ }
+
for (int x0=0; x0<N; x0+=(2 * get_vector_length<float>())) {
const long width = std::min((unsigned long)N-x0, (2 * get_vector_length<float>()));
long loops = loops_count;
@@ -79,7 +90,7 @@ void sve_hybrid_bf16fp32_mmla_4VLx4(const bfloat16 *A, int lda, const bfloat16 *
const unsigned long ldcb = ldc * sizeof(float);
const float *biasptr = bias ? bias+x0 : nullbias;
- switch(M-y) {
+ switch(rows_to_compute) {
case 1:
__asm __volatile (
"whilelt p6.h, %[temp], %[leftovers]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp
index affcafe4aa..6f26fd1404 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -78,7 +78,10 @@ public:
// Default to the generic kernel
kern_type kernel=sve_hybrid_bf16fp32_mmla_6VLx2;
- hybrid_bf16fp32_mmla_6VLx2(const CPUInfo *ci) { UNUSED(ci); }
+ hybrid_bf16fp32_mmla_6VLx2(const CPUInfo *)
+ {
+
+ }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp
index 07ecbf35cd..fb943fe6fe 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -62,12 +62,23 @@ void sve_hybrid_bf16fp32_mmla_6VLx2(const bfloat16 *A, int lda, const bfloat16 *
break;
}
- for (int y=0; y<M; y+=4) {
+ int rows_to_compute;
+
+ for (int y=0; y<M; y+=rows_to_compute) {
const bfloat16 * const a_ptr0_base = A + (y * lda);
const unsigned long ldab = lda * sizeof(bfloat16);
float *c_ptr0 = C + (y * ldc);
+ rows_to_compute = M-y;
+ if (rows_to_compute > 4) {
+ if (rows_to_compute % 4) {
+ rows_to_compute = 4 - 1;
+ } else {
+ rows_to_compute = 4;
+ }
+ }
+
for (int x0=0; x0<N; x0+=(3 * get_vector_length<float>())) {
const long width = std::min((unsigned long)N-x0, (3 * get_vector_length<float>()));
long loops = loops_count;
@@ -79,7 +90,7 @@ void sve_hybrid_bf16fp32_mmla_6VLx2(const bfloat16 *A, int lda, const bfloat16 *
const unsigned long ldcb = ldc * sizeof(float);
const float *biasptr = bias ? bias+x0 : nullbias;
- switch(M-y) {
+ switch(rows_to_compute) {
case 1:
__asm __volatile (
"whilelt p6.h, %[temp], %[leftovers]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp
index af8babd113..0bf4492fdc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -78,7 +78,10 @@ public:
// Default to the generic kernel
kern_type kernel=sve_hybrid_bf16fp32_mmla_8VLx2;
- hybrid_bf16fp32_mmla_8VLx2(const CPUInfo *ci) { UNUSED(ci); }
+ hybrid_bf16fp32_mmla_8VLx2(const CPUInfo *)
+ {
+
+ }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp
index 73196164a7..3f201f0656 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -62,12 +62,23 @@ void sve_hybrid_bf16fp32_mmla_8VLx2(const bfloat16 *A, int lda, const bfloat16 *
break;
}
- for (int y=0; y<M; y+=4) {
+ int rows_to_compute;
+
+ for (int y=0; y<M; y+=rows_to_compute) {
const bfloat16 * const a_ptr0_base = A + (y * lda);
const unsigned long ldab = lda * sizeof(bfloat16);
float *c_ptr0 = C + (y * ldc);
+ rows_to_compute = M-y;
+ if (rows_to_compute > 4) {
+ if (rows_to_compute % 4) {
+ rows_to_compute = 4 - 1;
+ } else {
+ rows_to_compute = 4;
+ }
+ }
+
for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) {
const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>()));
long loops = loops_count;
@@ -79,7 +90,7 @@ void sve_hybrid_bf16fp32_mmla_8VLx2(const bfloat16 *A, int lda, const bfloat16 *
const unsigned long ldcb = ldc * sizeof(float);
const float *biasptr = bias ? bias+x0 : nullbias;
- switch(M-y) {
+ switch(rows_to_compute) {
case 1:
__asm __volatile (
"whilelt p6.h, %[temp], %[leftovers]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp
index 28ef8071c2..fb27b7e103 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -78,7 +78,10 @@ public:
// Default to the generic kernel
kern_type kernel=sve_hybrid_fp16_mla_4VLx4;
- hybrid_fp16_mla_4VLx4(const CPUInfo *ci) { UNUSED(ci); }
+ hybrid_fp16_mla_4VLx4(const CPUInfo *)
+ {
+
+ }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp
index 2998f33d87..3aef916ad2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,12 +61,23 @@ void sve_hybrid_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, __fp16
break;
}
- for (int y=0; y<M; y+=4) {
+ int rows_to_compute;
+
+ for (int y=0; y<M; y+=rows_to_compute) {
const __fp16 * const a_ptr0_base = A + (y * lda);
const unsigned long ldab = lda * sizeof(__fp16);
__fp16 *c_ptr0 = C + (y * ldc);
+ rows_to_compute = M-y;
+ if (rows_to_compute > 4) {
+ if (rows_to_compute % 4) {
+ rows_to_compute = 4 - 1;
+ } else {
+ rows_to_compute = 4;
+ }
+ }
+
for (int x0=0; x0<N; x0+=(4 * get_vector_length<__fp16>())) {
const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<__fp16>()));
long loops = loops_count;
@@ -78,7 +89,7 @@ void sve_hybrid_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, __fp16
const unsigned long ldcb = ldc * sizeof(__fp16);
const __fp16 *biasptr = bias ? bias+x0 : nullbias;
- switch(M-y) {
+ switch(rows_to_compute) {
case 1:
__asm __volatile (
"whilelt p6.h, %[temp], %[leftovers]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp
index 8e3c17917b..28e00305f7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -78,7 +78,10 @@ public:
// Default to the generic kernel
kern_type kernel=sve_hybrid_fp32_mla_4VLx4;
- hybrid_fp32_mla_4VLx4(const CPUInfo *ci) { UNUSED(ci); }
+ hybrid_fp32_mla_4VLx4(const CPUInfo *)
+ {
+
+ }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp
index 855d27a151..6b55959e2a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,12 +61,23 @@ void sve_hybrid_fp32_mla_4VLx4(const float *A, int lda, const float *B, float *C
break;
}
- for (int y=0; y<M; y+=4) {
+ int rows_to_compute;
+
+ for (int y=0; y<M; y+=rows_to_compute) {
const float * const a_ptr0_base = A + (y * lda);
const unsigned long ldab = lda * sizeof(float);
float *c_ptr0 = C + (y * ldc);
+ rows_to_compute = M-y;
+ if (rows_to_compute > 4) {
+ if (rows_to_compute % 4) {
+ rows_to_compute = 4 - 1;
+ } else {
+ rows_to_compute = 4;
+ }
+ }
+
for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) {
const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>()));
long loops = loops_count;
@@ -78,7 +89,7 @@ void sve_hybrid_fp32_mla_4VLx4(const float *A, int lda, const float *B, float *C
const unsigned long ldcb = ldc * sizeof(float);
const float *biasptr = bias ? bias+x0 : nullbias;
- switch(M-y) {
+ switch(rows_to_compute) {
case 1:
__asm __volatile (
"whilelt p6.s, %[temp], %[leftovers]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp
new file mode 100644
index 0000000000..4bdf4e1d80
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+#include "../std_transforms_sve.hpp"
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_hybrid_fp32_mmla_4VLx4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+
+class hybrid_fp32_mmla_4VLx4
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 8;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<float>() * 2;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 2;
+ }
+
+ static constexpr bool supports_append()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+ StdTransformsSVE<operand_type, result_type, 4, 4, 2> transforms = {};
+
+ // Default to the generic kernel
+ kern_type kernel=sve_hybrid_fp32_mmla_4VLx4;
+
+ hybrid_fp32_mmla_4VLx4(const CPUInfo *)
+ {
+
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
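The header added above fixes the geometry of the new FP32 MMLA hybrid kernel: blocks of up to 8 output rows (out_height), two SVE vectors of output columns (out_width) and a K unroll of 2. A minimal sketch of the resulting tile size, assuming a 256-bit SVE implementation where get_vector_length<float>() yields 8 lanes (the real value is set by the hardware vector length):

    // Illustrative only; the numbers depend on the actual SVE vector length.
    constexpr unsigned int lanes      = 8;          // 32-bit lanes in a 256-bit SVE vector
    constexpr unsigned int out_width  = lanes * 2;  // 16 columns, matching out_width() above
    constexpr unsigned int out_height = 8;          // rows per block, matching out_height() above
    // Each (y, x0) step of the generic.cpp below then covers an 8 x 16 tile of C,
    // advancing along K two elements at a time (k_unroll() == 2).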
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp
new file mode 100644
index 0000000000..d8ed307c4b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp
@@ -0,0 +1,3459 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+#include "arm_gemm.hpp"
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_hybrid_fp32_mmla_4VLx4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool append) {
+ const int K_stride = ((K + 1) / 2) * 2;
+ const long loops_count = ((K + 4) / 8) - 1;
+ K -= loops_count * 8;
+ const long regs_count = (K / 4) - 1;
+ K -= (regs_count + 1) * 4;
+ const long leftovers = K;
+ const long blocks_count = (K + 1) / 2;
+ float nullbias[128];
+ if (!append && !bias) {
+ memset(nullbias, 0, (2 * get_vector_length<float>() * sizeof(float)));
+ }
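    // --- Illustrative note, not part of the patch: worked examples of the K split above ---
    // Evaluating the expressions for K_stride, loops_count, regs_count, leftovers and
    // blocks_count with sample depths:
    //   K = 24 -> K_stride = 24, loops_count = 2, regs_count = 1, leftovers = 0, blocks_count = 0
    //   K = 11 -> K_stride = 12, loops_count = 0, regs_count = 1, leftovers = 3, blocks_count = 2
    // Each main-loop iteration of the assembly below consumes 8 elements of K (a_ptr0 advances
    // by 0x20 bytes); regs_count, blocks_count and leftovers drive the tail handling.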
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ const float * const minptr = &minval;
+ const float * const maxptr = &maxval;
+
+ switch(act.type)
+ {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ minval = 0.0f;
+ break;
+ }
+
+ int rows_to_compute;
+
+ for (int y=0; y<M; y+=rows_to_compute) {
+ const float * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(float);
+
+ float *c_ptr0 = C + (y * ldc);
+
+ rows_to_compute = M-y;
+ if (rows_to_compute > 8) {
+ if (rows_to_compute % 8) {
+ rows_to_compute = 8 - 1;
+ } else {
+ rows_to_compute = 8;
+ }
+ }
+
+ for (int x0=0; x0<N; x0+=(2 * get_vector_length<float>())) {
+ const long width = std::min((unsigned long)N-x0, (2 * get_vector_length<float>()));
+ long loops = loops_count;
+ long regs = regs_count;
+ long temp = 0;
+ long blocks = blocks_count;
+ const float *a_ptr0 = a_ptr0_base;
+ const float *b_ptr0 = B + (K_stride * x0);
+ const unsigned long ldcb = ldc * sizeof(float);
+ const float *biasptr = bias ? bias+x0 : nullbias;
+
+ switch(rows_to_compute) {
+ case 1:
+ __asm __volatile (
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "cbnz %[append], 1f\n"
+ "mov z1.s, #0\n"
+ "ld1w z15.s, p0/z, [%[biasptr]]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "zip1 z16.s, z15.s, z15.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "zip2 z17.s, z15.s, z15.s\n"
+ "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "zip1 z18.s, z15.s, z15.s\n"
+ "zip2 z19.s, z15.s, z15.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "mov z14.s, #0\n"
+ "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
+ "mov z1.s, #0\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "zip1 z16.s, z13.s, z14.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "zip2 z17.s, z13.s, z14.s\n"
+ "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ "mov z14.s, #0\n"
+ "zip1 z18.s, z13.s, z14.s\n"
+ "zip2 z19.s, z13.s, z14.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z5.s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "trn2 z8.d, z4.d, z5.d\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "trn1 z0.d, z4.d, z5.d\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "mov z1.s, #0\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z5.s, #0\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "trn2 z8.d, z4.d, z5.d\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "trn1 z0.d, z4.d, z5.d\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "mov z1.s, #0\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "addvl %[b_ptr0], %[b_ptr0], #-4\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "addvl %[a_ptr0], %[a_ptr0], #2\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ "cbz %[blocks], 5f\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "b.eq 5f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "b 5f\n"
+ "4:\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z5.s, #0\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "addvl %[a_ptr0], %[a_ptr0], #1\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "trn1 z0.d, z4.d, z5.d\n"
+ "cbz %[blocks], 5f\n"
+ "trn2 z8.d, z4.d, z5.d\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "b.eq 5f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "5:\n"
+ "ld1rw z14.s, p7/z, [%[minptr]]\n"
+ "ld1rw z15.s, p7/z, [%[maxptr]]\n"
+ "fmax z16.s, p7/m, z16.s, z14.s\n"
+ "fmax z17.s, p7/m, z17.s, z14.s\n"
+ "fmax z18.s, p7/m, z18.s, z14.s\n"
+ "fmax z19.s, p7/m, z19.s, z14.s\n"
+ "fmin z16.s, p7/m, z16.s, z15.s\n"
+ "fmin z17.s, p7/m, z17.s, z15.s\n"
+ "fmin z18.s, p7/m, z18.s, z15.s\n"
+ "fmin z19.s, p7/m, z19.s, z15.s\n"
+ "uzp1 z0.s, z16.s, z17.s\n"
+ "uzp1 z1.s, z18.s, z19.s\n"
+ "st1w z0.s, p0, [%[c_ptr0]]\n"
+ "st1w z1.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "cbnz %[append], 1f\n"
+ "ld1w z15.s, p0/z, [%[biasptr]]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip1 z16.s, z15.s, z15.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "zip2 z17.s, z15.s, z15.s\n"
+ "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "zip1 z18.s, z15.s, z15.s\n"
+ "zip2 z19.s, z15.s, z15.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z14.s, p0/z, [c_ptr1]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip1 z16.s, z13.s, z14.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "zip2 z17.s, z13.s, z14.s\n"
+ "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "zip1 z18.s, z13.s, z14.s\n"
+ "zip2 z19.s, z13.s, z14.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "trn1 z0.d, z4.d, z5.d\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "trn2 z8.d, z4.d, z5.d\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "trn1 z0.d, z4.d, z5.d\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "trn2 z8.d, z4.d, z5.d\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "addvl a_ptr1, a_ptr1, #2\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #-4\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "addvl %[a_ptr0], %[a_ptr0], #2\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ "cbz %[blocks], 5f\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "b.eq 5f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "b 5f\n"
+ "4:\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ "addvl %[a_ptr0], %[a_ptr0], #1\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "addvl a_ptr1, a_ptr1, #1\n"
+ "trn1 z0.d, z4.d, z5.d\n"
+ "cbz %[blocks], 5f\n"
+ "trn2 z8.d, z4.d, z5.d\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "b.eq 5f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "5:\n"
+ "ld1rw z14.s, p7/z, [%[minptr]]\n"
+ "ld1rw z15.s, p7/z, [%[maxptr]]\n"
+ "fmax z16.s, p7/m, z16.s, z14.s\n"
+ "fmax z17.s, p7/m, z17.s, z14.s\n"
+ "fmax z18.s, p7/m, z18.s, z14.s\n"
+ "fmax z19.s, p7/m, z19.s, z14.s\n"
+ "fmin z16.s, p7/m, z16.s, z15.s\n"
+ "fmin z17.s, p7/m, z17.s, z15.s\n"
+ "fmin z18.s, p7/m, z18.s, z15.s\n"
+ "fmin z19.s, p7/m, z19.s, z15.s\n"
+ "uzp1 z0.s, z16.s, z17.s\n"
+ "uzp2 z1.s, z16.s, z17.s\n"
+ "uzp1 z2.s, z18.s, z19.s\n"
+ "uzp2 z3.s, z18.s, z19.s\n"
+ "st1w z0.s, p0, [%[c_ptr0]]\n"
+ "st1w z1.s, p0, [c_ptr1]\n"
+ "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #2\n"
+ "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+ );
+ break;
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "cbnz %[append], 1f\n"
+ "mov z3.s, #0\n"
+ "ld1w z15.s, p0/z, [%[biasptr]]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip1 z16.s, z15.s, z15.s\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "zip2 z17.s, z15.s, z15.s\n"
+ "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip1 z18.s, z15.s, z15.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "zip2 z19.s, z15.s, z15.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "trn1 z9.d, z2.d, z3.d\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ "mov z20.d, z16.d\n"
+ "mov z21.d, z17.d\n"
+ "mov z22.d, z18.d\n"
+ "mov z23.d, z19.d\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "mov z3.s, #0\n"
+ "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z14.s, p0/z, [c_ptr1]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip1 z16.s, z13.s, z14.s\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "zip2 z17.s, z13.s, z14.s\n"
+ "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "trn1 z9.d, z2.d, z3.d\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "zip1 z18.s, z13.s, z14.s\n"
+ "zip2 z19.s, z13.s, z14.s\n"
+ "ld1w z13.s, p0/z, [c_ptr2]\n"
+ "mov z14.s, #0\n"
+ "zip1 z20.s, z13.s, z14.s\n"
+ "zip2 z21.s, z13.s, z14.s\n"
+ "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "mov z14.s, #0\n"
+ "zip1 z22.s, z13.s, z14.s\n"
+ "zip2 z23.s, z13.s, z14.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "trn2 z1.d, z2.d, z3.d\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "trn2 z8.d, z4.d, z5.d\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z7.s, #0\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ "trn2 z9.d, z6.d, z7.d\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "trn1 z0.d, z4.d, z5.d\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "trn1 z1.d, z6.d, z7.d\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "mov z3.s, #0\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "trn1 z9.d, z2.d, z3.d\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "trn2 z1.d, z2.d, z3.d\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "addvl a_ptr2, a_ptr2, #2\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "trn2 z8.d, z4.d, z5.d\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z7.s, #0\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ "trn2 z9.d, z6.d, z7.d\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "trn1 z0.d, z4.d, z5.d\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "trn1 z1.d, z6.d, z7.d\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "mov z3.s, #0\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "addvl %[b_ptr0], %[b_ptr0], #-4\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "addvl %[a_ptr0], %[a_ptr0], #2\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "addvl a_ptr1, a_ptr1, #2\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "trn1 z9.d, z2.d, z3.d\n"
+ "cbz %[blocks], 5f\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "trn2 z1.d, z2.d, z3.d\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "b.eq 5f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "b 5f\n"
+ "4:\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "trn2 z1.d, z2.d, z3.d\n"
+ "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "addvl %[a_ptr0], %[a_ptr0], #1\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "addvl a_ptr1, a_ptr1, #1\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "addvl a_ptr2, a_ptr2, #1\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z7.s, #0\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "trn1 z0.d, z4.d, z5.d\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "trn1 z1.d, z6.d, z7.d\n"
+ "cbz %[blocks], 5f\n"
+ "trn2 z9.d, z6.d, z7.d\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "trn2 z8.d, z4.d, z5.d\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "b.eq 5f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "5:\n"
+ "ld1rw z14.s, p7/z, [%[minptr]]\n"
+ "ld1rw z15.s, p7/z, [%[maxptr]]\n"
+ "fmax z16.s, p7/m, z16.s, z14.s\n"
+ "fmax z17.s, p7/m, z17.s, z14.s\n"
+ "fmax z18.s, p7/m, z18.s, z14.s\n"
+ "fmax z19.s, p7/m, z19.s, z14.s\n"
+ "fmin z16.s, p7/m, z16.s, z15.s\n"
+ "fmin z17.s, p7/m, z17.s, z15.s\n"
+ "fmin z18.s, p7/m, z18.s, z15.s\n"
+ "fmin z19.s, p7/m, z19.s, z15.s\n"
+ "fmax z20.s, p7/m, z20.s, z14.s\n"
+ "uzp1 z0.s, z16.s, z17.s\n"
+ "uzp2 z1.s, z16.s, z17.s\n"
+ "uzp1 z2.s, z18.s, z19.s\n"
+ "uzp2 z3.s, z18.s, z19.s\n"
+ "st1w z0.s, p0, [%[c_ptr0]]\n"
+ "fmin z20.s, p7/m, z20.s, z15.s\n"
+ "fmax z21.s, p7/m, z21.s, z14.s\n"
+ "fmax z22.s, p7/m, z22.s, z14.s\n"
+ "st1w z1.s, p0, [c_ptr1]\n"
+ "fmax z23.s, p7/m, z23.s, z14.s\n"
+ "fmin z21.s, p7/m, z21.s, z15.s\n"
+ "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "fmin z22.s, p7/m, z22.s, z15.s\n"
+ "addvl %[c_ptr0], %[c_ptr0], #2\n"
+ "fmin z23.s, p7/m, z23.s, z15.s\n"
+ "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "uzp1 z4.s, z20.s, z21.s\n"
+ "uzp1 z5.s, z22.s, z23.s\n"
+ "st1w z4.s, p0, [c_ptr2]\n"
+ "st1w z5.s, p1, [c_ptr2, #1, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "cbnz %[append], 1f\n"
+ "ld1w z15.s, p0/z, [%[biasptr]]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip1 z16.s, z15.s, z15.s\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "zip2 z17.s, z15.s, z15.s\n"
+ "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip1 z18.s, z15.s, z15.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "zip2 z19.s, z15.s, z15.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "trn1 z9.d, z2.d, z3.d\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z20.d, z16.d\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "mov z21.d, z17.d\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ "mov z22.d, z18.d\n"
+ "mov z23.d, z19.d\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z14.s, p0/z, [c_ptr1]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip1 z16.s, z13.s, z14.s\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "zip2 z17.s, z13.s, z14.s\n"
+ "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "zip1 z18.s, z13.s, z14.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "zip2 z19.s, z13.s, z14.s\n"
+ "ld1w z13.s, p0/z, [c_ptr2]\n"
+ "trn1 z9.d, z2.d, z3.d\n"
+ "ld1w z14.s, p0/z, [c_ptr3]\n"
+ "zip1 z20.s, z13.s, z14.s\n"
+ "zip2 z21.s, z13.s, z14.s\n"
+ "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "zip1 z22.s, z13.s, z14.s\n"
+ "zip2 z23.s, z13.s, z14.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "trn2 z1.d, z2.d, z3.d\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "trn2 z8.d, z4.d, z5.d\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "trn2 z9.d, z6.d, z7.d\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "trn1 z0.d, z4.d, z5.d\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "trn1 z1.d, z6.d, z7.d\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "trn1 z9.d, z2.d, z3.d\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "trn2 z1.d, z2.d, z3.d\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ "trn2 z8.d, z4.d, z5.d\n"
+ "addvl a_ptr2, a_ptr2, #2\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "trn2 z9.d, z6.d, z7.d\n"
+ "addvl a_ptr3, a_ptr3, #2\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "trn1 z0.d, z4.d, z5.d\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "trn1 z1.d, z6.d, z7.d\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "addvl %[b_ptr0], %[b_ptr0], #-4\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "addvl %[a_ptr0], %[a_ptr0], #2\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "addvl a_ptr1, a_ptr1, #2\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "trn1 z9.d, z2.d, z3.d\n"
+ "cbz %[blocks], 5f\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "trn2 z1.d, z2.d, z3.d\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "b.eq 5f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "b 5f\n"
+ "4:\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "trn2 z1.d, z2.d, z3.d\n"
+ "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "ld1rqw z7.s, p6/z, [a_ptr3]\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "addvl %[a_ptr0], %[a_ptr0], #1\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "addvl a_ptr1, a_ptr1, #1\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ "addvl a_ptr2, a_ptr2, #1\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ "addvl a_ptr3, a_ptr3, #1\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "trn1 z0.d, z4.d, z5.d\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "trn1 z1.d, z6.d, z7.d\n"
+ "cbz %[blocks], 5f\n"
+ "trn2 z9.d, z6.d, z7.d\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "trn2 z8.d, z4.d, z5.d\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "b.eq 5f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "5:\n"
+ "ld1rw z14.s, p7/z, [%[minptr]]\n"
+ "ld1rw z15.s, p7/z, [%[maxptr]]\n"
+ "fmax z16.s, p7/m, z16.s, z14.s\n"
+ "fmax z17.s, p7/m, z17.s, z14.s\n"
+ "fmax z18.s, p7/m, z18.s, z14.s\n"
+ "fmax z19.s, p7/m, z19.s, z14.s\n"
+ "fmin z16.s, p7/m, z16.s, z15.s\n"
+ "fmin z17.s, p7/m, z17.s, z15.s\n"
+ "fmin z18.s, p7/m, z18.s, z15.s\n"
+ "fmin z19.s, p7/m, z19.s, z15.s\n"
+ "fmax z20.s, p7/m, z20.s, z14.s\n"
+ "uzp1 z0.s, z16.s, z17.s\n"
+ "uzp2 z1.s, z16.s, z17.s\n"
+ "uzp1 z2.s, z18.s, z19.s\n"
+ "uzp2 z3.s, z18.s, z19.s\n"
+ "st1w z0.s, p0, [%[c_ptr0]]\n"
+ "fmin z20.s, p7/m, z20.s, z15.s\n"
+ "fmax z21.s, p7/m, z21.s, z14.s\n"
+ "fmax z22.s, p7/m, z22.s, z14.s\n"
+ "st1w z1.s, p0, [c_ptr1]\n"
+ "fmax z23.s, p7/m, z23.s, z14.s\n"
+ "fmin z21.s, p7/m, z21.s, z15.s\n"
+ "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "fmin z22.s, p7/m, z22.s, z15.s\n"
+ "addvl %[c_ptr0], %[c_ptr0], #2\n"
+ "fmin z23.s, p7/m, z23.s, z15.s\n"
+ "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "uzp1 z4.s, z20.s, z21.s\n"
+ "uzp2 z5.s, z20.s, z21.s\n"
+ "uzp1 z6.s, z22.s, z23.s\n"
+ "st1w z4.s, p0, [c_ptr2]\n"
+ "uzp2 z7.s, z22.s, z23.s\n"
+ "st1w z5.s, p0, [c_ptr3]\n"
+ "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ case 5:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "a_ptr4 .req X3\n"
+ "c_ptr1 .req X4\n"
+ "c_ptr2 .req X5\n"
+ "c_ptr3 .req X6\n"
+ "c_ptr4 .req X7\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "add a_ptr4, a_ptr3, %[lda]\n"
+ "add c_ptr4, c_ptr3, %[ldc]\n"
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "cbnz %[append], 1f\n"
+ "mov z5.s, #0\n"
+ "ld1w z15.s, p0/z, [%[biasptr]]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip1 z16.s, z15.s, z15.s\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "zip2 z17.s, z15.s, z15.s\n"
+ "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "ld1rqw z4.s, p7/z, [a_ptr4]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip1 z18.s, z15.s, z15.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "zip2 z19.s, z15.s, z15.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "trn1 z9.d, z2.d, z3.d\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "trn1 z10.d, z4.d, z5.d\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z20.d, z16.d\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "mov z21.d, z17.d\n"
+ "add a_ptr4, a_ptr4, #0x10\n"
+ "mov z22.d, z18.d\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ "mov z23.d, z19.d\n"
+ "mov z24.d, z16.d\n"
+ "mov z25.d, z17.d\n"
+ "mov z26.d, z18.d\n"
+ "mov z27.d, z19.d\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "mov z5.s, #0\n"
+ "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z14.s, p0/z, [c_ptr1]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip1 z16.s, z13.s, z14.s\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "zip2 z17.s, z13.s, z14.s\n"
+ "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ld1rqw z4.s, p7/z, [a_ptr4]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "zip1 z18.s, z13.s, z14.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "zip2 z19.s, z13.s, z14.s\n"
+ "ld1w z13.s, p0/z, [c_ptr2]\n"
+ "trn1 z9.d, z2.d, z3.d\n"
+ "ld1w z14.s, p0/z, [c_ptr3]\n"
+ "trn1 z10.d, z4.d, z5.d\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr4, a_ptr4, #0x10\n"
+ "zip1 z20.s, z13.s, z14.s\n"
+ "zip2 z21.s, z13.s, z14.s\n"
+ "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "zip1 z22.s, z13.s, z14.s\n"
+ "zip2 z23.s, z13.s, z14.s\n"
+ "ld1w z13.s, p0/z, [c_ptr4]\n"
+ "mov z14.s, #0\n"
+ "zip1 z24.s, z13.s, z14.s\n"
+ "zip2 z25.s, z13.s, z14.s\n"
+ "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
+ "mov z14.s, #0\n"
+ "zip1 z26.s, z13.s, z14.s\n"
+ "zip2 z27.s, z13.s, z14.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "trn2 z1.d, z2.d, z3.d\n"
+ "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+ "trn2 z2.d, z4.d, z5.d\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "ld1rqw z8.s, p7/z, [a_ptr4]\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
+ ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z9.s, #0\n"
+ "add a_ptr4, a_ptr4, #0x20\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ "trn2 z10.d, z8.d, z9.d\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "trn1 z0.d, z4.d, z5.d\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "trn1 z1.d, z6.d, z7.d\n"
+ ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "trn1 z2.d, z8.d, z9.d\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "trn2 z9.d, z6.d, z7.d\n"
+ "trn2 z8.d, z4.d, z5.d\n"
+ "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+ ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "mov z5.s, #0\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "trn1 z9.d, z2.d, z3.d\n"
+ ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "trn1 z10.d, z4.d, z5.d\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "trn2 z1.d, z2.d, z3.d\n"
+ "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+ "trn2 z2.d, z4.d, z5.d\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "addvl a_ptr3, a_ptr3, #2\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "ld1rqw z8.s, p7/z, [a_ptr4]\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z9.s, #0\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ "trn2 z10.d, z8.d, z9.d\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "trn1 z0.d, z4.d, z5.d\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "trn1 z1.d, z6.d, z7.d\n"
+ ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "trn1 z2.d, z8.d, z9.d\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "trn2 z9.d, z6.d, z7.d\n"
+ "trn2 z8.d, z4.d, z5.d\n"
+ "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "addvl a_ptr4, a_ptr4, #2\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ "addvl %[a_ptr0], %[a_ptr0], #2\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "mov z5.s, #0\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "addvl %[b_ptr0], %[b_ptr0], #-4\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "addvl a_ptr1, a_ptr1, #2\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "addvl a_ptr2, a_ptr2, #2\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "trn1 z9.d, z2.d, z3.d\n"
+ ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+ ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+ ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+ ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+ "trn1 z10.d, z4.d, z5.d\n"
+ "cbz %[blocks], 5f\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "trn2 z1.d, z2.d, z3.d\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "trn2 z2.d, z4.d, z5.d\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+ ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+ ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+ ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+ "b.eq 5f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+ ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+ ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+ ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+ "b 5f\n"
+ "4:\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+ "trn2 z1.d, z2.d, z3.d\n"
+ "ld1rqw z7.s, p6/z, [a_ptr3]\n"
+ "trn2 z2.d, z4.d, z5.d\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "addvl %[a_ptr0], %[a_ptr0], #1\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "addvl a_ptr1, a_ptr1, #1\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "ld1rqw z8.s, p6/z, [a_ptr4]\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ "addvl a_ptr2, a_ptr2, #1\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ "addvl a_ptr3, a_ptr3, #1\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ "addvl a_ptr4, a_ptr4, #1\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z9.s, #0\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "trn1 z0.d, z4.d, z5.d\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "trn1 z1.d, z6.d, z7.d\n"
+ ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+ ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+ ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+ ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+ "trn1 z2.d, z8.d, z9.d\n"
+ "cbz %[blocks], 5f\n"
+ "trn2 z10.d, z8.d, z9.d\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "trn2 z9.d, z6.d, z7.d\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "trn2 z8.d, z4.d, z5.d\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+ ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+ ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+ ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+ "b.eq 5f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+ ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+ ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+ ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+ "5:\n"
+ "ld1rw z14.s, p7/z, [%[minptr]]\n"
+ "ld1rw z15.s, p7/z, [%[maxptr]]\n"
+ "fmax z16.s, p7/m, z16.s, z14.s\n"
+ "fmax z17.s, p7/m, z17.s, z14.s\n"
+ "fmax z18.s, p7/m, z18.s, z14.s\n"
+ "fmax z19.s, p7/m, z19.s, z14.s\n"
+ "fmin z16.s, p7/m, z16.s, z15.s\n"
+ "fmin z17.s, p7/m, z17.s, z15.s\n"
+ "fmin z18.s, p7/m, z18.s, z15.s\n"
+ "fmin z19.s, p7/m, z19.s, z15.s\n"
+ "fmax z20.s, p7/m, z20.s, z14.s\n"
+ "uzp1 z0.s, z16.s, z17.s\n"
+ "uzp2 z1.s, z16.s, z17.s\n"
+ "uzp1 z2.s, z18.s, z19.s\n"
+ "uzp2 z3.s, z18.s, z19.s\n"
+ "st1w z0.s, p0, [%[c_ptr0]]\n"
+ "fmin z20.s, p7/m, z20.s, z15.s\n"
+ "fmax z21.s, p7/m, z21.s, z14.s\n"
+ "fmax z22.s, p7/m, z22.s, z14.s\n"
+ "st1w z1.s, p0, [c_ptr1]\n"
+ "fmax z23.s, p7/m, z23.s, z14.s\n"
+ "fmax z24.s, p7/m, z24.s, z14.s\n"
+ "fmin z21.s, p7/m, z21.s, z15.s\n"
+ "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "fmin z22.s, p7/m, z22.s, z15.s\n"
+ "addvl %[c_ptr0], %[c_ptr0], #2\n"
+ "fmin z23.s, p7/m, z23.s, z15.s\n"
+ "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "uzp1 z4.s, z20.s, z21.s\n"
+ "uzp2 z5.s, z20.s, z21.s\n"
+ "fmin z24.s, p7/m, z24.s, z15.s\n"
+ "uzp1 z6.s, z22.s, z23.s\n"
+ "st1w z4.s, p0, [c_ptr2]\n"
+ "uzp2 z7.s, z22.s, z23.s\n"
+ "fmax z25.s, p7/m, z25.s, z14.s\n"
+ "fmax z26.s, p7/m, z26.s, z14.s\n"
+ "st1w z5.s, p0, [c_ptr3]\n"
+ "fmax z27.s, p7/m, z27.s, z14.s\n"
+ "fmin z25.s, p7/m, z25.s, z15.s\n"
+ "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "fmin z26.s, p7/m, z26.s, z15.s\n"
+ "fmin z27.s, p7/m, z27.s, z15.s\n"
+ "uzp1 z8.s, z24.s, z25.s\n"
+ "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
+ "uzp1 z9.s, z26.s, z27.s\n"
+ "st1w z8.s, p0, [c_ptr4]\n"
+ "st1w z9.s, p1, [c_ptr4, #1, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq a_ptr4\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ ".unreq c_ptr4\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
+ );
+ break;
+ case 6:
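+                // 6-row variant: rows are interleaved in pairs with trn1/trn2 so each accumulator
+                // (z16-z27) holds a 2x2 FMMLA output tile, de-interleaved with uzp1/uzp2 before the
+                // clamped (fmax/fmin) stores.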
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "a_ptr4 .req X3\n"
+ "a_ptr5 .req X4\n"
+ "c_ptr1 .req X5\n"
+ "c_ptr2 .req X6\n"
+ "c_ptr3 .req X7\n"
+ "c_ptr4 .req X8\n"
+ "c_ptr5 .req X9\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "add a_ptr4, a_ptr3, %[lda]\n"
+ "add c_ptr4, c_ptr3, %[ldc]\n"
+ "add a_ptr5, a_ptr4, %[lda]\n"
+ "add c_ptr5, c_ptr4, %[ldc]\n"
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "cbnz %[append], 1f\n"
+ "ld1w z15.s, p0/z, [%[biasptr]]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip1 z16.s, z15.s, z15.s\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "zip2 z17.s, z15.s, z15.s\n"
+ "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "ld1rqw z4.s, p7/z, [a_ptr4]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip1 z18.s, z15.s, z15.s\n"
+ "ld1rqw z5.s, p7/z, [a_ptr5]\n"
+ "zip2 z19.s, z15.s, z15.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "trn1 z9.d, z2.d, z3.d\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z20.d, z16.d\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "trn1 z10.d, z4.d, z5.d\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z21.d, z17.d\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "mov z22.d, z18.d\n"
+ "add a_ptr4, a_ptr4, #0x10\n"
+ "mov z23.d, z19.d\n"
+ "add a_ptr5, a_ptr5, #0x10\n"
+ "mov z24.d, z16.d\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ "mov z25.d, z17.d\n"
+ "mov z26.d, z18.d\n"
+ "mov z27.d, z19.d\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z14.s, p0/z, [c_ptr1]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip1 z16.s, z13.s, z14.s\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "zip2 z17.s, z13.s, z14.s\n"
+ "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ld1rqw z4.s, p7/z, [a_ptr4]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "zip1 z18.s, z13.s, z14.s\n"
+ "ld1rqw z5.s, p7/z, [a_ptr5]\n"
+ "zip2 z19.s, z13.s, z14.s\n"
+ "ld1w z13.s, p0/z, [c_ptr2]\n"
+ "trn1 z9.d, z2.d, z3.d\n"
+ "ld1w z14.s, p0/z, [c_ptr3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr4, a_ptr4, #0x10\n"
+ "trn1 z10.d, z4.d, z5.d\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "zip1 z20.s, z13.s, z14.s\n"
+ "add a_ptr5, a_ptr5, #0x10\n"
+ "zip2 z21.s, z13.s, z14.s\n"
+ "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "zip1 z22.s, z13.s, z14.s\n"
+ "zip2 z23.s, z13.s, z14.s\n"
+ "ld1w z13.s, p0/z, [c_ptr4]\n"
+ "ld1w z14.s, p0/z, [c_ptr5]\n"
+ "zip1 z24.s, z13.s, z14.s\n"
+ "zip2 z25.s, z13.s, z14.s\n"
+ "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
+ "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n"
+ "zip1 z26.s, z13.s, z14.s\n"
+ "zip2 z27.s, z13.s, z14.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "trn2 z1.d, z2.d, z3.d\n"
+ "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+ "trn2 z2.d, z4.d, z5.d\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "ld1rqw z8.s, p7/z, [a_ptr4]\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "ld1rqw z9.s, p7/z, [a_ptr5]\n"
+ ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "trn2 z10.d, z8.d, z9.d\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "add a_ptr4, a_ptr4, #0x20\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ "add a_ptr5, a_ptr5, #0x20\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "trn1 z0.d, z4.d, z5.d\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "trn1 z1.d, z6.d, z7.d\n"
+ ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "trn1 z2.d, z8.d, z9.d\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "trn2 z9.d, z6.d, z7.d\n"
+ "trn2 z8.d, z4.d, z5.d\n"
+ "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "ld1rqw z5.s, p7/z, [a_ptr5, #-0x10]\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+ ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "trn1 z9.d, z2.d, z3.d\n"
+ ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "trn1 z10.d, z4.d, z5.d\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "trn2 z1.d, z2.d, z3.d\n"
+ "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+ "trn2 z2.d, z4.d, z5.d\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "addvl a_ptr3, a_ptr3, #2\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "ld1rqw z8.s, p7/z, [a_ptr4]\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "ld1rqw z9.s, p7/z, [a_ptr5]\n"
+ ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "trn2 z10.d, z8.d, z9.d\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "trn1 z0.d, z4.d, z5.d\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "trn1 z1.d, z6.d, z7.d\n"
+ ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "trn1 z2.d, z8.d, z9.d\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "trn2 z9.d, z6.d, z7.d\n"
+ "trn2 z8.d, z4.d, z5.d\n"
+ "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ "addvl a_ptr4, a_ptr4, #2\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ "addvl a_ptr5, a_ptr5, #2\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ "addvl %[a_ptr0], %[a_ptr0], #2\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "addvl %[b_ptr0], %[b_ptr0], #-4\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "addvl a_ptr1, a_ptr1, #2\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "addvl a_ptr2, a_ptr2, #2\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "trn1 z9.d, z2.d, z3.d\n"
+ ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+ ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+ ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+ ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+ "trn1 z10.d, z4.d, z5.d\n"
+ "cbz %[blocks], 5f\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "trn2 z1.d, z2.d, z3.d\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "trn2 z2.d, z4.d, z5.d\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+ ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+ ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+ ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+ "b.eq 5f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+ ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+ ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+ ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+ "b 5f\n"
+ "4:\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+ "trn2 z1.d, z2.d, z3.d\n"
+ "ld1rqw z7.s, p6/z, [a_ptr3]\n"
+ "trn2 z2.d, z4.d, z5.d\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "addvl %[a_ptr0], %[a_ptr0], #1\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "addvl a_ptr1, a_ptr1, #1\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "ld1rqw z8.s, p6/z, [a_ptr4]\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ "addvl a_ptr2, a_ptr2, #1\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ "addvl a_ptr3, a_ptr3, #1\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ "addvl a_ptr4, a_ptr4, #1\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "ld1rqw z9.s, p6/z, [a_ptr5]\n"
+ ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ "addvl a_ptr5, a_ptr5, #1\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "trn1 z0.d, z4.d, z5.d\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "trn1 z1.d, z6.d, z7.d\n"
+ ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+ ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+ ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+ ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+ "trn1 z2.d, z8.d, z9.d\n"
+ "cbz %[blocks], 5f\n"
+ "trn2 z10.d, z8.d, z9.d\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "trn2 z9.d, z6.d, z7.d\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "trn2 z8.d, z4.d, z5.d\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+ ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+ ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+ ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+ "b.eq 5f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+ ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+ ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+ ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+ "5:\n"
+ "ld1rw z14.s, p7/z, [%[minptr]]\n"
+ "ld1rw z15.s, p7/z, [%[maxptr]]\n"
+ "fmax z16.s, p7/m, z16.s, z14.s\n"
+ "fmax z17.s, p7/m, z17.s, z14.s\n"
+ "fmax z18.s, p7/m, z18.s, z14.s\n"
+ "fmax z19.s, p7/m, z19.s, z14.s\n"
+ "fmin z16.s, p7/m, z16.s, z15.s\n"
+ "fmin z17.s, p7/m, z17.s, z15.s\n"
+ "fmin z18.s, p7/m, z18.s, z15.s\n"
+ "fmin z19.s, p7/m, z19.s, z15.s\n"
+ "fmax z20.s, p7/m, z20.s, z14.s\n"
+ "uzp1 z0.s, z16.s, z17.s\n"
+ "uzp2 z1.s, z16.s, z17.s\n"
+ "uzp1 z2.s, z18.s, z19.s\n"
+ "uzp2 z3.s, z18.s, z19.s\n"
+ "st1w z0.s, p0, [%[c_ptr0]]\n"
+ "fmin z20.s, p7/m, z20.s, z15.s\n"
+ "fmax z21.s, p7/m, z21.s, z14.s\n"
+ "fmax z22.s, p7/m, z22.s, z14.s\n"
+ "st1w z1.s, p0, [c_ptr1]\n"
+ "fmax z23.s, p7/m, z23.s, z14.s\n"
+ "fmax z24.s, p7/m, z24.s, z14.s\n"
+ "fmin z21.s, p7/m, z21.s, z15.s\n"
+ "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "fmin z22.s, p7/m, z22.s, z15.s\n"
+ "addvl %[c_ptr0], %[c_ptr0], #2\n"
+ "fmin z23.s, p7/m, z23.s, z15.s\n"
+ "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "uzp1 z4.s, z20.s, z21.s\n"
+ "uzp2 z5.s, z20.s, z21.s\n"
+ "fmin z24.s, p7/m, z24.s, z15.s\n"
+ "uzp1 z6.s, z22.s, z23.s\n"
+ "st1w z4.s, p0, [c_ptr2]\n"
+ "uzp2 z7.s, z22.s, z23.s\n"
+ "fmax z25.s, p7/m, z25.s, z14.s\n"
+ "fmax z26.s, p7/m, z26.s, z14.s\n"
+ "st1w z5.s, p0, [c_ptr3]\n"
+ "fmax z27.s, p7/m, z27.s, z14.s\n"
+ "fmin z25.s, p7/m, z25.s, z15.s\n"
+ "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "fmin z26.s, p7/m, z26.s, z15.s\n"
+ "fmin z27.s, p7/m, z27.s, z15.s\n"
+ "uzp1 z8.s, z24.s, z25.s\n"
+ "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
+ "uzp2 z9.s, z24.s, z25.s\n"
+ "uzp1 z10.s, z26.s, z27.s\n"
+ "uzp2 z11.s, z26.s, z27.s\n"
+ "st1w z8.s, p0, [c_ptr4]\n"
+ "st1w z9.s, p0, [c_ptr5]\n"
+ "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n"
+ "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq a_ptr4\n"
+ ".unreq a_ptr5\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ ".unreq c_ptr4\n"
+ ".unreq c_ptr5\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
+ );
+ break;
+ case 7:
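+                // 7-row variant: row 6 is paired with a zeroed dummy row (z7/z11); all sixteen
+                // accumulators z16-z31 are in use, so no spare vector registers remain.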
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "a_ptr4 .req X3\n"
+ "a_ptr5 .req X4\n"
+ "a_ptr6 .req X5\n"
+ "c_ptr1 .req X6\n"
+ "c_ptr2 .req X7\n"
+ "c_ptr3 .req X8\n"
+ "c_ptr4 .req X9\n"
+ "c_ptr5 .req X10\n"
+ "c_ptr6 .req X11\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "add a_ptr4, a_ptr3, %[lda]\n"
+ "add c_ptr4, c_ptr3, %[ldc]\n"
+ "add a_ptr5, a_ptr4, %[lda]\n"
+ "add c_ptr5, c_ptr4, %[ldc]\n"
+ "add a_ptr6, a_ptr5, %[lda]\n"
+ "add c_ptr6, c_ptr5, %[ldc]\n"
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "cbnz %[append], 1f\n"
+ "mov z7.s, #0\n"
+ "ld1w z15.s, p0/z, [%[biasptr]]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip1 z16.s, z15.s, z15.s\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "zip2 z17.s, z15.s, z15.s\n"
+ "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "ld1rqw z4.s, p7/z, [a_ptr4]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip1 z18.s, z15.s, z15.s\n"
+ "ld1rqw z5.s, p7/z, [a_ptr5]\n"
+ "zip2 z19.s, z15.s, z15.s\n"
+ "ld1rqw z6.s, p7/z, [a_ptr6]\n"
+ "trn1 z9.d, z2.d, z3.d\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "mov z20.d, z16.d\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "trn1 z10.d, z4.d, z5.d\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "trn1 z11.d, z6.d, z7.d\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z21.d, z17.d\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "mov z22.d, z18.d\n"
+ "add a_ptr4, a_ptr4, #0x10\n"
+ "mov z23.d, z19.d\n"
+ "add a_ptr5, a_ptr5, #0x10\n"
+ "mov z24.d, z16.d\n"
+ "add a_ptr6, a_ptr6, #0x10\n"
+ "mov z25.d, z17.d\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ "mov z26.d, z18.d\n"
+ "mov z27.d, z19.d\n"
+ "mov z28.d, z16.d\n"
+ "mov z29.d, z17.d\n"
+ "mov z30.d, z18.d\n"
+ "mov z31.d, z19.d\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "mov z7.s, #0\n"
+ "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z14.s, p0/z, [c_ptr1]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip1 z16.s, z13.s, z14.s\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "zip2 z17.s, z13.s, z14.s\n"
+ "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ld1rqw z4.s, p7/z, [a_ptr4]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "zip1 z18.s, z13.s, z14.s\n"
+ "ld1rqw z5.s, p7/z, [a_ptr5]\n"
+ "zip2 z19.s, z13.s, z14.s\n"
+ "ld1w z13.s, p0/z, [c_ptr2]\n"
+ "trn1 z9.d, z2.d, z3.d\n"
+ "ld1w z14.s, p0/z, [c_ptr3]\n"
+ "ld1rqw z6.s, p7/z, [a_ptr6]\n"
+ "add a_ptr4, a_ptr4, #0x10\n"
+ "trn1 z10.d, z4.d, z5.d\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "zip1 z20.s, z13.s, z14.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "zip2 z21.s, z13.s, z14.s\n"
+ "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "trn1 z11.d, z6.d, z7.d\n"
+ "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "add a_ptr5, a_ptr5, #0x10\n"
+ "add a_ptr6, a_ptr6, #0x10\n"
+ "zip1 z22.s, z13.s, z14.s\n"
+ "zip2 z23.s, z13.s, z14.s\n"
+ "ld1w z13.s, p0/z, [c_ptr4]\n"
+ "ld1w z14.s, p0/z, [c_ptr5]\n"
+ "zip1 z24.s, z13.s, z14.s\n"
+ "zip2 z25.s, z13.s, z14.s\n"
+ "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
+ "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n"
+ "zip1 z26.s, z13.s, z14.s\n"
+ "zip2 z27.s, z13.s, z14.s\n"
+ "ld1w z13.s, p0/z, [c_ptr6]\n"
+ "mov z14.s, #0\n"
+ "zip1 z28.s, z13.s, z14.s\n"
+ "zip2 z29.s, z13.s, z14.s\n"
+ "ld1w z13.s, p1/z, [c_ptr6, #1, MUL VL]\n"
+ "mov z14.s, #0\n"
+ "zip1 z30.s, z13.s, z14.s\n"
+ "zip2 z31.s, z13.s, z14.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "trn2 z1.d, z2.d, z3.d\n"
+ "trn2 z2.d, z4.d, z5.d\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "trn2 z3.d, z6.d, z7.d\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "ld1rqw z8.s, p7/z, [a_ptr4]\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "ld1rqw z9.s, p7/z, [a_ptr5]\n"
+ ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+ "add a_ptr4, a_ptr4, #0x20\n"
+ ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+ "add a_ptr5, a_ptr5, #0x20\n"
+ ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+ ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+ "ld1rqw z10.s, p7/z, [a_ptr6]\n"
+ ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z11.s, #0\n"
+ "add a_ptr6, a_ptr6, #0x20\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "trn1 z0.d, z4.d, z5.d\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "trn1 z1.d, z6.d, z7.d\n"
+ ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+ ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+ ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+ ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+ "trn1 z2.d, z8.d, z9.d\n"
+ ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "trn1 z3.d, z10.d, z11.d\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "trn2 z11.d, z10.d, z11.d\n"
+ "trn2 z10.d, z8.d, z9.d\n"
+ "trn2 z9.d, z6.d, z7.d\n"
+ "ld1rqw z6.s, p7/z, [a_ptr6, #-0x10]\n"
+ "trn2 z8.d, z4.d, z5.d\n"
+ "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "ld1rqw z5.s, p7/z, [a_ptr5, #-0x10]\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+ ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+ ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+ ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+ ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
+ ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "mov z7.s, #0\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "trn1 z9.d, z2.d, z3.d\n"
+ ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+ ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+ ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+ ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+ "trn1 z10.d, z4.d, z5.d\n"
+ ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "trn1 z11.d, z6.d, z7.d\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "trn2 z1.d, z2.d, z3.d\n"
+ "trn2 z2.d, z4.d, z5.d\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "trn2 z3.d, z6.d, z7.d\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "ld1rqw z8.s, p7/z, [a_ptr4]\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "ld1rqw z9.s, p7/z, [a_ptr5]\n"
+ ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+ ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+ ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+ ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+ "ld1rqw z10.s, p7/z, [a_ptr6]\n"
+ ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z11.s, #0\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "trn1 z0.d, z4.d, z5.d\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "trn1 z1.d, z6.d, z7.d\n"
+ ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+ ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+ ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+ ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+ "trn1 z2.d, z8.d, z9.d\n"
+ ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "trn1 z3.d, z10.d, z11.d\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "trn2 z11.d, z10.d, z11.d\n"
+ "trn2 z10.d, z8.d, z9.d\n"
+ "trn2 z9.d, z6.d, z7.d\n"
+ "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n"
+ "trn2 z8.d, z4.d, z5.d\n"
+ "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ "addvl a_ptr4, a_ptr4, #2\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ "addvl a_ptr5, a_ptr5, #2\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ "addvl %[a_ptr0], %[a_ptr0], #2\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ "addvl a_ptr6, a_ptr6, #2\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+ "addvl a_ptr1, a_ptr1, #2\n"
+ ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+ ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+ ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "mov z7.s, #0\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "addvl %[b_ptr0], %[b_ptr0], #-4\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "addvl a_ptr2, a_ptr2, #2\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "addvl a_ptr3, a_ptr3, #2\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "trn1 z9.d, z2.d, z3.d\n"
+ ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+ ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+ ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+ ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+ "trn1 z10.d, z4.d, z5.d\n"
+ ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
+ ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
+ ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
+ ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
+ "trn1 z11.d, z6.d, z7.d\n"
+ "cbz %[blocks], 5f\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "trn2 z1.d, z2.d, z3.d\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "trn2 z2.d, z4.d, z5.d\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "trn2 z3.d, z6.d, z7.d\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+ ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+ ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+ ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+ ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
+ ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
+ ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
+ ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
+ "b.eq 5f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+ ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+ ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+ ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+ ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
+ ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
+ ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
+ ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
+ "b 5f\n"
+ "4:\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "trn2 z1.d, z2.d, z3.d\n"
+ "trn2 z2.d, z4.d, z5.d\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "trn2 z3.d, z6.d, z7.d\n"
+ "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "ld1rqw z7.s, p6/z, [a_ptr3]\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "addvl %[a_ptr0], %[a_ptr0], #1\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "ld1rqw z8.s, p6/z, [a_ptr4]\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ "addvl a_ptr1, a_ptr1, #1\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ "addvl a_ptr2, a_ptr2, #1\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ "addvl a_ptr3, a_ptr3, #1\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "ld1rqw z9.s, p6/z, [a_ptr5]\n"
+ ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+ "addvl a_ptr4, a_ptr4, #1\n"
+ ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+ "addvl a_ptr5, a_ptr5, #1\n"
+ ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+ ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+ "ld1rqw z10.s, p6/z, [a_ptr6]\n"
+ ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z11.s, #0\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "addvl a_ptr6, a_ptr6, #1\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "trn1 z0.d, z4.d, z5.d\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "trn1 z1.d, z6.d, z7.d\n"
+ ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+ ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+ ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+ ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+ "trn1 z2.d, z8.d, z9.d\n"
+ ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
+ ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
+ ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
+ ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
+ "trn1 z3.d, z10.d, z11.d\n"
+ "cbz %[blocks], 5f\n"
+ "trn2 z11.d, z10.d, z11.d\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "trn2 z10.d, z8.d, z9.d\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "trn2 z9.d, z6.d, z7.d\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "trn2 z8.d, z4.d, z5.d\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+ ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+ ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+ ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+ ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
+ ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
+ ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
+ ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
+ "b.eq 5f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+ ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+ ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+ ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+ ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
+ ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
+ ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
+ ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
+ "5:\n"
+ "ld1rw z14.s, p7/z, [%[minptr]]\n"
+ "ld1rw z15.s, p7/z, [%[maxptr]]\n"
+ "fmax z16.s, p7/m, z16.s, z14.s\n"
+ "fmax z17.s, p7/m, z17.s, z14.s\n"
+ "fmax z18.s, p7/m, z18.s, z14.s\n"
+ "fmax z19.s, p7/m, z19.s, z14.s\n"
+ "fmin z16.s, p7/m, z16.s, z15.s\n"
+ "fmin z17.s, p7/m, z17.s, z15.s\n"
+ "fmin z18.s, p7/m, z18.s, z15.s\n"
+ "fmin z19.s, p7/m, z19.s, z15.s\n"
+ "fmax z20.s, p7/m, z20.s, z14.s\n"
+ "uzp1 z0.s, z16.s, z17.s\n"
+ "uzp2 z1.s, z16.s, z17.s\n"
+ "uzp1 z2.s, z18.s, z19.s\n"
+ "uzp2 z3.s, z18.s, z19.s\n"
+ "st1w z0.s, p0, [%[c_ptr0]]\n"
+ "fmin z20.s, p7/m, z20.s, z15.s\n"
+ "fmax z21.s, p7/m, z21.s, z14.s\n"
+ "fmax z22.s, p7/m, z22.s, z14.s\n"
+ "st1w z1.s, p0, [c_ptr1]\n"
+ "fmax z23.s, p7/m, z23.s, z14.s\n"
+ "fmax z24.s, p7/m, z24.s, z14.s\n"
+ "fmin z21.s, p7/m, z21.s, z15.s\n"
+ "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "fmin z22.s, p7/m, z22.s, z15.s\n"
+ "addvl %[c_ptr0], %[c_ptr0], #2\n"
+ "fmin z23.s, p7/m, z23.s, z15.s\n"
+ "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "uzp1 z4.s, z20.s, z21.s\n"
+ "uzp2 z5.s, z20.s, z21.s\n"
+ "fmin z24.s, p7/m, z24.s, z15.s\n"
+ "uzp1 z6.s, z22.s, z23.s\n"
+ "st1w z4.s, p0, [c_ptr2]\n"
+ "uzp2 z7.s, z22.s, z23.s\n"
+ "fmax z25.s, p7/m, z25.s, z14.s\n"
+ "fmax z26.s, p7/m, z26.s, z14.s\n"
+ "st1w z5.s, p0, [c_ptr3]\n"
+ "fmax z27.s, p7/m, z27.s, z14.s\n"
+ "fmax z28.s, p7/m, z28.s, z14.s\n"
+ "fmin z25.s, p7/m, z25.s, z15.s\n"
+ "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "fmin z26.s, p7/m, z26.s, z15.s\n"
+ "fmin z27.s, p7/m, z27.s, z15.s\n"
+ "fmin z28.s, p7/m, z28.s, z15.s\n"
+ "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
+ "uzp1 z8.s, z24.s, z25.s\n"
+ "uzp2 z9.s, z24.s, z25.s\n"
+ "uzp1 z10.s, z26.s, z27.s\n"
+ "uzp2 z11.s, z26.s, z27.s\n"
+ "st1w z8.s, p0, [c_ptr4]\n"
+ "fmax z29.s, p7/m, z29.s, z14.s\n"
+ "fmax z30.s, p7/m, z30.s, z14.s\n"
+ "fmax z31.s, p7/m, z31.s, z14.s\n"
+ "st1w z9.s, p0, [c_ptr5]\n"
+ "fmin z29.s, p7/m, z29.s, z15.s\n"
+ "fmin z30.s, p7/m, z30.s, z15.s\n"
+ "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n"
+ "fmin z31.s, p7/m, z31.s, z15.s\n"
+ "uzp1 z12.s, z28.s, z29.s\n"
+ "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n"
+ "uzp1 z13.s, z30.s, z31.s\n"
+ "st1w z12.s, p0, [c_ptr6]\n"
+ "st1w z13.s, p1, [c_ptr6, #1, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq a_ptr4\n"
+ ".unreq a_ptr5\n"
+ ".unreq a_ptr6\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ ".unreq c_ptr4\n"
+ ".unreq c_ptr5\n"
+ ".unreq c_ptr6\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "cc", "memory"
+ );
+ break;
+ default:
+ case 8:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "a_ptr4 .req X3\n"
+ "a_ptr5 .req X4\n"
+ "a_ptr6 .req X5\n"
+ "a_ptr7 .req X6\n"
+ "c_ptr1 .req X7\n"
+ "c_ptr2 .req X8\n"
+ "c_ptr3 .req X9\n"
+ "c_ptr4 .req X10\n"
+ "c_ptr5 .req X11\n"
+ "c_ptr6 .req X12\n"
+ "c_ptr7 .req X13\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "add a_ptr4, a_ptr3, %[lda]\n"
+ "add c_ptr4, c_ptr3, %[ldc]\n"
+ "add a_ptr5, a_ptr4, %[lda]\n"
+ "add c_ptr5, c_ptr4, %[ldc]\n"
+ "add a_ptr6, a_ptr5, %[lda]\n"
+ "add c_ptr6, c_ptr5, %[ldc]\n"
+ "add a_ptr7, a_ptr6, %[lda]\n"
+ "add c_ptr7, c_ptr6, %[ldc]\n"
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "cbnz %[append], 1f\n"
+ "ld1w z15.s, p0/z, [%[biasptr]]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip1 z16.s, z15.s, z15.s\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "zip2 z17.s, z15.s, z15.s\n"
+ "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "ld1rqw z4.s, p7/z, [a_ptr4]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip1 z18.s, z15.s, z15.s\n"
+ "ld1rqw z5.s, p7/z, [a_ptr5]\n"
+ "zip2 z19.s, z15.s, z15.s\n"
+ "ld1rqw z6.s, p7/z, [a_ptr6]\n"
+ "trn1 z9.d, z2.d, z3.d\n"
+ "ld1rqw z7.s, p7/z, [a_ptr7]\n"
+ "mov z20.d, z16.d\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "trn1 z10.d, z4.d, z5.d\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z21.d, z17.d\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "trn1 z11.d, z6.d, z7.d\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z22.d, z18.d\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "mov z23.d, z19.d\n"
+ "add a_ptr4, a_ptr4, #0x10\n"
+ "mov z24.d, z16.d\n"
+ "add a_ptr5, a_ptr5, #0x10\n"
+ "mov z25.d, z17.d\n"
+ "add a_ptr6, a_ptr6, #0x10\n"
+ "mov z26.d, z18.d\n"
+ "add a_ptr7, a_ptr7, #0x10\n"
+ "mov z27.d, z19.d\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ "mov z28.d, z16.d\n"
+ "mov z29.d, z17.d\n"
+ "mov z30.d, z18.d\n"
+ "mov z31.d, z19.d\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z14.s, p0/z, [c_ptr1]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip1 z16.s, z13.s, z14.s\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "zip2 z17.s, z13.s, z14.s\n"
+ "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ld1rqw z4.s, p7/z, [a_ptr4]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "zip1 z18.s, z13.s, z14.s\n"
+ "ld1rqw z5.s, p7/z, [a_ptr5]\n"
+ "zip2 z19.s, z13.s, z14.s\n"
+ "ld1w z13.s, p0/z, [c_ptr2]\n"
+ "trn1 z9.d, z2.d, z3.d\n"
+ "ld1w z14.s, p0/z, [c_ptr3]\n"
+ "ld1rqw z6.s, p7/z, [a_ptr6]\n"
+ "add a_ptr4, a_ptr4, #0x10\n"
+ "trn1 z10.d, z4.d, z5.d\n"
+ "ld1rqw z7.s, p7/z, [a_ptr7]\n"
+ "zip1 z20.s, z13.s, z14.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "zip2 z21.s, z13.s, z14.s\n"
+ "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "add a_ptr5, a_ptr5, #0x10\n"
+ "trn1 z11.d, z6.d, z7.d\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr6, a_ptr6, #0x10\n"
+ "zip1 z22.s, z13.s, z14.s\n"
+ "add a_ptr7, a_ptr7, #0x10\n"
+ "zip2 z23.s, z13.s, z14.s\n"
+ "ld1w z13.s, p0/z, [c_ptr4]\n"
+ "ld1w z14.s, p0/z, [c_ptr5]\n"
+ "zip1 z24.s, z13.s, z14.s\n"
+ "zip2 z25.s, z13.s, z14.s\n"
+ "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
+ "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n"
+ "zip1 z26.s, z13.s, z14.s\n"
+ "zip2 z27.s, z13.s, z14.s\n"
+ "ld1w z13.s, p0/z, [c_ptr6]\n"
+ "ld1w z14.s, p0/z, [c_ptr7]\n"
+ "zip1 z28.s, z13.s, z14.s\n"
+ "zip2 z29.s, z13.s, z14.s\n"
+ "ld1w z13.s, p1/z, [c_ptr6, #1, MUL VL]\n"
+ "ld1w z14.s, p1/z, [c_ptr7, #1, MUL VL]\n"
+ "zip1 z30.s, z13.s, z14.s\n"
+ "zip2 z31.s, z13.s, z14.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "trn2 z1.d, z2.d, z3.d\n"
+ "trn2 z2.d, z4.d, z5.d\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "trn2 z3.d, z6.d, z7.d\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "ld1rqw z8.s, p7/z, [a_ptr4]\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "ld1rqw z9.s, p7/z, [a_ptr5]\n"
+ ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+ "add a_ptr4, a_ptr4, #0x20\n"
+ ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+ "add a_ptr5, a_ptr5, #0x20\n"
+ ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+ ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+ "ld1rqw z10.s, p7/z, [a_ptr6]\n"
+ ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "ld1rqw z11.s, p7/z, [a_ptr7]\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ "add a_ptr6, a_ptr6, #0x20\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ "add a_ptr7, a_ptr7, #0x20\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "trn1 z0.d, z4.d, z5.d\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "trn1 z1.d, z6.d, z7.d\n"
+ ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+ ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+ ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+ ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+ "trn1 z2.d, z8.d, z9.d\n"
+ ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "trn1 z3.d, z10.d, z11.d\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "trn2 z11.d, z10.d, z11.d\n"
+ "trn2 z10.d, z8.d, z9.d\n"
+ "trn2 z9.d, z6.d, z7.d\n"
+ "ld1rqw z6.s, p7/z, [a_ptr6, #-0x10]\n"
+ "trn2 z8.d, z4.d, z5.d\n"
+ "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "ld1rqw z5.s, p7/z, [a_ptr5, #-0x10]\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ "ld1rqw z7.s, p7/z, [a_ptr7, #-0x10]\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+ ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+ ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+ ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+ ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
+ ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "trn1 z9.d, z2.d, z3.d\n"
+ ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+ ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+ ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+ ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+ "trn1 z10.d, z4.d, z5.d\n"
+ ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "trn1 z11.d, z6.d, z7.d\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "trn2 z1.d, z2.d, z3.d\n"
+ "trn2 z2.d, z4.d, z5.d\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "trn2 z3.d, z6.d, z7.d\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "ld1rqw z8.s, p7/z, [a_ptr4]\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "ld1rqw z9.s, p7/z, [a_ptr5]\n"
+ ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+ ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+ ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+ ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+ "ld1rqw z10.s, p7/z, [a_ptr6]\n"
+ ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "ld1rqw z11.s, p7/z, [a_ptr7]\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "trn1 z0.d, z4.d, z5.d\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "trn1 z1.d, z6.d, z7.d\n"
+ ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+ ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+ ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+ ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+ "trn1 z2.d, z8.d, z9.d\n"
+ ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "trn1 z3.d, z10.d, z11.d\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "trn2 z11.d, z10.d, z11.d\n"
+ "trn2 z10.d, z8.d, z9.d\n"
+ "trn2 z9.d, z6.d, z7.d\n"
+ "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n"
+ "trn2 z8.d, z4.d, z5.d\n"
+ "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ "addvl a_ptr4, a_ptr4, #2\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ "addvl %[a_ptr0], %[a_ptr0], #2\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ "addvl a_ptr5, a_ptr5, #2\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ "addvl a_ptr6, a_ptr6, #2\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+ "addvl a_ptr1, a_ptr1, #2\n"
+ ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+ "addvl a_ptr7, a_ptr7, #2\n"
+ ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+ ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "addvl %[b_ptr0], %[b_ptr0], #-4\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "addvl a_ptr2, a_ptr2, #2\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "addvl a_ptr3, a_ptr3, #2\n"
+ "trn1 z8.d, z0.d, z1.d\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "trn1 z9.d, z2.d, z3.d\n"
+ ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+ ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+ ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+ ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+ "trn1 z10.d, z4.d, z5.d\n"
+ ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
+ ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
+ ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
+ ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
+ "trn1 z11.d, z6.d, z7.d\n"
+ "cbz %[blocks], 5f\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "trn2 z1.d, z2.d, z3.d\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "trn2 z2.d, z4.d, z5.d\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "trn2 z3.d, z6.d, z7.d\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+ ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+ ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+ ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+ ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
+ ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
+ ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
+ ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
+ "b.eq 5f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+ ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+ ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+ ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+ ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
+ ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
+ ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
+ ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
+ "b 5f\n"
+ "4:\n"
+ "trn2 z0.d, z0.d, z1.d\n"
+ "trn2 z1.d, z2.d, z3.d\n"
+ "trn2 z2.d, z4.d, z5.d\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "trn2 z3.d, z6.d, z7.d\n"
+ "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ "ld1rqw z7.s, p6/z, [a_ptr3]\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ "addvl %[a_ptr0], %[a_ptr0], #1\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ "ld1rqw z8.s, p6/z, [a_ptr4]\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ "addvl a_ptr1, a_ptr1, #1\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ "addvl a_ptr2, a_ptr2, #1\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ "addvl a_ptr3, a_ptr3, #1\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ "ld1rqw z9.s, p6/z, [a_ptr5]\n"
+ ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+ "addvl a_ptr4, a_ptr4, #1\n"
+ ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+ "addvl a_ptr5, a_ptr5, #1\n"
+ ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+ ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+ "ld1rqw z10.s, p6/z, [a_ptr6]\n"
+ ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "ld1rqw z11.s, p6/z, [a_ptr7]\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ "addvl a_ptr6, a_ptr6, #1\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ "addvl a_ptr7, a_ptr7, #1\n"
+ "trn1 z0.d, z4.d, z5.d\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ "trn1 z1.d, z6.d, z7.d\n"
+ ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+ ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+ ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+ ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+ "trn1 z2.d, z8.d, z9.d\n"
+ ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
+ ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
+ ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
+ ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
+ "trn1 z3.d, z10.d, z11.d\n"
+ "cbz %[blocks], 5f\n"
+ "trn2 z11.d, z10.d, z11.d\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+ "trn2 z10.d, z8.d, z9.d\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "trn2 z9.d, z6.d, z7.d\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "trn2 z8.d, z4.d, z5.d\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+ ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+ ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+ ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+ ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+ ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+ ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+ ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+ ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+ ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+ ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+ ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
+ ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
+ ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
+ ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
+ "b.eq 5f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+ ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+ ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+ ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+ ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+ ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+ ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+ ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+ ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+ ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+ ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+ ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+ ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
+ ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
+ ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
+ ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
+ "5:\n"
+ "ld1rw z14.s, p7/z, [%[minptr]]\n"
+ "ld1rw z15.s, p7/z, [%[maxptr]]\n"
+ "fmax z16.s, p7/m, z16.s, z14.s\n"
+ "fmax z17.s, p7/m, z17.s, z14.s\n"
+ "fmax z18.s, p7/m, z18.s, z14.s\n"
+ "fmax z19.s, p7/m, z19.s, z14.s\n"
+ "fmin z16.s, p7/m, z16.s, z15.s\n"
+ "fmin z17.s, p7/m, z17.s, z15.s\n"
+ "fmin z18.s, p7/m, z18.s, z15.s\n"
+ "fmin z19.s, p7/m, z19.s, z15.s\n"
+ "fmax z20.s, p7/m, z20.s, z14.s\n"
+ "uzp1 z0.s, z16.s, z17.s\n"
+ "uzp2 z1.s, z16.s, z17.s\n"
+ "uzp1 z2.s, z18.s, z19.s\n"
+ "uzp2 z3.s, z18.s, z19.s\n"
+ "st1w z0.s, p0, [%[c_ptr0]]\n"
+ "fmin z20.s, p7/m, z20.s, z15.s\n"
+ "fmax z21.s, p7/m, z21.s, z14.s\n"
+ "fmax z22.s, p7/m, z22.s, z14.s\n"
+ "st1w z1.s, p0, [c_ptr1]\n"
+ "fmax z23.s, p7/m, z23.s, z14.s\n"
+ "fmax z24.s, p7/m, z24.s, z14.s\n"
+ "fmin z21.s, p7/m, z21.s, z15.s\n"
+ "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "fmin z22.s, p7/m, z22.s, z15.s\n"
+ "addvl %[c_ptr0], %[c_ptr0], #2\n"
+ "fmin z23.s, p7/m, z23.s, z15.s\n"
+ "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "uzp1 z4.s, z20.s, z21.s\n"
+ "uzp2 z5.s, z20.s, z21.s\n"
+ "fmin z24.s, p7/m, z24.s, z15.s\n"
+ "uzp1 z6.s, z22.s, z23.s\n"
+ "st1w z4.s, p0, [c_ptr2]\n"
+ "uzp2 z7.s, z22.s, z23.s\n"
+ "fmax z25.s, p7/m, z25.s, z14.s\n"
+ "fmax z26.s, p7/m, z26.s, z14.s\n"
+ "st1w z5.s, p0, [c_ptr3]\n"
+ "fmax z27.s, p7/m, z27.s, z14.s\n"
+ "fmax z28.s, p7/m, z28.s, z14.s\n"
+ "fmin z25.s, p7/m, z25.s, z15.s\n"
+ "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "fmin z26.s, p7/m, z26.s, z15.s\n"
+ "fmin z27.s, p7/m, z27.s, z15.s\n"
+ "fmin z28.s, p7/m, z28.s, z15.s\n"
+ "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
+ "uzp1 z8.s, z24.s, z25.s\n"
+ "uzp2 z9.s, z24.s, z25.s\n"
+ "uzp1 z10.s, z26.s, z27.s\n"
+ "uzp2 z11.s, z26.s, z27.s\n"
+ "st1w z8.s, p0, [c_ptr4]\n"
+ "fmax z29.s, p7/m, z29.s, z14.s\n"
+ "fmax z30.s, p7/m, z30.s, z14.s\n"
+ "fmax z31.s, p7/m, z31.s, z14.s\n"
+ "st1w z9.s, p0, [c_ptr5]\n"
+ "fmin z29.s, p7/m, z29.s, z15.s\n"
+ "fmin z30.s, p7/m, z30.s, z15.s\n"
+ "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n"
+ "fmin z31.s, p7/m, z31.s, z15.s\n"
+ "uzp1 z12.s, z28.s, z29.s\n"
+ "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n"
+ "uzp2 z13.s, z28.s, z29.s\n"
+ "uzp1 z14.s, z30.s, z31.s\n"
+ "uzp2 z15.s, z30.s, z31.s\n"
+ "st1w z12.s, p0, [c_ptr6]\n"
+ "st1w z13.s, p0, [c_ptr7]\n"
+ "st1w z14.s, p1, [c_ptr6, #1, MUL VL]\n"
+ "st1w z15.s, p1, [c_ptr7, #1, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq a_ptr4\n"
+ ".unreq a_ptr5\n"
+ ".unreq a_ptr6\n"
+ ".unreq a_ptr7\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ ".unreq c_ptr4\n"
+ ".unreq c_ptr5\n"
+ ".unreq c_ptr6\n"
+ ".unreq c_ptr7\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc", "memory"
+ );
+ break;
+ }
+
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
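
For reference on the kernel above: the .inst-encoded FMMLA operations treat each 128-bit segment of their two sources as row-major 2x2 blocks of fp32 and, as I read the FMMLA definition, accumulate the product of the first block with the transpose of the second. That is why pairs of A rows are interleaved with trn1/trn2 on .d elements before the main loop, and why the accumulators are split back into individual output rows with uzp1/uzp2 before the clamped stores. A minimal scalar sketch of one segment under that assumed semantics (illustrative only, not library code):

#include <cstdio>

// Hypothetical helper: one 128-bit segment of "fmmla zda.s, zn.s, zm.s",
// assuming C += Zn * Zm^T with row-major 2x2 blocks [x00 x01 x10 x11]
// held in each segment.
static void fmmla_segment(float c[4], const float a[4], const float b[4])
{
    c[0] += a[0] * b[0] + a[1] * b[1]; // c00 += row0(a) . row0(b)
    c[1] += a[0] * b[2] + a[1] * b[3]; // c01 += row0(a) . row1(b)
    c[2] += a[2] * b[0] + a[3] * b[1]; // c10 += row1(a) . row0(b)
    c[3] += a[2] * b[2] + a[3] * b[3]; // c11 += row1(a) . row1(b)
}

int main()
{
    float a[4] = {1, 2, 3, 4}; // rows (1,2) and (3,4)
    float b[4] = {5, 6, 7, 8}; // rows (5,6) and (7,8)
    float c[4] = {0, 0, 0, 0};
    fmmla_segment(c, a, b);
    std::printf("%g %g %g %g\n", c[0], c[1], c[2], c[3]); // 17 23 39 53
    return 0;
}
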
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp
index d8422105cc..230a2cf19f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -78,7 +78,10 @@ public:
// Default to the generic kernel
kern_type kernel=sve_hybrid_s8s32_dot_4VLx4;
- hybrid_s8s32_dot_4VLx4(const CPUInfo *ci) { UNUSED(ci); }
+ hybrid_s8s32_dot_4VLx4(const CPUInfo *)
+ {
+
+ }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp
index aa3a764dec..46fc500476 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp
@@ -32,7 +32,7 @@
namespace arm_gemm {
-void sve_hybrid_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool append) {
+void sve_hybrid_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation , bool append) {
const int K_stride = ((K + 3) / 4) * 4;
const long loops_count = ((K + 16) / 32) - 1;
K -= loops_count * 32;
@@ -41,12 +41,23 @@ void sve_hybrid_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int32
const long leftovers = K;
const long blocks_count = (K + 3) / 4;
- for (int y=0; y<M; y+=4) {
+ int rows_to_compute;
+
+ for (int y=0; y<M; y+=rows_to_compute) {
const int8_t * const a_ptr0_base = A + (y * lda);
const unsigned long ldab = lda * sizeof(int8_t);
int32_t *c_ptr0 = C + (y * ldc);
+ rows_to_compute = M-y;
+ if (rows_to_compute > 4) {
+ if (rows_to_compute % 4) {
+ rows_to_compute = 4 - 1;
+ } else {
+ rows_to_compute = 4;
+ }
+ }
+
for (int x0=0; x0<N; x0+=(4 * get_vector_length<int32_t>())) {
const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<int32_t>()));
long loops = loops_count;
@@ -57,7 +68,7 @@ void sve_hybrid_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int32
const int8_t *b_ptr0 = B + (K_stride * x0);
const unsigned long ldcb = ldc * sizeof(int32_t);
- switch(M-y) {
+ switch(rows_to_compute) {
case 1:
__asm __volatile (
"whilelt p6.b, %[temp], %[leftovers]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp
index 5dab1da135..f829fb0205 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -78,7 +78,10 @@ public:
// Default to the generic kernel
kern_type kernel=sve_hybrid_u8u32_dot_4VLx4;
- hybrid_u8u32_dot_4VLx4(const CPUInfo *ci) { UNUSED(ci); }
+ hybrid_u8u32_dot_4VLx4(const CPUInfo *)
+ {
+
+ }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp
index 4fb7e825b5..13614700e3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp
@@ -32,7 +32,7 @@
namespace arm_gemm {
-void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool append) {
+void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool append) {
const int K_stride = ((K + 3) / 4) * 4;
const long loops_count = ((K + 16) / 32) - 1;
K -= loops_count * 32;
@@ -41,12 +41,23 @@ void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, uin
const long leftovers = K;
const long blocks_count = (K + 3) / 4;
- for (int y=0; y<M; y+=4) {
+ int rows_to_compute;
+
+ for (int y=0; y<M; y+=rows_to_compute) {
const uint8_t * const a_ptr0_base = A + (y * lda);
const unsigned long ldab = lda * sizeof(uint8_t);
uint32_t *c_ptr0 = C + (y * ldc);
+ rows_to_compute = M-y;
+ if (rows_to_compute > 4) {
+ if (rows_to_compute % 4) {
+ rows_to_compute = 4 - 1;
+ } else {
+ rows_to_compute = 4;
+ }
+ }
+
for (int x0=0; x0<N; x0+=(4 * get_vector_length<uint32_t>())) {
const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<uint32_t>()));
long loops = loops_count;
@@ -57,7 +68,7 @@ void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, uin
const uint8_t *b_ptr0 = B + (K_stride * x0);
const unsigned long ldcb = ldc * sizeof(uint32_t);
- switch(M-y) {
+ switch(rows_to_compute) {
case 1:
__asm __volatile (
"whilelt p6.b, %[temp], %[leftovers]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp
index a3434c1504..43107e45fa 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,7 +61,10 @@ public:
kern_type kernel=sve_interleaved_bf16fp32_dot_3VLx8;
- interleaved_bf16fp32_dot_3VLx8(const CPUInfo *ci) { UNUSED(ci); }
+ interleaved_bf16fp32_dot_3VLx8(const CPUInfo *)
+ {
+
+ }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp
index 65841581aa..7e20ed0971 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,13 +61,11 @@ void sve_interleaved_bf16fp32_dot_3VLx8(const bfloat16 *Apanel, const bfloat16 *
"mov z15.s, #0\n"
"ld1rqh z2.h, p0/z, [%[a_ptr], #0x20]\n"
"mov z16.s, #0\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
"mov z17.s, #0\n"
- "ld1rqh z3.h, p0/z, [%[a_ptr], #0x30]\n"
+ "addvl %[b_ptr], %[b_ptr], #3\n"
"mov z18.s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
"mov z19.s, #0\n"
- "addvl %[b_ptr], %[b_ptr], #3\n"
"mov z20.s, #0\n"
"mov z21.s, #0\n"
"mov z22.s, #0\n"
@@ -83,9 +81,11 @@ void sve_interleaved_bf16fp32_dot_3VLx8(const bfloat16 *Apanel, const bfloat16 *
"cbz %[loops], 1f\n"
"2:\n"
".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n"
- "subs %[loops], %[loops], #0x1\n"
+ "ld1h z6.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
".inst 0x64684089 // bfdot z9.s, z4.h, z0.h[1]\n"
+ "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
".inst 0x6470408a // bfdot z10.s, z4.h, z0.h[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
".inst 0x6478408b // bfdot z11.s, z4.h, z0.h[3]\n"
".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n"
".inst 0x64694095 // bfdot z21.s, z4.h, z1.h[1]\n"
@@ -141,13 +141,13 @@ void sve_interleaved_bf16fp32_dot_3VLx8(const bfloat16 *Apanel, const bfloat16 *
".inst 0x646b40dd // bfdot z29.s, z6.h, z3.h[1]\n"
".inst 0x647340de // bfdot z30.s, z6.h, z3.h[2]\n"
".inst 0x647b40df // bfdot z31.s, z6.h, z3.h[3]\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
"b.ne 2b\n"
"1:\n"
"cbz %[tails], 3f\n"
".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n"
+ "ld1h z6.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
".inst 0x64684089 // bfdot z9.s, z4.h, z0.h[1]\n"
+ "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
".inst 0x6470408a // bfdot z10.s, z4.h, z0.h[2]\n"
".inst 0x6478408b // bfdot z11.s, z4.h, z0.h[3]\n"
".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n"
@@ -235,9 +235,11 @@ void sve_interleaved_bf16fp32_dot_3VLx8(const bfloat16 *Apanel, const bfloat16 *
"b 4f\n"
"3:\n"
".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n"
- "addvl %[b_ptr], %[b_ptr], #3\n"
+ "ld1h z6.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
".inst 0x64684089 // bfdot z9.s, z4.h, z0.h[1]\n"
+ "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
".inst 0x6470408a // bfdot z10.s, z4.h, z0.h[2]\n"
+ "addvl %[b_ptr], %[b_ptr], #3\n"
".inst 0x6478408b // bfdot z11.s, z4.h, z0.h[3]\n"
".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n"
".inst 0x64694095 // bfdot z21.s, z4.h, z1.h[1]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp
index c6ffc047fd..f1353e2086 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,7 +61,10 @@ public:
kern_type kernel=sve_interleaved_bf16fp32_mmla_3VLx8;
- interleaved_bf16fp32_mmla_3VLx8(const CPUInfo *ci) { UNUSED(ci); }
+ interleaved_bf16fp32_mmla_3VLx8(const CPUInfo *)
+ {
+
+ }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp
index 528fc72005..16cc69b2a6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -63,13 +63,11 @@ void sve_interleaved_bf16fp32_mmla_3VLx8(const bfloat16 *Apanel, const bfloat16
"mov z16.s, #0\n"
"ld1h z6.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
"mov z17.s, #0\n"
- "ld1rqh z3.h, p0/z, [%[a_ptr], #0x30]\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
"mov z18.s, #0\n"
- "ld1h z7.h, p0/z, [%[b_ptr], #3, MUL VL]\n"
+ "addvl %[b_ptr], %[b_ptr], #4\n"
"mov z19.s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
"mov z20.s, #0\n"
- "addvl %[b_ptr], %[b_ptr], #4\n"
"mov z21.s, #0\n"
"mov z22.s, #0\n"
"mov z23.s, #0\n"
@@ -84,12 +82,14 @@ void sve_interleaved_bf16fp32_mmla_3VLx8(const bfloat16 *Apanel, const bfloat16
"cbz %[loops], 1f\n"
"2:\n"
".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
- "subs %[loops], %[loops], #0x1\n"
+ "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n"
+ "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".inst 0x6465e409 // bfmmla z9.s, z0.h, z5.h\n"
".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n"
"ld1h z4.h, p0/z, [%[b_ptr]]\n"
- ".inst 0x6465e409 // bfmmla z9.s, z0.h, z5.h\n"
".inst 0x6465e42f // bfmmla z15.s, z1.h, z5.h\n"
".inst 0x6465e455 // bfmmla z21.s, z2.h, z5.h\n"
".inst 0x6465e47b // bfmmla z27.s, z3.h, z5.h\n"
@@ -152,18 +152,18 @@ void sve_interleaved_bf16fp32_mmla_3VLx8(const bfloat16 *Apanel, const bfloat16
".inst 0x6467e459 // bfmmla z25.s, z2.h, z7.h\n"
"ld1rqh z2.h, p0/z, [%[a_ptr], #-0x20]\n"
".inst 0x6467e47f // bfmmla z31.s, z3.h, z7.h\n"
- "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
"b.ne 2b\n"
"1:\n"
"cbz %[tails], 3f\n"
".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
+ "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n"
+ "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
- ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n"
- "ld1h z4.h, p0/z, [%[b_ptr]]\n"
".inst 0x6465e409 // bfmmla z9.s, z0.h, z5.h\n"
".inst 0x6465e42f // bfmmla z15.s, z1.h, z5.h\n"
+ ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n"
+ "ld1h z4.h, p0/z, [%[b_ptr]]\n"
".inst 0x6465e455 // bfmmla z21.s, z2.h, z5.h\n"
".inst 0x6465e47b // bfmmla z27.s, z3.h, z5.h\n"
"ld1h z5.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
@@ -269,15 +269,17 @@ void sve_interleaved_bf16fp32_mmla_3VLx8(const bfloat16 *Apanel, const bfloat16
"b 4f\n"
"3:\n"
".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
+ "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n"
- "addvl %[b_ptr], %[b_ptr], #8\n"
+ "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
- ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
".inst 0x6465e409 // bfmmla z9.s, z0.h, z5.h\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #-8, MUL VL]\n"
+ "addvl %[b_ptr], %[b_ptr], #8\n"
+ ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n"
".inst 0x6465e42f // bfmmla z15.s, z1.h, z5.h\n"
".inst 0x6465e455 // bfmmla z21.s, z2.h, z5.h\n"
+ "ld1h z4.h, p0/z, [%[b_ptr], #-8, MUL VL]\n"
".inst 0x6465e47b // bfmmla z27.s, z3.h, z5.h\n"
"ld1h z5.h, p0/z, [%[b_ptr], #-7, MUL VL]\n"
".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp
index 10dbdd8847..816c0cd095 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,7 +61,10 @@ public:
kern_type kernel=sve_interleaved_fp16_mla_3VLx8;
- interleaved_fp16_mla_3VLx8(const CPUInfo *ci) { UNUSED(ci); }
+ interleaved_fp16_mla_3VLx8(const CPUInfo *)
+ {
+
+ }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp
index b2d3a6f52e..f2050cbd56 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -50,22 +50,22 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel,
"mov z9.h, #0\n"
"mov z10.h, #0\n"
"mov z11.h, #0\n"
- "mov z12.h, #0\n"
"ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
- "mov z13.h, #0\n"
+ "mov z12.h, #0\n"
"ld1h z2.h, p0/z, [%[b_ptr]]\n"
- "mov z14.h, #0\n"
+ "mov z13.h, #0\n"
"ld1h z3.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "mov z15.h, #0\n"
+ "mov z14.h, #0\n"
"ld1h z4.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
- "mov z16.h, #0\n"
+ "mov z15.h, #0\n"
"ld1h z5.h, p0/z, [%[b_ptr], #3, MUL VL]\n"
- "mov z17.h, #0\n"
+ "mov z16.h, #0\n"
"ld1h z6.h, p0/z, [%[b_ptr], #4, MUL VL]\n"
- "mov z18.h, #0\n"
+ "mov z17.h, #0\n"
"add %[a_ptr], %[a_ptr], #0x20\n"
- "mov z19.h, #0\n"
+ "mov z18.h, #0\n"
"addvl %[b_ptr], %[b_ptr], #6\n"
+ "mov z19.h, #0\n"
"mov z20.h, #0\n"
"mov z21.h, #0\n"
"mov z22.h, #0\n"
@@ -202,8 +202,8 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel,
"fmla z9.h, z2.h, z0.h[1]\n"
"fmla z10.h, z2.h, z0.h[2]\n"
"fmla z11.h, z2.h, z0.h[3]\n"
- "fmla z12.h, z2.h, z0.h[4]\n"
"st1h z8.h, p0, [%[c_ptr]]\n"
+ "fmla z12.h, z2.h, z0.h[4]\n"
"fmla z13.h, z2.h, z0.h[5]\n"
"fmla z14.h, z2.h, z0.h[6]\n"
"fmla z15.h, z2.h, z0.h[7]\n"
@@ -211,8 +211,8 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel,
"fmla z17.h, z3.h, z0.h[1]\n"
"fmla z18.h, z3.h, z0.h[2]\n"
"fmla z19.h, z3.h, z0.h[3]\n"
- "fmla z20.h, z3.h, z0.h[4]\n"
"st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n"
+ "fmla z20.h, z3.h, z0.h[4]\n"
"fmla z21.h, z3.h, z0.h[5]\n"
"fmla z22.h, z3.h, z0.h[6]\n"
"fmla z23.h, z3.h, z0.h[7]\n"
@@ -220,10 +220,11 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel,
"fmla z25.h, z4.h, z0.h[1]\n"
"fmla z26.h, z4.h, z0.h[2]\n"
"fmla z27.h, z4.h, z0.h[3]\n"
- "fmla z28.h, z4.h, z0.h[4]\n"
"st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n"
+ "fmla z28.h, z4.h, z0.h[4]\n"
"fmla z29.h, z4.h, z0.h[5]\n"
"fmla z30.h, z4.h, z0.h[6]\n"
+ "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n"
"fmla z31.h, z4.h, z0.h[7]\n"
"b 4f\n"
"3:\n"
@@ -257,8 +258,8 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel,
"fmla z9.h, z5.h, z1.h[1]\n"
"fmla z10.h, z5.h, z1.h[2]\n"
"fmla z11.h, z5.h, z1.h[3]\n"
- "fmla z12.h, z5.h, z1.h[4]\n"
"st1h z8.h, p0, [%[c_ptr]]\n"
+ "fmla z12.h, z5.h, z1.h[4]\n"
"fmla z13.h, z5.h, z1.h[5]\n"
"fmla z14.h, z5.h, z1.h[6]\n"
"fmla z15.h, z5.h, z1.h[7]\n"
@@ -266,8 +267,8 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel,
"fmla z17.h, z6.h, z1.h[1]\n"
"fmla z18.h, z6.h, z1.h[2]\n"
"fmla z19.h, z6.h, z1.h[3]\n"
- "fmla z20.h, z6.h, z1.h[4]\n"
"st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n"
+ "fmla z20.h, z6.h, z1.h[4]\n"
"fmla z21.h, z6.h, z1.h[5]\n"
"fmla z22.h, z6.h, z1.h[6]\n"
"fmla z23.h, z6.h, z1.h[7]\n"
@@ -275,13 +276,13 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel,
"fmla z25.h, z7.h, z1.h[1]\n"
"fmla z26.h, z7.h, z1.h[2]\n"
"fmla z27.h, z7.h, z1.h[3]\n"
- "fmla z28.h, z7.h, z1.h[4]\n"
"st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n"
+ "fmla z28.h, z7.h, z1.h[4]\n"
"fmla z29.h, z7.h, z1.h[5]\n"
"fmla z30.h, z7.h, z1.h[6]\n"
+ "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n"
"fmla z31.h, z7.h, z1.h[7]\n"
"4:\n"
- "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n"
"st1h z17.h, p0, [%[c_ptr], #4, MUL VL]\n"
"st1h z25.h, p0, [%[c_ptr], #5, MUL VL]\n"
"st1h z10.h, p0, [%[c_ptr], #6, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp
index cdc9447701..cce90fb135 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,7 +61,10 @@ public:
kern_type kernel=sve_interleaved_fp32_mla_3VLx8;
- interleaved_fp32_mla_3VLx8(const CPUInfo *ci) { UNUSED(ci); }
+ interleaved_fp32_mla_3VLx8(const CPUInfo *)
+ {
+
+ }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp
index d26948a0d4..cd178c478a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -50,20 +50,20 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl
"mov z9.s, #0\n"
"mov z10.s, #0\n"
"mov z11.s, #0\n"
- "mov z12.s, #0\n"
"ld1rqw z0.s, p0/z, [%[a_ptr]]\n"
- "mov z13.s, #0\n"
+ "mov z12.s, #0\n"
"ld1w z4.s, p0/z, [%[b_ptr]]\n"
- "mov z14.s, #0\n"
+ "mov z13.s, #0\n"
"ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n"
- "mov z15.s, #0\n"
+ "mov z14.s, #0\n"
"ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "mov z16.s, #0\n"
+ "mov z15.s, #0\n"
"ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n"
- "mov z17.s, #0\n"
+ "mov z16.s, #0\n"
"add %[a_ptr], %[a_ptr], #0x40\n"
- "mov z18.s, #0\n"
+ "mov z17.s, #0\n"
"addvl %[b_ptr], %[b_ptr], #3\n"
+ "mov z18.s, #0\n"
"mov z19.s, #0\n"
"mov z20.s, #0\n"
"mov z21.s, #0\n"
@@ -207,8 +207,8 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl
"fmla z9.s, z4.s, z0.s[1]\n"
"fmla z10.s, z4.s, z0.s[2]\n"
"fmla z11.s, z4.s, z0.s[3]\n"
- "fmla z20.s, z4.s, z1.s[0]\n"
"st1w z8.s, p0, [%[c_ptr]]\n"
+ "fmla z20.s, z4.s, z1.s[0]\n"
"fmla z21.s, z4.s, z1.s[1]\n"
"fmla z22.s, z4.s, z1.s[2]\n"
"fmla z23.s, z4.s, z1.s[3]\n"
@@ -216,8 +216,8 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl
"fmla z13.s, z5.s, z0.s[1]\n"
"fmla z14.s, z5.s, z0.s[2]\n"
"fmla z15.s, z5.s, z0.s[3]\n"
- "fmla z24.s, z5.s, z1.s[0]\n"
"st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
+ "fmla z24.s, z5.s, z1.s[0]\n"
"fmla z25.s, z5.s, z1.s[1]\n"
"fmla z26.s, z5.s, z1.s[2]\n"
"fmla z27.s, z5.s, z1.s[3]\n"
@@ -225,10 +225,11 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl
"fmla z17.s, z6.s, z0.s[1]\n"
"fmla z18.s, z6.s, z0.s[2]\n"
"fmla z19.s, z6.s, z0.s[3]\n"
- "fmla z28.s, z6.s, z1.s[0]\n"
"st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
+ "fmla z28.s, z6.s, z1.s[0]\n"
"fmla z29.s, z6.s, z1.s[1]\n"
"fmla z30.s, z6.s, z1.s[2]\n"
+ "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
"fmla z31.s, z6.s, z1.s[3]\n"
"b 4f\n"
"3:\n"
@@ -266,8 +267,8 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl
"fmla z9.s, z4.s, z2.s[1]\n"
"fmla z10.s, z4.s, z2.s[2]\n"
"fmla z11.s, z4.s, z2.s[3]\n"
- "fmla z20.s, z4.s, z3.s[0]\n"
"st1w z8.s, p0, [%[c_ptr]]\n"
+ "fmla z20.s, z4.s, z3.s[0]\n"
"fmla z21.s, z4.s, z3.s[1]\n"
"fmla z22.s, z4.s, z3.s[2]\n"
"fmla z23.s, z4.s, z3.s[3]\n"
@@ -275,8 +276,8 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl
"fmla z13.s, z5.s, z2.s[1]\n"
"fmla z14.s, z5.s, z2.s[2]\n"
"fmla z15.s, z5.s, z2.s[3]\n"
- "fmla z24.s, z5.s, z3.s[0]\n"
"st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
+ "fmla z24.s, z5.s, z3.s[0]\n"
"fmla z25.s, z5.s, z3.s[1]\n"
"fmla z26.s, z5.s, z3.s[2]\n"
"fmla z27.s, z5.s, z3.s[3]\n"
@@ -284,13 +285,13 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl
"fmla z17.s, z6.s, z2.s[1]\n"
"fmla z18.s, z6.s, z2.s[2]\n"
"fmla z19.s, z6.s, z2.s[3]\n"
- "fmla z28.s, z6.s, z3.s[0]\n"
"st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
+ "fmla z28.s, z6.s, z3.s[0]\n"
"fmla z29.s, z6.s, z3.s[1]\n"
"fmla z30.s, z6.s, z3.s[2]\n"
+ "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
"fmla z31.s, z6.s, z3.s[3]\n"
"4:\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
"st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
"st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
"st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8.hpp
new file mode 100644
index 0000000000..4ca43cd5c9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+#include "../std_transforms_sve.hpp"
+
+namespace arm_gemm {
+
+// Actual kernel implementations
+void sve_interleaved_fp32_mmla_3VLx8(const float *, const float *, float *, int, int, int);
+
+class interleaved_fp32_mmla_3VLx8 {
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_width()
+ {
+ return get_vector_length<float>() * 3;
+ }
+
+ static unsigned int out_height()
+ {
+ return 8;
+ }
+
+ static unsigned int k_unroll()
+ {
+ return 2;
+ }
+
+ // Use the standard fixed size transforms.
+ StdTransformsSVE<operand_type, result_type, 8, 6, 2, 2> transforms = {};
+
+ kern_type kernel=sve_interleaved_fp32_mmla_3VLx8;
+
+ interleaved_fp32_mmla_3VLx8(const CPUInfo *)
+ {
+
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
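
Illustrative sketch (not part of the patch): the blocking parameters declared in the new header above fix the shape of the C tile each kernel call produces — out_width() is three SVE vectors of floats, out_height() is 8 rows, and k_unroll() is 2. The standalone snippet below just restates that arithmetic; the 256-bit vector length is an assumption for the demo, whereas the library obtains it from get_vector_length<float>() at run time.

#include <cstdio>

int main() {
    const unsigned int vl_floats  = 8;              // assumed 256-bit SVE: 8 floats per vector
    const unsigned int out_width  = vl_floats * 3;  // columns of C per tile, as in out_width()
    const unsigned int out_height = 8;              // rows of C per tile, as in out_height()
    const unsigned int k_unroll   = 2;              // K elements consumed per FMMLA step

    // One kernel invocation produces an out_height x out_width tile of C,
    // held in out_height * 3 accumulator vectors (z8-z31 in the kernel).
    std::printf("C tile per call: %u rows x %u columns (%u accumulator vectors)\n",
                out_height, out_width, out_height * 3);
    std::printf("K is consumed %u elements at a time\n", k_unroll);
    return 0;
}
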
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp
new file mode 100644
index 0000000000..a404ae9c82
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp
@@ -0,0 +1,397 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm {
+
+void sve_interleaved_fp32_mmla_3VLx8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+ const float *a_ptr = Apanel;
+ float *c_ptr = Cpanel;
+
+ K /= 2;
+ const long loops_count = (K / 2) - 1;
+ const long tails_count = K % 2;
+
+ for (int yb=0; yb<ablocks; yb++) {
+ const float *a_ptr0 = a_ptr;
+ const float *b_ptr = Bpanel;
+
+ for (int xb=0; xb<bblocks; xb++) {
+ a_ptr = a_ptr0;
+ long loops = loops_count;
+ long tails = tails_count;
+
+ __asm __volatile (
+ "mov z8.s, #0\n"
+ "ptrue p0.s\n"
+ "mov z9.s, #0\n"
+ "mov z10.s, #0\n"
+ "mov z11.s, #0\n"
+ "ld1rqw z0.s, p0/z, [%[a_ptr]]\n"
+ "mov z12.s, #0\n"
+ "ld1w z4.s, p0/z, [%[b_ptr]]\n"
+ "mov z13.s, #0\n"
+ "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n"
+ "mov z14.s, #0\n"
+ "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n"
+ "mov z15.s, #0\n"
+ "ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n"
+ "mov z16.s, #0\n"
+ "ld1w z6.s, p0/z, [%[b_ptr], #2, MUL VL]\n"
+ "mov z17.s, #0\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
+ "mov z18.s, #0\n"
+ "addvl %[b_ptr], %[b_ptr], #4\n"
+ "mov z19.s, #0\n"
+ "mov z20.s, #0\n"
+ "mov z21.s, #0\n"
+ "mov z22.s, #0\n"
+ "mov z23.s, #0\n"
+ "mov z24.s, #0\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ ".inst 0x64a4e408 // fmmla z8.s, z0.s, z4.s\n"
+ "ld1w z7.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+ ".inst 0x64a4e42e // fmmla z14.s, z1.s, z4.s\n"
+ "ld1rqw z3.s, p0/z, [%[a_ptr], #-0x10]\n"
+ ".inst 0x64a4e454 // fmmla z20.s, z2.s, z4.s\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".inst 0x64a5e409 // fmmla z9.s, z0.s, z5.s\n"
+ ".inst 0x64a4e47a // fmmla z26.s, z3.s, z4.s\n"
+ "ld1w z4.s, p0/z, [%[b_ptr]]\n"
+ ".inst 0x64a5e42f // fmmla z15.s, z1.s, z5.s\n"
+ ".inst 0x64a5e455 // fmmla z21.s, z2.s, z5.s\n"
+ ".inst 0x64a5e47b // fmmla z27.s, z3.s, z5.s\n"
+ "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n"
+ ".inst 0x64a6e40a // fmmla z10.s, z0.s, z6.s\n"
+ ".inst 0x64a6e430 // fmmla z16.s, z1.s, z6.s\n"
+ ".inst 0x64a6e456 // fmmla z22.s, z2.s, z6.s\n"
+ ".inst 0x64a6e47c // fmmla z28.s, z3.s, z6.s\n"
+ "ld1w z6.s, p0/z, [%[b_ptr], #2, MUL VL]\n"
+ ".inst 0x64a7e40b // fmmla z11.s, z0.s, z7.s\n"
+ ".inst 0x64a7e431 // fmmla z17.s, z1.s, z7.s\n"
+ ".inst 0x64a7e457 // fmmla z23.s, z2.s, z7.s\n"
+ ".inst 0x64a7e47d // fmmla z29.s, z3.s, z7.s\n"
+ "ld1w z7.s, p0/z, [%[b_ptr], #3, MUL VL]\n"
+ ".inst 0x64a4e40c // fmmla z12.s, z0.s, z4.s\n"
+ ".inst 0x64a4e432 // fmmla z18.s, z1.s, z4.s\n"
+ ".inst 0x64a4e458 // fmmla z24.s, z2.s, z4.s\n"
+ ".inst 0x64a4e47e // fmmla z30.s, z3.s, z4.s\n"
+ "ld1w z4.s, p0/z, [%[b_ptr], #4, MUL VL]\n"
+ ".inst 0x64a5e40d // fmmla z13.s, z0.s, z5.s\n"
+ "ld1rqw z0.s, p0/z, [%[a_ptr]]\n"
+ ".inst 0x64a5e433 // fmmla z19.s, z1.s, z5.s\n"
+ "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n"
+ ".inst 0x64a5e459 // fmmla z25.s, z2.s, z5.s\n"
+ "ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n"
+ ".inst 0x64a5e47f // fmmla z31.s, z3.s, z5.s\n"
+ "ld1w z5.s, p0/z, [%[b_ptr], #5, MUL VL]\n"
+ ".inst 0x64a6e408 // fmmla z8.s, z0.s, z6.s\n"
+ "ld1rqw z3.s, p0/z, [%[a_ptr], #0x30]\n"
+ ".inst 0x64a6e42e // fmmla z14.s, z1.s, z6.s\n"
+ "add %[a_ptr], %[a_ptr], #0x80\n"
+ ".inst 0x64a6e454 // fmmla z20.s, z2.s, z6.s\n"
+ "addvl %[b_ptr], %[b_ptr], #12\n"
+ ".inst 0x64a6e47a // fmmla z26.s, z3.s, z6.s\n"
+ ".inst 0x64a7e409 // fmmla z9.s, z0.s, z7.s\n"
+ ".inst 0x64a7e42f // fmmla z15.s, z1.s, z7.s\n"
+ "ld1w z6.s, p0/z, [%[b_ptr], #-6, MUL VL]\n"
+ ".inst 0x64a7e455 // fmmla z21.s, z2.s, z7.s\n"
+ ".inst 0x64a7e47b // fmmla z27.s, z3.s, z7.s\n"
+ "ld1w z7.s, p0/z, [%[b_ptr], #-5, MUL VL]\n"
+ ".inst 0x64a4e40a // fmmla z10.s, z0.s, z4.s\n"
+ ".inst 0x64a4e430 // fmmla z16.s, z1.s, z4.s\n"
+ ".inst 0x64a4e456 // fmmla z22.s, z2.s, z4.s\n"
+ ".inst 0x64a4e47c // fmmla z28.s, z3.s, z4.s\n"
+ "ld1w z4.s, p0/z, [%[b_ptr], #-4, MUL VL]\n"
+ ".inst 0x64a5e40b // fmmla z11.s, z0.s, z5.s\n"
+ ".inst 0x64a5e431 // fmmla z17.s, z1.s, z5.s\n"
+ ".inst 0x64a5e457 // fmmla z23.s, z2.s, z5.s\n"
+ ".inst 0x64a5e47d // fmmla z29.s, z3.s, z5.s\n"
+ "ld1w z5.s, p0/z, [%[b_ptr], #-3, MUL VL]\n"
+ ".inst 0x64a6e40c // fmmla z12.s, z0.s, z6.s\n"
+ ".inst 0x64a6e432 // fmmla z18.s, z1.s, z6.s\n"
+ ".inst 0x64a6e458 // fmmla z24.s, z2.s, z6.s\n"
+ ".inst 0x64a6e47e // fmmla z30.s, z3.s, z6.s\n"
+ "ld1w z6.s, p0/z, [%[b_ptr], #-2, MUL VL]\n"
+ ".inst 0x64a7e40d // fmmla z13.s, z0.s, z7.s\n"
+ "ld1rqw z0.s, p0/z, [%[a_ptr], #-0x40]\n"
+ ".inst 0x64a7e433 // fmmla z19.s, z1.s, z7.s\n"
+ "ld1rqw z1.s, p0/z, [%[a_ptr], #-0x30]\n"
+ ".inst 0x64a7e459 // fmmla z25.s, z2.s, z7.s\n"
+ "ld1rqw z2.s, p0/z, [%[a_ptr], #-0x20]\n"
+ ".inst 0x64a7e47f // fmmla z31.s, z3.s, z7.s\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[tails], 3f\n"
+ ".inst 0x64a4e408 // fmmla z8.s, z0.s, z4.s\n"
+ "ld1w z7.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+ ".inst 0x64a4e42e // fmmla z14.s, z1.s, z4.s\n"
+ "ld1rqw z3.s, p0/z, [%[a_ptr], #-0x10]\n"
+ ".inst 0x64a4e454 // fmmla z20.s, z2.s, z4.s\n"
+ ".inst 0x64a5e409 // fmmla z9.s, z0.s, z5.s\n"
+ ".inst 0x64a5e42f // fmmla z15.s, z1.s, z5.s\n"
+ ".inst 0x64a4e47a // fmmla z26.s, z3.s, z4.s\n"
+ "ld1w z4.s, p0/z, [%[b_ptr]]\n"
+ ".inst 0x64a5e455 // fmmla z21.s, z2.s, z5.s\n"
+ ".inst 0x64a5e47b // fmmla z27.s, z3.s, z5.s\n"
+ "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n"
+ ".inst 0x64a6e40a // fmmla z10.s, z0.s, z6.s\n"
+ ".inst 0x64a6e430 // fmmla z16.s, z1.s, z6.s\n"
+ ".inst 0x64a6e456 // fmmla z22.s, z2.s, z6.s\n"
+ ".inst 0x64a6e47c // fmmla z28.s, z3.s, z6.s\n"
+ "ld1w z6.s, p0/z, [%[b_ptr], #2, MUL VL]\n"
+ ".inst 0x64a7e40b // fmmla z11.s, z0.s, z7.s\n"
+ ".inst 0x64a7e431 // fmmla z17.s, z1.s, z7.s\n"
+ ".inst 0x64a7e457 // fmmla z23.s, z2.s, z7.s\n"
+ ".inst 0x64a7e47d // fmmla z29.s, z3.s, z7.s\n"
+ "ld1w z7.s, p0/z, [%[b_ptr], #3, MUL VL]\n"
+ ".inst 0x64a4e40c // fmmla z12.s, z0.s, z4.s\n"
+ ".inst 0x64a4e432 // fmmla z18.s, z1.s, z4.s\n"
+ ".inst 0x64a4e458 // fmmla z24.s, z2.s, z4.s\n"
+ ".inst 0x64a4e47e // fmmla z30.s, z3.s, z4.s\n"
+ "ld1w z4.s, p0/z, [%[b_ptr], #4, MUL VL]\n"
+ ".inst 0x64a5e40d // fmmla z13.s, z0.s, z5.s\n"
+ "ld1rqw z0.s, p0/z, [%[a_ptr]]\n"
+ ".inst 0x64a5e433 // fmmla z19.s, z1.s, z5.s\n"
+ "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n"
+ ".inst 0x64a5e459 // fmmla z25.s, z2.s, z5.s\n"
+ "ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n"
+ ".inst 0x64a5e47f // fmmla z31.s, z3.s, z5.s\n"
+ "ld1w z5.s, p0/z, [%[b_ptr], #5, MUL VL]\n"
+ ".inst 0x64a6e408 // fmmla z8.s, z0.s, z6.s\n"
+ "ld1rqw z3.s, p0/z, [%[a_ptr], #0x30]\n"
+ ".inst 0x64a6e42e // fmmla z14.s, z1.s, z6.s\n"
+ "add %[a_ptr], %[a_ptr], #0x80\n"
+ ".inst 0x64a6e454 // fmmla z20.s, z2.s, z6.s\n"
+ "addvl %[b_ptr], %[b_ptr], #14\n"
+ ".inst 0x64a6e47a // fmmla z26.s, z3.s, z6.s\n"
+ ".inst 0x64a7e409 // fmmla z9.s, z0.s, z7.s\n"
+ ".inst 0x64a7e42f // fmmla z15.s, z1.s, z7.s\n"
+ "ld1w z6.s, p0/z, [%[b_ptr], #-8, MUL VL]\n"
+ ".inst 0x64a7e455 // fmmla z21.s, z2.s, z7.s\n"
+ ".inst 0x64a7e47b // fmmla z27.s, z3.s, z7.s\n"
+ "ld1w z7.s, p0/z, [%[b_ptr], #-7, MUL VL]\n"
+ ".inst 0x64a4e40a // fmmla z10.s, z0.s, z4.s\n"
+ ".inst 0x64a4e430 // fmmla z16.s, z1.s, z4.s\n"
+ ".inst 0x64a4e456 // fmmla z22.s, z2.s, z4.s\n"
+ ".inst 0x64a4e47c // fmmla z28.s, z3.s, z4.s\n"
+ "ld1w z4.s, p0/z, [%[b_ptr], #-6, MUL VL]\n"
+ ".inst 0x64a5e40b // fmmla z11.s, z0.s, z5.s\n"
+ ".inst 0x64a5e431 // fmmla z17.s, z1.s, z5.s\n"
+ ".inst 0x64a5e457 // fmmla z23.s, z2.s, z5.s\n"
+ ".inst 0x64a5e47d // fmmla z29.s, z3.s, z5.s\n"
+ "ld1w z5.s, p0/z, [%[b_ptr], #-5, MUL VL]\n"
+ ".inst 0x64a6e40c // fmmla z12.s, z0.s, z6.s\n"
+ ".inst 0x64a6e432 // fmmla z18.s, z1.s, z6.s\n"
+ ".inst 0x64a6e458 // fmmla z24.s, z2.s, z6.s\n"
+ ".inst 0x64a6e47e // fmmla z30.s, z3.s, z6.s\n"
+ "ld1w z6.s, p0/z, [%[b_ptr], #-4, MUL VL]\n"
+ ".inst 0x64a7e40d // fmmla z13.s, z0.s, z7.s\n"
+ "ld1rqw z0.s, p0/z, [%[a_ptr], #-0x40]\n"
+ ".inst 0x64a7e433 // fmmla z19.s, z1.s, z7.s\n"
+ "ld1rqw z1.s, p0/z, [%[a_ptr], #-0x30]\n"
+ ".inst 0x64a7e459 // fmmla z25.s, z2.s, z7.s\n"
+ "ld1rqw z2.s, p0/z, [%[a_ptr], #-0x20]\n"
+ ".inst 0x64a7e47f // fmmla z31.s, z3.s, z7.s\n"
+ "ld1w z7.s, p0/z, [%[b_ptr], #-3, MUL VL]\n"
+ ".inst 0x64a4e408 // fmmla z8.s, z0.s, z4.s\n"
+ "ld1rqw z3.s, p0/z, [%[a_ptr], #-0x10]\n"
+ ".inst 0x64a4e42e // fmmla z14.s, z1.s, z4.s\n"
+ ".inst 0x64a4e454 // fmmla z20.s, z2.s, z4.s\n"
+ ".inst 0x64a5e409 // fmmla z9.s, z0.s, z5.s\n"
+ ".inst 0x64a4e47a // fmmla z26.s, z3.s, z4.s\n"
+ "ld1w z4.s, p0/z, [%[b_ptr], #-2, MUL VL]\n"
+ ".inst 0x64a5e42f // fmmla z15.s, z1.s, z5.s\n"
+ ".inst 0x64a5e455 // fmmla z21.s, z2.s, z5.s\n"
+ ".inst 0x64a5e47b // fmmla z27.s, z3.s, z5.s\n"
+ "ld1w z5.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+ ".inst 0x64a6e40a // fmmla z10.s, z0.s, z6.s\n"
+ ".inst 0x64a6e430 // fmmla z16.s, z1.s, z6.s\n"
+ ".inst 0x64a6e456 // fmmla z22.s, z2.s, z6.s\n"
+ ".inst 0x64a6e47c // fmmla z28.s, z3.s, z6.s\n"
+ "uzp1 z6.d, z14.d, z15.d\n"
+ ".inst 0x64a7e40b // fmmla z11.s, z0.s, z7.s\n"
+ ".inst 0x64a7e431 // fmmla z17.s, z1.s, z7.s\n"
+ ".inst 0x64a7e457 // fmmla z23.s, z2.s, z7.s\n"
+ ".inst 0x64a7e47d // fmmla z29.s, z3.s, z7.s\n"
+ ".inst 0x64a4e40c // fmmla z12.s, z0.s, z4.s\n"
+ "uzp1 z7.d, z16.d, z17.d\n"
+ ".inst 0x64a4e432 // fmmla z18.s, z1.s, z4.s\n"
+ ".inst 0x64a4e458 // fmmla z24.s, z2.s, z4.s\n"
+ ".inst 0x64a4e47e // fmmla z30.s, z3.s, z4.s\n"
+ "uzp2 z4.d, z10.d, z11.d\n"
+ ".inst 0x64a5e40d // fmmla z13.s, z0.s, z5.s\n"
+ "uzp1 z0.d, z8.d, z9.d\n"
+ ".inst 0x64a5e433 // fmmla z19.s, z1.s, z5.s\n"
+ "uzp1 z1.d, z10.d, z11.d\n"
+ ".inst 0x64a5e459 // fmmla z25.s, z2.s, z5.s\n"
+ "st1w z0.s, p0, [%[c_ptr]]\n"
+ "uzp1 z2.d, z12.d, z13.d\n"
+ "uzp1 z0.d, z18.d, z19.d\n"
+ ".inst 0x64a5e47f // fmmla z31.s, z3.s, z5.s\n"
+ "st1w z1.s, p0, [%[c_ptr], #1, MUL VL]\n"
+ "uzp2 z3.d, z8.d, z9.d\n"
+ "uzp2 z5.d, z12.d, z13.d\n"
+ "uzp2 z1.d, z14.d, z15.d\n"
+ "st1w z2.s, p0, [%[c_ptr], #2, MUL VL]\n"
+ "b 4f\n"
+ "3:\n"
+ ".inst 0x64a4e408 // fmmla z8.s, z0.s, z4.s\n"
+ "ld1w z7.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+ ".inst 0x64a4e42e // fmmla z14.s, z1.s, z4.s\n"
+ "ld1rqw z3.s, p0/z, [%[a_ptr], #-0x10]\n"
+ ".inst 0x64a4e454 // fmmla z20.s, z2.s, z4.s\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
+ ".inst 0x64a5e409 // fmmla z9.s, z0.s, z5.s\n"
+ "addvl %[b_ptr], %[b_ptr], #8\n"
+ ".inst 0x64a4e47a // fmmla z26.s, z3.s, z4.s\n"
+ ".inst 0x64a5e42f // fmmla z15.s, z1.s, z5.s\n"
+ ".inst 0x64a5e455 // fmmla z21.s, z2.s, z5.s\n"
+ "ld1w z4.s, p0/z, [%[b_ptr], #-8, MUL VL]\n"
+ ".inst 0x64a5e47b // fmmla z27.s, z3.s, z5.s\n"
+ "ld1w z5.s, p0/z, [%[b_ptr], #-7, MUL VL]\n"
+ ".inst 0x64a6e40a // fmmla z10.s, z0.s, z6.s\n"
+ ".inst 0x64a6e430 // fmmla z16.s, z1.s, z6.s\n"
+ ".inst 0x64a6e456 // fmmla z22.s, z2.s, z6.s\n"
+ ".inst 0x64a6e47c // fmmla z28.s, z3.s, z6.s\n"
+ "ld1w z6.s, p0/z, [%[b_ptr], #-6, MUL VL]\n"
+ ".inst 0x64a7e40b // fmmla z11.s, z0.s, z7.s\n"
+ ".inst 0x64a7e431 // fmmla z17.s, z1.s, z7.s\n"
+ ".inst 0x64a7e457 // fmmla z23.s, z2.s, z7.s\n"
+ ".inst 0x64a7e47d // fmmla z29.s, z3.s, z7.s\n"
+ "ld1w z7.s, p0/z, [%[b_ptr], #-5, MUL VL]\n"
+ ".inst 0x64a4e40c // fmmla z12.s, z0.s, z4.s\n"
+ ".inst 0x64a4e432 // fmmla z18.s, z1.s, z4.s\n"
+ ".inst 0x64a4e458 // fmmla z24.s, z2.s, z4.s\n"
+ ".inst 0x64a4e47e // fmmla z30.s, z3.s, z4.s\n"
+ "ld1w z4.s, p0/z, [%[b_ptr], #-4, MUL VL]\n"
+ ".inst 0x64a5e40d // fmmla z13.s, z0.s, z5.s\n"
+ "ld1rqw z0.s, p0/z, [%[a_ptr], #-0x40]\n"
+ ".inst 0x64a5e433 // fmmla z19.s, z1.s, z5.s\n"
+ "ld1rqw z1.s, p0/z, [%[a_ptr], #-0x30]\n"
+ ".inst 0x64a5e459 // fmmla z25.s, z2.s, z5.s\n"
+ "ld1rqw z2.s, p0/z, [%[a_ptr], #-0x20]\n"
+ ".inst 0x64a5e47f // fmmla z31.s, z3.s, z5.s\n"
+ "ld1w z5.s, p0/z, [%[b_ptr], #-3, MUL VL]\n"
+ ".inst 0x64a6e408 // fmmla z8.s, z0.s, z6.s\n"
+ "ld1rqw z3.s, p0/z, [%[a_ptr], #-0x10]\n"
+ ".inst 0x64a6e42e // fmmla z14.s, z1.s, z6.s\n"
+ ".inst 0x64a6e454 // fmmla z20.s, z2.s, z6.s\n"
+ ".inst 0x64a7e409 // fmmla z9.s, z0.s, z7.s\n"
+ ".inst 0x64a6e47a // fmmla z26.s, z3.s, z6.s\n"
+ "ld1w z6.s, p0/z, [%[b_ptr], #-2, MUL VL]\n"
+ ".inst 0x64a7e42f // fmmla z15.s, z1.s, z7.s\n"
+ ".inst 0x64a7e455 // fmmla z21.s, z2.s, z7.s\n"
+ ".inst 0x64a7e47b // fmmla z27.s, z3.s, z7.s\n"
+ "ld1w z7.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+ ".inst 0x64a4e40a // fmmla z10.s, z0.s, z4.s\n"
+ ".inst 0x64a4e430 // fmmla z16.s, z1.s, z4.s\n"
+ ".inst 0x64a4e456 // fmmla z22.s, z2.s, z4.s\n"
+ ".inst 0x64a4e47c // fmmla z28.s, z3.s, z4.s\n"
+ ".inst 0x64a5e40b // fmmla z11.s, z0.s, z5.s\n"
+ ".inst 0x64a5e431 // fmmla z17.s, z1.s, z5.s\n"
+ ".inst 0x64a5e457 // fmmla z23.s, z2.s, z5.s\n"
+ ".inst 0x64a5e47d // fmmla z29.s, z3.s, z5.s\n"
+ "uzp2 z4.d, z10.d, z11.d\n"
+ ".inst 0x64a6e40c // fmmla z12.s, z0.s, z6.s\n"
+ ".inst 0x64a6e432 // fmmla z18.s, z1.s, z6.s\n"
+ ".inst 0x64a6e458 // fmmla z24.s, z2.s, z6.s\n"
+ ".inst 0x64a6e47e // fmmla z30.s, z3.s, z6.s\n"
+ "uzp1 z6.d, z14.d, z15.d\n"
+ ".inst 0x64a7e40d // fmmla z13.s, z0.s, z7.s\n"
+ "uzp1 z0.d, z8.d, z9.d\n"
+ ".inst 0x64a7e433 // fmmla z19.s, z1.s, z7.s\n"
+ "uzp1 z1.d, z10.d, z11.d\n"
+ "uzp2 z5.d, z12.d, z13.d\n"
+ "st1w z0.s, p0, [%[c_ptr]]\n"
+ ".inst 0x64a7e459 // fmmla z25.s, z2.s, z7.s\n"
+ "uzp1 z2.d, z12.d, z13.d\n"
+ "uzp1 z0.d, z18.d, z19.d\n"
+ "st1w z1.s, p0, [%[c_ptr], #1, MUL VL]\n"
+ "uzp2 z1.d, z14.d, z15.d\n"
+ ".inst 0x64a7e47f // fmmla z31.s, z3.s, z7.s\n"
+ "uzp2 z3.d, z8.d, z9.d\n"
+ "st1w z2.s, p0, [%[c_ptr], #2, MUL VL]\n"
+ "uzp1 z7.d, z16.d, z17.d\n"
+ "4:\n"
+ "uzp2 z2.d, z16.d, z17.d\n"
+ "st1w z3.s, p0, [%[c_ptr], #3, MUL VL]\n"
+ "uzp2 z3.d, z18.d, z19.d\n"
+ "st1w z4.s, p0, [%[c_ptr], #4, MUL VL]\n"
+ "uzp1 z4.d, z20.d, z21.d\n"
+ "st1w z5.s, p0, [%[c_ptr], #5, MUL VL]\n"
+ "uzp1 z5.d, z22.d, z23.d\n"
+ "st1w z6.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "uzp1 z6.d, z24.d, z25.d\n"
+ "st1w z7.s, p0, [%[c_ptr], #7, MUL VL]\n"
+ "addvl %[c_ptr], %[c_ptr], #16\n"
+ "uzp2 z7.d, z20.d, z21.d\n"
+ "st1w z0.s, p0, [%[c_ptr], #-8, MUL VL]\n"
+ "uzp2 z0.d, z22.d, z23.d\n"
+ "st1w z1.s, p0, [%[c_ptr], #-7, MUL VL]\n"
+ "uzp2 z1.d, z24.d, z25.d\n"
+ "st1w z2.s, p0, [%[c_ptr], #-6, MUL VL]\n"
+ "uzp1 z2.d, z26.d, z27.d\n"
+ "st1w z3.s, p0, [%[c_ptr], #-5, MUL VL]\n"
+ "uzp1 z3.d, z28.d, z29.d\n"
+ "st1w z4.s, p0, [%[c_ptr], #-4, MUL VL]\n"
+ "uzp1 z4.d, z30.d, z31.d\n"
+ "st1w z5.s, p0, [%[c_ptr], #-3, MUL VL]\n"
+ "uzp2 z5.d, z26.d, z27.d\n"
+ "st1w z6.s, p0, [%[c_ptr], #-2, MUL VL]\n"
+ "uzp2 z6.d, z28.d, z29.d\n"
+ "st1w z7.s, p0, [%[c_ptr], #-1, MUL VL]\n"
+ "uzp2 z7.d, z30.d, z31.d\n"
+ "st1w z0.s, p0, [%[c_ptr]]\n"
+ "st1w z1.s, p0, [%[c_ptr], #1, MUL VL]\n"
+ "st1w z2.s, p0, [%[c_ptr], #2, MUL VL]\n"
+ "st1w z3.s, p0, [%[c_ptr], #3, MUL VL]\n"
+ "st1w z4.s, p0, [%[c_ptr], #4, MUL VL]\n"
+ "st1w z5.s, p0, [%[c_ptr], #5, MUL VL]\n"
+ "st1w z6.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1w z7.s, p0, [%[c_ptr], #7, MUL VL]\n"
+ "addvl %[c_ptr], %[c_ptr], #8\n"
+ : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+ [loops] "+r" (loops), [tails] "+r" (tails)
+ :
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
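
Illustrative sketch (not part of the patch): the prologue of the kernel above splits K into 2-element blocks and peels the final pair of blocks out of the software-pipelined main loop, handling them (plus any odd tail block) in the epilogue. The snippet below mirrors only that bookkeeping, assuming K has already been rounded up to the kernel's k_unroll of 2 as the interleaved driver is expected to do.

#include <cassert>
#include <cstdio>

int main() {
    int K = 12;                      // example K, assumed to be a multiple of k_unroll()==2
    assert(K % 2 == 0);

    K /= 2;                          // number of 2-element K blocks
    const long loops = (K / 2) - 1;  // main-loop passes of 2 blocks; the last pass is peeled into the epilogue
    const long tails = K % 2;        // 1 if a single odd block remains after the pairs, else 0

    std::printf("k-blocks=%d main-loop passes=%ld tail block=%ld\n", K, loops, tails);
    return 0;
}
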
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp
index effdbc63c9..e40ba215b4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,7 +61,10 @@ public:
kern_type kernel=sve_interleaved_s8s32_dot_3VLx8;
- interleaved_s8s32_dot_3VLx8(const CPUInfo *ci) { UNUSED(ci); }
+ interleaved_s8s32_dot_3VLx8(const CPUInfo *)
+ {
+
+ }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp
index 7640fcaa20..cdc70705c5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -51,20 +51,20 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel,
"mov z9.s, #0\n"
"mov z10.s, #0\n"
"mov z11.s, #0\n"
- "mov z12.s, #0\n"
"ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
- "mov z13.s, #0\n"
+ "mov z12.s, #0\n"
"ld1b z4.b, p0/z, [%[b_ptr]]\n"
- "mov z14.s, #0\n"
+ "mov z13.s, #0\n"
"ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
- "mov z15.s, #0\n"
+ "mov z14.s, #0\n"
"ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "mov z16.s, #0\n"
+ "mov z15.s, #0\n"
"ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
- "mov z17.s, #0\n"
+ "mov z16.s, #0\n"
"add %[a_ptr], %[a_ptr], #0x40\n"
- "mov z18.s, #0\n"
+ "mov z17.s, #0\n"
"addvl %[b_ptr], %[b_ptr], #3\n"
+ "mov z18.s, #0\n"
"mov z19.s, #0\n"
"mov z20.s, #0\n"
"mov z21.s, #0\n"
@@ -208,8 +208,8 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel,
"sdot z9.s, z4.b, z0.b[1]\n"
"sdot z10.s, z4.b, z0.b[2]\n"
"sdot z11.s, z4.b, z0.b[3]\n"
- "sdot z20.s, z4.b, z1.b[0]\n"
"st1w z8.s, p0, [%[c_ptr]]\n"
+ "sdot z20.s, z4.b, z1.b[0]\n"
"sdot z21.s, z4.b, z1.b[1]\n"
"sdot z22.s, z4.b, z1.b[2]\n"
"sdot z23.s, z4.b, z1.b[3]\n"
@@ -217,8 +217,8 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel,
"sdot z13.s, z5.b, z0.b[1]\n"
"sdot z14.s, z5.b, z0.b[2]\n"
"sdot z15.s, z5.b, z0.b[3]\n"
- "sdot z24.s, z5.b, z1.b[0]\n"
"st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
+ "sdot z24.s, z5.b, z1.b[0]\n"
"sdot z25.s, z5.b, z1.b[1]\n"
"sdot z26.s, z5.b, z1.b[2]\n"
"sdot z27.s, z5.b, z1.b[3]\n"
@@ -226,10 +226,11 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel,
"sdot z17.s, z6.b, z0.b[1]\n"
"sdot z18.s, z6.b, z0.b[2]\n"
"sdot z19.s, z6.b, z0.b[3]\n"
- "sdot z28.s, z6.b, z1.b[0]\n"
"st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
+ "sdot z28.s, z6.b, z1.b[0]\n"
"sdot z29.s, z6.b, z1.b[1]\n"
"sdot z30.s, z6.b, z1.b[2]\n"
+ "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
"sdot z31.s, z6.b, z1.b[3]\n"
"b 4f\n"
"3:\n"
@@ -267,8 +268,8 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel,
"sdot z9.s, z4.b, z2.b[1]\n"
"sdot z10.s, z4.b, z2.b[2]\n"
"sdot z11.s, z4.b, z2.b[3]\n"
- "sdot z20.s, z4.b, z3.b[0]\n"
"st1w z8.s, p0, [%[c_ptr]]\n"
+ "sdot z20.s, z4.b, z3.b[0]\n"
"sdot z21.s, z4.b, z3.b[1]\n"
"sdot z22.s, z4.b, z3.b[2]\n"
"sdot z23.s, z4.b, z3.b[3]\n"
@@ -276,8 +277,8 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel,
"sdot z13.s, z5.b, z2.b[1]\n"
"sdot z14.s, z5.b, z2.b[2]\n"
"sdot z15.s, z5.b, z2.b[3]\n"
- "sdot z24.s, z5.b, z3.b[0]\n"
"st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
+ "sdot z24.s, z5.b, z3.b[0]\n"
"sdot z25.s, z5.b, z3.b[1]\n"
"sdot z26.s, z5.b, z3.b[2]\n"
"sdot z27.s, z5.b, z3.b[3]\n"
@@ -285,13 +286,13 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel,
"sdot z17.s, z6.b, z2.b[1]\n"
"sdot z18.s, z6.b, z2.b[2]\n"
"sdot z19.s, z6.b, z2.b[3]\n"
- "sdot z28.s, z6.b, z3.b[0]\n"
"st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
+ "sdot z28.s, z6.b, z3.b[0]\n"
"sdot z29.s, z6.b, z3.b[1]\n"
"sdot z30.s, z6.b, z3.b[2]\n"
+ "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
"sdot z31.s, z6.b, z3.b[3]\n"
"4:\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
"st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
"st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
"st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp
index cd50d0ded3..361598d594 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,9 +61,9 @@ public:
kern_type kernel=sve_interleaved_s8s32_mmla_3VLx8;
- interleaved_s8s32_mmla_3VLx8(const CPUInfo *ci)
+ interleaved_s8s32_mmla_3VLx8(const CPUInfo *)
{
- UNUSED(ci);
+
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp
index d636c9d2a4..cde9ec32e9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -63,13 +63,11 @@ void sve_interleaved_s8s32_mmla_3VLx8(const int8_t *Apanel, const int8_t *Bpanel
"mov z16.s, #0\n"
"ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
"mov z17.s, #0\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #0x30]\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
"mov z18.s, #0\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #3, MUL VL]\n"
+ "addvl %[b_ptr], %[b_ptr], #4\n"
"mov z19.s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
"mov z20.s, #0\n"
- "addvl %[b_ptr], %[b_ptr], #4\n"
"mov z21.s, #0\n"
"mov z22.s, #0\n"
"mov z23.s, #0\n"
@@ -84,12 +82,14 @@ void sve_interleaved_s8s32_mmla_3VLx8(const int8_t *Apanel, const int8_t *Bpanel
"cbz %[loops], 1f\n"
"2:\n"
".inst 0x45049808 // smmla z8.s, z0.b, z4.b\n"
- "subs %[loops], %[loops], #0x1\n"
+ "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
".inst 0x4504982e // smmla z14.s, z1.b, z4.b\n"
+ "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
".inst 0x45049854 // smmla z20.s, z2.b, z4.b\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".inst 0x45059809 // smmla z9.s, z0.b, z5.b\n"
".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n"
"ld1b z4.b, p0/z, [%[b_ptr]]\n"
- ".inst 0x45059809 // smmla z9.s, z0.b, z5.b\n"
".inst 0x4505982f // smmla z15.s, z1.b, z5.b\n"
".inst 0x45059855 // smmla z21.s, z2.b, z5.b\n"
".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n"
@@ -152,18 +152,18 @@ void sve_interleaved_s8s32_mmla_3VLx8(const int8_t *Apanel, const int8_t *Bpanel
".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n"
"ld1rqb z2.b, p0/z, [%[a_ptr], #-0x20]\n"
".inst 0x4507987f // smmla z31.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
"b.ne 2b\n"
"1:\n"
"cbz %[tails], 3f\n"
".inst 0x45049808 // smmla z8.s, z0.b, z4.b\n"
+ "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
".inst 0x4504982e // smmla z14.s, z1.b, z4.b\n"
+ "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
".inst 0x45049854 // smmla z20.s, z2.b, z4.b\n"
- ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
".inst 0x45059809 // smmla z9.s, z0.b, z5.b\n"
".inst 0x4505982f // smmla z15.s, z1.b, z5.b\n"
+ ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n"
+ "ld1b z4.b, p0/z, [%[b_ptr]]\n"
".inst 0x45059855 // smmla z21.s, z2.b, z5.b\n"
".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n"
"ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
@@ -269,15 +269,17 @@ void sve_interleaved_s8s32_mmla_3VLx8(const int8_t *Apanel, const int8_t *Bpanel
"b 4f\n"
"3:\n"
".inst 0x45049808 // smmla z8.s, z0.b, z4.b\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
+ "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
".inst 0x4504982e // smmla z14.s, z1.b, z4.b\n"
- "addvl %[b_ptr], %[b_ptr], #8\n"
+ "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
".inst 0x45049854 // smmla z20.s, z2.b, z4.b\n"
- ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
".inst 0x45059809 // smmla z9.s, z0.b, z5.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-8, MUL VL]\n"
+ "addvl %[b_ptr], %[b_ptr], #8\n"
+ ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n"
".inst 0x4505982f // smmla z15.s, z1.b, z5.b\n"
".inst 0x45059855 // smmla z21.s, z2.b, z5.b\n"
+ "ld1b z4.b, p0/z, [%[b_ptr], #-8, MUL VL]\n"
".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n"
"ld1b z5.b, p0/z, [%[b_ptr], #-7, MUL VL]\n"
".inst 0x4506980a // smmla z10.s, z0.b, z6.b\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp
index d3c8851154..252f38ec63 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,7 +61,10 @@ public:
kern_type kernel=sve_interleaved_u8u32_dot_3VLx8;
- interleaved_u8u32_dot_3VLx8(const CPUInfo *ci) { UNUSED(ci); }
+ interleaved_u8u32_dot_3VLx8(const CPUInfo *)
+ {
+
+ }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp
index f4d33a9efa..6626f8463b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -51,20 +51,20 @@ void sve_interleaved_u8u32_dot_3VLx8(const uint8_t *Apanel, const uint8_t *Bpane
"mov z9.s, #0\n"
"mov z10.s, #0\n"
"mov z11.s, #0\n"
- "mov z12.s, #0\n"
"ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
- "mov z13.s, #0\n"
+ "mov z12.s, #0\n"
"ld1b z4.b, p0/z, [%[b_ptr]]\n"
- "mov z14.s, #0\n"
+ "mov z13.s, #0\n"
"ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
- "mov z15.s, #0\n"
+ "mov z14.s, #0\n"
"ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "mov z16.s, #0\n"
+ "mov z15.s, #0\n"
"ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
- "mov z17.s, #0\n"
+ "mov z16.s, #0\n"
"add %[a_ptr], %[a_ptr], #0x40\n"
- "mov z18.s, #0\n"
+ "mov z17.s, #0\n"
"addvl %[b_ptr], %[b_ptr], #3\n"
+ "mov z18.s, #0\n"
"mov z19.s, #0\n"
"mov z20.s, #0\n"
"mov z21.s, #0\n"
@@ -208,8 +208,8 @@ void sve_interleaved_u8u32_dot_3VLx8(const uint8_t *Apanel, const uint8_t *Bpane
"udot z9.s, z4.b, z0.b[1]\n"
"udot z10.s, z4.b, z0.b[2]\n"
"udot z11.s, z4.b, z0.b[3]\n"
- "udot z20.s, z4.b, z1.b[0]\n"
"st1w z8.s, p0, [%[c_ptr]]\n"
+ "udot z20.s, z4.b, z1.b[0]\n"
"udot z21.s, z4.b, z1.b[1]\n"
"udot z22.s, z4.b, z1.b[2]\n"
"udot z23.s, z4.b, z1.b[3]\n"
@@ -217,8 +217,8 @@ void sve_interleaved_u8u32_dot_3VLx8(const uint8_t *Apanel, const uint8_t *Bpane
"udot z13.s, z5.b, z0.b[1]\n"
"udot z14.s, z5.b, z0.b[2]\n"
"udot z15.s, z5.b, z0.b[3]\n"
- "udot z24.s, z5.b, z1.b[0]\n"
"st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
+ "udot z24.s, z5.b, z1.b[0]\n"
"udot z25.s, z5.b, z1.b[1]\n"
"udot z26.s, z5.b, z1.b[2]\n"
"udot z27.s, z5.b, z1.b[3]\n"
@@ -226,10 +226,11 @@ void sve_interleaved_u8u32_dot_3VLx8(const uint8_t *Apanel, const uint8_t *Bpane
"udot z17.s, z6.b, z0.b[1]\n"
"udot z18.s, z6.b, z0.b[2]\n"
"udot z19.s, z6.b, z0.b[3]\n"
- "udot z28.s, z6.b, z1.b[0]\n"
"st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
+ "udot z28.s, z6.b, z1.b[0]\n"
"udot z29.s, z6.b, z1.b[1]\n"
"udot z30.s, z6.b, z1.b[2]\n"
+ "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
"udot z31.s, z6.b, z1.b[3]\n"
"b 4f\n"
"3:\n"
@@ -267,8 +268,8 @@ void sve_interleaved_u8u32_dot_3VLx8(const uint8_t *Apanel, const uint8_t *Bpane
"udot z9.s, z4.b, z2.b[1]\n"
"udot z10.s, z4.b, z2.b[2]\n"
"udot z11.s, z4.b, z2.b[3]\n"
- "udot z20.s, z4.b, z3.b[0]\n"
"st1w z8.s, p0, [%[c_ptr]]\n"
+ "udot z20.s, z4.b, z3.b[0]\n"
"udot z21.s, z4.b, z3.b[1]\n"
"udot z22.s, z4.b, z3.b[2]\n"
"udot z23.s, z4.b, z3.b[3]\n"
@@ -276,8 +277,8 @@ void sve_interleaved_u8u32_dot_3VLx8(const uint8_t *Apanel, const uint8_t *Bpane
"udot z13.s, z5.b, z2.b[1]\n"
"udot z14.s, z5.b, z2.b[2]\n"
"udot z15.s, z5.b, z2.b[3]\n"
- "udot z24.s, z5.b, z3.b[0]\n"
"st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
+ "udot z24.s, z5.b, z3.b[0]\n"
"udot z25.s, z5.b, z3.b[1]\n"
"udot z26.s, z5.b, z3.b[2]\n"
"udot z27.s, z5.b, z3.b[3]\n"
@@ -285,13 +286,13 @@ void sve_interleaved_u8u32_dot_3VLx8(const uint8_t *Apanel, const uint8_t *Bpane
"udot z17.s, z6.b, z2.b[1]\n"
"udot z18.s, z6.b, z2.b[2]\n"
"udot z19.s, z6.b, z2.b[3]\n"
- "udot z28.s, z6.b, z3.b[0]\n"
"st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
+ "udot z28.s, z6.b, z3.b[0]\n"
"udot z29.s, z6.b, z3.b[1]\n"
"udot z30.s, z6.b, z3.b[2]\n"
+ "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
"udot z31.s, z6.b, z3.b[3]\n"
"4:\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
"st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
"st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
"st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp
index 9b5ca1049e..ed44a9d8fc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,9 +61,9 @@ public:
kern_type kernel=sve_interleaved_u8u32_mmla_3VLx8;
- interleaved_u8u32_mmla_3VLx8(const CPUInfo *ci)
+ interleaved_u8u32_mmla_3VLx8(const CPUInfo *)
{
- UNUSED(ci);
+
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp
index 15cc8fb897..81a1dbcf51 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -63,13 +63,11 @@ void sve_interleaved_u8u32_mmla_3VLx8(const uint8_t *Apanel, const uint8_t *Bpan
"mov z16.s, #0\n"
"ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
"mov z17.s, #0\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #0x30]\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
"mov z18.s, #0\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #3, MUL VL]\n"
+ "addvl %[b_ptr], %[b_ptr], #4\n"
"mov z19.s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
"mov z20.s, #0\n"
- "addvl %[b_ptr], %[b_ptr], #4\n"
"mov z21.s, #0\n"
"mov z22.s, #0\n"
"mov z23.s, #0\n"
@@ -84,12 +82,14 @@ void sve_interleaved_u8u32_mmla_3VLx8(const uint8_t *Apanel, const uint8_t *Bpan
"cbz %[loops], 1f\n"
"2:\n"
".inst 0x45c49808 // ummla z8.s, z0.b, z4.b\n"
- "subs %[loops], %[loops], #0x1\n"
+ "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
".inst 0x45c4982e // ummla z14.s, z1.b, z4.b\n"
+ "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
".inst 0x45c49854 // ummla z20.s, z2.b, z4.b\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".inst 0x45c59809 // ummla z9.s, z0.b, z5.b\n"
".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n"
"ld1b z4.b, p0/z, [%[b_ptr]]\n"
- ".inst 0x45c59809 // ummla z9.s, z0.b, z5.b\n"
".inst 0x45c5982f // ummla z15.s, z1.b, z5.b\n"
".inst 0x45c59855 // ummla z21.s, z2.b, z5.b\n"
".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n"
@@ -152,18 +152,18 @@ void sve_interleaved_u8u32_mmla_3VLx8(const uint8_t *Apanel, const uint8_t *Bpan
".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n"
"ld1rqb z2.b, p0/z, [%[a_ptr], #-0x20]\n"
".inst 0x45c7987f // ummla z31.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
"b.ne 2b\n"
"1:\n"
"cbz %[tails], 3f\n"
".inst 0x45c49808 // ummla z8.s, z0.b, z4.b\n"
+ "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
".inst 0x45c4982e // ummla z14.s, z1.b, z4.b\n"
+ "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
".inst 0x45c49854 // ummla z20.s, z2.b, z4.b\n"
- ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
".inst 0x45c59809 // ummla z9.s, z0.b, z5.b\n"
".inst 0x45c5982f // ummla z15.s, z1.b, z5.b\n"
+ ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n"
+ "ld1b z4.b, p0/z, [%[b_ptr]]\n"
".inst 0x45c59855 // ummla z21.s, z2.b, z5.b\n"
".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n"
"ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
@@ -269,15 +269,17 @@ void sve_interleaved_u8u32_mmla_3VLx8(const uint8_t *Apanel, const uint8_t *Bpan
"b 4f\n"
"3:\n"
".inst 0x45c49808 // ummla z8.s, z0.b, z4.b\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
+ "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
".inst 0x45c4982e // ummla z14.s, z1.b, z4.b\n"
- "addvl %[b_ptr], %[b_ptr], #8\n"
+ "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
".inst 0x45c49854 // ummla z20.s, z2.b, z4.b\n"
- ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
".inst 0x45c59809 // ummla z9.s, z0.b, z5.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-8, MUL VL]\n"
+ "addvl %[b_ptr], %[b_ptr], #8\n"
+ ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n"
".inst 0x45c5982f // ummla z15.s, z1.b, z5.b\n"
".inst 0x45c59855 // ummla z21.s, z2.b, z5.b\n"
+ "ld1b z4.b, p0/z, [%[b_ptr], #-8, MUL VL]\n"
".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n"
"ld1b z5.b, p0/z, [%[b_ptr], #-7, MUL VL]\n"
".inst 0x45c6980a // ummla z10.s, z0.b, z6.b\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_bf16fp32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_bf16fp32_dot_4VLx4.hpp
index 59103d2407..6738809934 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_bf16fp32_dot_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_bf16fp32_dot_4VLx4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -78,7 +78,10 @@ public:
// Default to the generic kernel
kern_type kernel=sve_native_bf16fp32_dot_4VLx4;
- native_bf16fp32_dot_4VLx4(const CPUInfo *ci) { UNUSED(ci); }
+ native_bf16fp32_dot_4VLx4(const CPUInfo *)
+ {
+
+ }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_bf16fp32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_bf16fp32_dot_4VLx4/generic.cpp
index ce1971b2c5..d3bd89b8c5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_bf16fp32_dot_4VLx4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_bf16fp32_dot_4VLx4/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -62,12 +62,23 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B
break;
}
- for (int y=0; y<M; y+=4) {
+ int rows_to_compute;
+
+ for (int y=0; y<M; y+=rows_to_compute) {
const bfloat16 * const a_ptr0_base = A + (y * lda);
const unsigned long ldab = lda * sizeof(bfloat16);
float *c_ptr0 = C + (y * ldc);
+ rows_to_compute = M-y;
+ if (rows_to_compute > 4) {
+ if (rows_to_compute % 4) {
+ rows_to_compute = 4 - 1;
+ } else {
+ rows_to_compute = 4;
+ }
+ }
+
for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) {
const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>()));
long loops = loops_count;
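
Illustrative sketch (not part of the patch): the rows_to_compute logic in the hunk above takes rows of A four at a time, but drops to three when the remaining row count is not a multiple of four, so the tail pass is never left with a single row (unless M itself is 1). A standalone demo of that choice, with an assumed M:

#include <cstdio>

int main() {
    const int M = 7;  // example row count (assumption for the demo)
    int rows_to_compute;
    for (int y = 0; y < M; y += rows_to_compute) {
        rows_to_compute = M - y;
        if (rows_to_compute > 4) {
            // Not an exact multiple of 4 left: take 3 rows this pass so the
            // remainder stays at least 2; otherwise take a full group of 4.
            rows_to_compute = (rows_to_compute % 4) ? 3 : 4;
        }
        std::printf("rows %d..%d (%d rows)\n", y, y + rows_to_compute - 1, rows_to_compute);
    }
    return 0;
}
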
@@ -82,7 +93,7 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B
const unsigned long ldcb = ldc * sizeof(float);
const float *biasptr = bias ? bias+x0 : nullbias;
- switch(M-y) {
+ switch(rows_to_compute) {
case 1:
__asm __volatile (
"whilelt p6.h, %[temp], %[leftovers]\n"
@@ -235,46 +246,46 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B
"b.ne 2b\n"
"1:\n"
"zip1 z12.h, z13.h, z14.h\n"
- "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
"zip2 z13.h, z13.h, z14.h\n"
"cbz %[regs], 3f\n"
".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
"ld1h z9.h, p4/z, [%[b_ptr0]]\n"
"zip1 z14.h, z15.h, z8.h\n"
"ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
"zip2 z15.h, z15.h, z8.h\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
+ "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
+ "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
+ ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z8.h, z9.h, z10.h\n"
"ld1h z13.h, p4/z, [%[b_ptr0]]\n"
"zip2 z9.h, z9.h, z10.h\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
"zip1 z10.h, z11.h, z12.h\n"
- "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.h, z11.h, z12.h\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
"ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
- "zip1 z12.h, z13.h, z14.h\n"
+ ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z12.h, z13.h, z14.h\n"
+ "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
"zip2 z13.h, z13.h, z14.h\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.h, z15.h, z8.h\n"
"zip2 z15.h, z15.h, z8.h\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
"ld1h z10.h, p4/z, [%[b_ptr1]]\n"
".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
@@ -452,42 +463,43 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B
"b 7f\n"
"3:\n"
".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
+ "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+ "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
+ "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+ ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z14.h, z15.h, z8.h\n"
"ld1h z9.h, p4/z, [%[b_ptr0]]\n"
"zip2 z15.h, z15.h, z8.h\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
"ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
+ ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
+ "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
+ "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
+ ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z8.h, z9.h, z10.h\n"
"ld1h z13.h, p4/z, [%[b_ptr0]]\n"
"zip2 z9.h, z9.h, z10.h\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
"zip1 z10.h, z11.h, z12.h\n"
- "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.h, z11.h, z12.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip1 z12.h, z13.h, z14.h\n"
- "zip2 z13.h, z13.h, z14.h\n"
+ "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
"ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
+ "addvl %[a_ptr0], %[a_ptr0], #1\n"
+ "zip1 z12.h, z13.h, z14.h\n"
+ "zip2 z13.h, z13.h, z14.h\n"
"zip1 z14.h, z15.h, z8.h\n"
"zip2 z15.h, z15.h, z8.h\n"
+ ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
+ ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
@@ -666,37 +678,37 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B
"zip2 z11.h, z11.h, z12.h\n"
"ld1h z13.h, p4/z, [%[b_ptr0]]\n"
"ld1h z14.h, p4/z, [%[b_ptr1]]\n"
- "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
"mov z23.d, z19.d\n"
"cbz %[loops], 1f\n"
"2:\n"
"zip1 z12.h, z13.h, z14.h\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
"zip2 z13.h, z13.h, z14.h\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "subs %[loops], %[loops], #0x1\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
"ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[loops], %[loops], #0x1\n"
".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z14.h, z15.h, z8.h\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
"zip2 z15.h, z15.h, z8.h\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
"ld1h z10.h, p4/z, [%[b_ptr1]]\n"
".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
"ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z8.h, z9.h, z10.h\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip2 z9.h, z9.h, z10.h\n"
+ ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
"ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
@@ -820,26 +832,26 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B
"ld1h z14.h, p4/z, [%[b_ptr1]]\n"
".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
- "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
"b.ne 2b\n"
"1:\n"
"zip1 z12.h, z13.h, z14.h\n"
"zip2 z13.h, z13.h, z14.h\n"
"cbz %[regs], 3f\n"
".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
"ld1h z9.h, p4/z, [%[b_ptr0]]\n"
"zip1 z14.h, z15.h, z8.h\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z15.h, z15.h, z8.h\n"
- ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
"ld1h z10.h, p4/z, [%[b_ptr1]]\n"
".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
@@ -1103,28 +1115,29 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B
"b 7f\n"
"3:\n"
".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+ "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
"ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+ "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
+ "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+ ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z14.h, z15.h, z8.h\n"
"ld1h z9.h, p4/z, [%[b_ptr0]]\n"
"zip2 z15.h, z15.h, z8.h\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
"ld1h z10.h, p4/z, [%[b_ptr1]]\n"
".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
+ "addvl %[a_ptr0], %[a_ptr0], #1\n"
".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
"ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
"zip1 z8.h, z9.h, z10.h\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip2 z9.h, z9.h, z10.h\n"
+ "addvl a_ptr1, a_ptr1, #1\n"
".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
"ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
@@ -1386,34 +1399,34 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z27.d, z19.d\n"
"ld1h z13.h, p4/z, [%[b_ptr0]]\n"
- "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1h z14.h, p4/z, [%[b_ptr1]]\n"
"cbz %[loops], 1f\n"
"2:\n"
".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
"zip1 z12.h, z13.h, z14.h\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
"zip2 z13.h, z13.h, z14.h\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- "subs %[loops], %[loops], #0x1\n"
+ "ld1rqh z6.h, p7/z, [a_ptr2]\n"
".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
"ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[loops], %[loops], #0x1\n"
".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z14.h, z15.h, z8.h\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z15.h, z15.h, z8.h\n"
- "add a_ptr1, a_ptr1, #0x20\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
"ld1h z9.h, p4/z, [%[b_ptr0]]\n"
".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
"ld1h z10.h, p4/z, [%[b_ptr1]]\n"
".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
@@ -1576,28 +1589,28 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B
".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n"
- "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
"b.ne 2b\n"
"1:\n"
"zip1 z12.h, z13.h, z14.h\n"
"zip2 z13.h, z13.h, z14.h\n"
"cbz %[regs], 3f\n"
".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
+ "ld1rqh z6.h, p7/z, [a_ptr2]\n"
".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
"ld1h z9.h, p4/z, [%[b_ptr0]]\n"
"zip1 z14.h, z15.h, z8.h\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z15.h, z15.h, z8.h\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
"ld1h z10.h, p4/z, [%[b_ptr1]]\n"
@@ -1922,35 +1935,36 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B
"b 7f\n"
"3:\n"
".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+ "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+ "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
"ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1rqh z6.h, p6/z, [a_ptr2]\n"
+ "ld1rqh z5.h, p6/z, [a_ptr1]\n"
".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
+ "ld1rqh z6.h, p6/z, [a_ptr2]\n"
+ ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z14.h, z15.h, z8.h\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "zip2 z15.h, z15.h, z8.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
"ld1h z9.h, p4/z, [%[b_ptr0]]\n"
+ "zip2 z15.h, z15.h, z8.h\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
+ "addvl %[a_ptr0], %[a_ptr0], #1\n"
".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
+ "addvl a_ptr1, a_ptr1, #1\n"
".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
"ld1h z10.h, p4/z, [%[b_ptr1]]\n"
".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
+ "addvl a_ptr2, a_ptr2, #1\n"
".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
- ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
- "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
"zip1 z8.h, z9.h, z10.h\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip2 z9.h, z9.h, z10.h\n"
+ ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
+ "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
"ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
@@ -2276,7 +2290,6 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z31.d, z19.d\n"
"ld1h z13.h, p4/z, [%[b_ptr0]]\n"
- "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1h z14.h, p4/z, [%[b_ptr1]]\n"
"zip1 z12.h, z13.h, z14.h\n"
@@ -2284,38 +2297,39 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B
"cbz %[loops], 1f\n"
"2:\n"
".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
+ "ld1rqh z6.h, p7/z, [a_ptr2]\n"
".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
+ "ld1rqh z7.h, p7/z, [a_ptr3]\n"
".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- "subs %[loops], %[loops], #0x1\n"
+ "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z14.h, z15.h, z8.h\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "zip2 z15.h, z15.h, z8.h\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n"
"ld1h z9.h, p4/z, [%[b_ptr0]]\n"
+ "zip2 z15.h, z15.h, z8.h\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n"
"ld1h z10.h, p4/z, [%[b_ptr1]]\n"
".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
- ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
"zip1 z8.h, z9.h, z10.h\n"
"zip2 z9.h, z9.h, z10.h\n"
+ ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
".inst 0x6463417f // bfdot z31.s, z11.h, z3.h[0]\n"
"ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
@@ -2503,28 +2517,28 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B
"zip1 z12.h, z13.h, z14.h\n"
"zip2 z13.h, z13.h, z14.h\n"
".inst 0x647f41ff // bfdot z31.s, z15.h, z7.h[3]\n"
- "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
"b.ne 2b\n"
"1:\n"
"cbz %[regs], 3f\n"
".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
+ "ld1rqh z6.h, p7/z, [a_ptr2]\n"
".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
+ "ld1rqh z7.h, p7/z, [a_ptr3]\n"
".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "zip1 z14.h, z15.h, z8.h\n"
- "zip2 z15.h, z15.h, z8.h\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n"
"ld1h z9.h, p4/z, [%[b_ptr0]]\n"
+ "zip1 z14.h, z15.h, z8.h\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z15.h, z15.h, z8.h\n"
".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
@@ -2910,30 +2924,31 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B
"b 7f\n"
"3:\n"
".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+ "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+ "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
- "ld1rqh z6.h, p6/z, [a_ptr2]\n"
+ "ld1rqh z5.h, p6/z, [a_ptr1]\n"
".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n"
"ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1rqh z7.h, p6/z, [a_ptr3]\n"
+ "ld1rqh z6.h, p6/z, [a_ptr2]\n"
".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
+ "ld1rqh z7.h, p6/z, [a_ptr3]\n"
+ ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z14.h, z15.h, z8.h\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z15.h, z15.h, z8.h\n"
"addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n"
"ld1h z9.h, p4/z, [%[b_ptr0]]\n"
".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
+ "addvl a_ptr1, a_ptr1, #1\n"
".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
- "addvl a_ptr3, a_ptr3, #1\n"
+ "addvl a_ptr2, a_ptr2, #1\n"
".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
+ "addvl a_ptr3, a_ptr3, #1\n"
".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n"
"ld1h z10.h, p4/z, [%[b_ptr1]]\n"
".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp
index 741f200d25..665e8656d2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,9 @@
#ifdef __ARM_FEATURE_SVE
+
+
+
namespace arm_gemm
{
@@ -75,7 +78,10 @@ public:
// Default to the generic kernel
kern_type kernel=sve_native_fp16_mla_4VLx4;
- native_fp16_mla_4VLx4(const CPUInfo *ci) { UNUSED(ci); }
+ native_fp16_mla_4VLx4(const CPUInfo *)
+ {
+
+ }
};
} // namespace arm_gemm
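(Note on the constructor change above: because the CPUInfo pointer parameter is now unnamed, it cannot raise an unused-parameter warning, so the previous UNUSED(ci) macro call is simply dropped; behaviour is unchanged. The same pattern repeats in the other kernel headers below.)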
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp
index 14dd38bd25..dd33c785cf 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -60,12 +60,23 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld
break;
}
- for (int y=0; y<M; y+=4) {
+ int rows_to_compute;
+
+ for (int y=0; y<M; y+=rows_to_compute) {
const __fp16 * const a_ptr0_base = A + (y * lda);
const unsigned long ldab = lda * sizeof(__fp16);
__fp16 *c_ptr0 = C + (y * ldc);
+ rows_to_compute = M-y;
+ if (rows_to_compute > 4) {
+ if (rows_to_compute % 4) {
+ rows_to_compute = 4 - 1;
+ } else {
+ rows_to_compute = 4;
+ }
+ }
+
for (int x0=0; x0<N; x0+=(4 * get_vector_length<__fp16>())) {
const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<__fp16>()));
long loops = loops_count;
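(Aside on the hunk above, which recurs in the other native kernels later in this patch: the outer loop no longer advances by a fixed four rows. Below is a minimal standalone sketch of the new row-blocking rule, with an illustrative helper name that is not taken from the patch:

    #include <vector>

    // Mirrors the rows_to_compute logic added above: when more than four rows
    // remain and the remainder is not a multiple of four, peel off a three-row
    // block; otherwise take up to four rows. E.g. M = 10 -> {3, 3, 4}.
    static std::vector<int> row_blocks(int M) {
        std::vector<int> blocks;
        for (int y = 0; y < M; ) {
            int rows_to_compute = M - y;
            if (rows_to_compute > 4) {
                rows_to_compute = (rows_to_compute % 4) ? 3 : 4;
            }
            blocks.push_back(rows_to_compute);
            y += rows_to_compute;   // same step the patched loop takes
        }
        return blocks;
    }

The switch(rows_to_compute) further down then dispatches to the assembly path specialised for that block height.)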
@@ -78,7 +89,7 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld
const unsigned long ldcb = ldc * sizeof(__fp16);
const __fp16 *biasptr = bias ? bias+x0 : nullbias;
- switch(M-y) {
+ switch(rows_to_compute) {
case 1:
__asm __volatile (
"whilelt p6.h, %[temp], %[leftovers]\n"
@@ -256,88 +267,87 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld
"ld1h z12.h, p0/z, [%[b_ptr0]]\n"
"ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"cbz %[regs], 3f\n"
"fmla z16.h, z8.h, z0.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z17.h, z9.h, z0.h[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
"fmla z18.h, z10.h, z0.h[0]\n"
- "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z19.h, z11.h, z0.h[0]\n"
- "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
"fmla z16.h, z12.h, z0.h[1]\n"
- "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z17.h, z13.h, z0.h[1]\n"
- "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
"fmla z18.h, z14.h, z0.h[1]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z19.h, z15.h, z0.h[1]\n"
- "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z16.h, z8.h, z0.h[2]\n"
- "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
"fmla z17.h, z9.h, z0.h[2]\n"
- "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z18.h, z10.h, z0.h[2]\n"
- "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
"fmla z19.h, z11.h, z0.h[2]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z16.h, z12.h, z0.h[3]\n"
- "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z17.h, z13.h, z0.h[3]\n"
- "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
"fmla z18.h, z14.h, z0.h[3]\n"
- "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z19.h, z15.h, z0.h[3]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
"ld1h z12.h, p0/z, [%[b_ptr0]]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
"ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
"ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
"ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z16.h, z12.h, z0.h[5]\n"
- "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z17.h, z13.h, z0.h[5]\n"
- "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
"fmla z18.h, z14.h, z0.h[5]\n"
- "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z19.h, z15.h, z0.h[5]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
"ld1h z12.h, p0/z, [%[b_ptr0]]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
"ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
"ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
"ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z16.h, z12.h, z0.h[7]\n"
- "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z17.h, z13.h, z0.h[7]\n"
- "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
"fmla z18.h, z14.h, z0.h[7]\n"
- "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[0]\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z17.h, z9.h, z4.h[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
"ld1h z12.h, p0/z, [%[b_ptr0]]\n"
- "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
"ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
"ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "fmla z16.h, z12.h, z4.h[1]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
"ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
"fmla z17.h, z13.h, z4.h[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z18.h, z14.h, z4.h[1]\n"
@@ -345,51 +355,52 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld
"fmla z19.h, z15.h, z4.h[1]\n"
"ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "addvl %[a_ptr0], %[a_ptr0], #2\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
"ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "fmla z16.h, z8.h, z4.h[2]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
"ld1h z12.h, p0/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
"ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[2]\n"
"ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[2]\n"
"ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[3]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
"ld1h z8.h, p0/z, [%[b_ptr0]]\n"
- "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
"ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
"ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
"ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z16.h, z8.h, z4.h[4]\n"
- "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z17.h, z9.h, z4.h[4]\n"
- "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
"fmla z18.h, z10.h, z4.h[4]\n"
- "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z19.h, z11.h, z4.h[4]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[5]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
"ld1h z8.h, p0/z, [%[b_ptr0]]\n"
- "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
"ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
"ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
"ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z16.h, z8.h, z4.h[6]\n"
- "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z17.h, z9.h, z4.h[6]\n"
- "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
"fmla z18.h, z10.h, z4.h[6]\n"
- "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z19.h, z11.h, z4.h[6]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z16.h, z12.h, z4.h[7]\n"
"fmla z17.h, z13.h, z4.h[7]\n"
@@ -474,66 +485,67 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld
"b 4f\n"
"3:\n"
"fmla z16.h, z8.h, z0.h[0]\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z17.h, z9.h, z0.h[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
"fmla z18.h, z10.h, z0.h[0]\n"
- "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z19.h, z11.h, z0.h[0]\n"
- "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
"fmla z16.h, z12.h, z0.h[1]\n"
- "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z17.h, z13.h, z0.h[1]\n"
- "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
"fmla z18.h, z14.h, z0.h[1]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z19.h, z15.h, z0.h[1]\n"
- "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z16.h, z8.h, z0.h[2]\n"
- "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
"fmla z17.h, z9.h, z0.h[2]\n"
- "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z18.h, z10.h, z0.h[2]\n"
- "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
"fmla z19.h, z11.h, z0.h[2]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z16.h, z12.h, z0.h[3]\n"
- "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z17.h, z13.h, z0.h[3]\n"
- "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
"fmla z18.h, z14.h, z0.h[3]\n"
- "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z19.h, z15.h, z0.h[3]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
"ld1h z12.h, p0/z, [%[b_ptr0]]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
"ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
"ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
"ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z16.h, z12.h, z0.h[5]\n"
- "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z17.h, z13.h, z0.h[5]\n"
- "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
"fmla z18.h, z14.h, z0.h[5]\n"
- "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z19.h, z15.h, z0.h[5]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
"ld1h z12.h, p0/z, [%[b_ptr0]]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
"ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
"ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
"ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
"fmla z16.h, z12.h, z0.h[7]\n"
+ "addvl %[a_ptr0], %[a_ptr0], #1\n"
"fmla z17.h, z13.h, z0.h[7]\n"
"fmla z18.h, z14.h, z0.h[7]\n"
"fmla z19.h, z15.h, z0.h[7]\n"
@@ -888,21 +900,21 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld
"fmla z23.h, z15.h, z5.h[7]\n"
"b.ne 2b\n"
"1:\n"
- "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"cbz %[regs], 3f\n"
"fmla z16.h, z8.h, z0.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z20.h, z8.h, z1.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
"fmla z17.h, z9.h, z0.h[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
"fmla z21.h, z9.h, z1.h[0]\n"
- "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z18.h, z10.h, z0.h[0]\n"
- "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
"fmla z22.h, z10.h, z1.h[0]\n"
- "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z19.h, z11.h, z0.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
"fmla z23.h, z11.h, z1.h[0]\n"
"ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z16.h, z12.h, z0.h[1]\n"
@@ -1201,19 +1213,19 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld
"b 4f\n"
"3:\n"
"fmla z16.h, z8.h, z0.h[0]\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z20.h, z8.h, z1.h[0]\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+ "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
"fmla z17.h, z9.h, z0.h[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqh z5.h, p6/z, [a_ptr1]\n"
"fmla z21.h, z9.h, z1.h[0]\n"
- "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z18.h, z10.h, z0.h[0]\n"
- "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
"fmla z22.h, z10.h, z1.h[0]\n"
- "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z19.h, z11.h, z0.h[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
"fmla z23.h, z11.h, z1.h[0]\n"
"ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z16.h, z12.h, z0.h[1]\n"
@@ -1221,10 +1233,11 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld
"fmla z20.h, z12.h, z1.h[1]\n"
"ld1h z12.h, p0/z, [%[b_ptr0]]\n"
"fmla z17.h, z13.h, z0.h[1]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
+ "addvl %[a_ptr0], %[a_ptr0], #1\n"
"fmla z21.h, z13.h, z1.h[1]\n"
"ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z18.h, z14.h, z0.h[1]\n"
+ "addvl a_ptr1, a_ptr1, #1\n"
"fmla z22.h, z14.h, z1.h[1]\n"
"ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
"fmla z19.h, z15.h, z0.h[1]\n"
@@ -1509,9 +1522,9 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld
"fmla z27.h, z11.h, z2.h[0]\n"
"ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z16.h, z12.h, z0.h[1]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla z20.h, z12.h, z1.h[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
"fmla z24.h, z12.h, z2.h[1]\n"
"ld1h z12.h, p0/z, [%[b_ptr0]]\n"
"fmla z17.h, z13.h, z0.h[1]\n"
@@ -1768,21 +1781,21 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld
"fmla z27.h, z15.h, z6.h[7]\n"
"b.ne 2b\n"
"1:\n"
- "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"cbz %[regs], 3f\n"
"fmla z16.h, z8.h, z0.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z20.h, z8.h, z1.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
"fmla z24.h, z8.h, z2.h[0]\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
"fmla z17.h, z9.h, z0.h[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqh z6.h, p7/z, [a_ptr2]\n"
"fmla z21.h, z9.h, z1.h[0]\n"
- "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z25.h, z9.h, z2.h[0]\n"
- "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
"fmla z18.h, z10.h, z0.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z22.h, z10.h, z1.h[0]\n"
"fmla z26.h, z10.h, z2.h[0]\n"
"ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
@@ -2176,26 +2189,27 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld
"b 4f\n"
"3:\n"
"fmla z16.h, z8.h, z0.h[0]\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z20.h, z8.h, z1.h[0]\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+ "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
"fmla z24.h, z8.h, z2.h[0]\n"
- "ld1rqh z6.h, p6/z, [a_ptr2]\n"
+ "ld1rqh z5.h, p6/z, [a_ptr1]\n"
"fmla z17.h, z9.h, z0.h[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqh z6.h, p6/z, [a_ptr2]\n"
"fmla z21.h, z9.h, z1.h[0]\n"
- "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z25.h, z9.h, z2.h[0]\n"
- "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
"fmla z18.h, z10.h, z0.h[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z22.h, z10.h, z1.h[0]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
+ "addvl %[a_ptr0], %[a_ptr0], #1\n"
"fmla z26.h, z10.h, z2.h[0]\n"
"ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
"fmla z19.h, z11.h, z0.h[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
+ "addvl a_ptr1, a_ptr1, #1\n"
"fmla z23.h, z11.h, z1.h[0]\n"
+ "addvl a_ptr2, a_ptr2, #1\n"
"fmla z27.h, z11.h, z2.h[0]\n"
"ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z16.h, z12.h, z0.h[1]\n"
@@ -2897,21 +2911,21 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld
"fmla z31.h, z15.h, z7.h[7]\n"
"b.ne 2b\n"
"1:\n"
- "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"cbz %[regs], 3f\n"
"fmla z16.h, z8.h, z0.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z20.h, z8.h, z1.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
"fmla z24.h, z8.h, z2.h[0]\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
"fmla z28.h, z8.h, z3.h[0]\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
+ "ld1rqh z6.h, p7/z, [a_ptr2]\n"
"fmla z17.h, z9.h, z0.h[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqh z7.h, p7/z, [a_ptr3]\n"
"fmla z21.h, z9.h, z1.h[0]\n"
- "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z25.h, z9.h, z2.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
"fmla z29.h, z9.h, z3.h[0]\n"
"ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z18.h, z10.h, z0.h[0]\n"
@@ -3400,30 +3414,31 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld
"b 4f\n"
"3:\n"
"fmla z16.h, z8.h, z0.h[0]\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z20.h, z8.h, z1.h[0]\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+ "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
"fmla z24.h, z8.h, z2.h[0]\n"
- "ld1rqh z6.h, p6/z, [a_ptr2]\n"
+ "ld1rqh z5.h, p6/z, [a_ptr1]\n"
"fmla z28.h, z8.h, z3.h[0]\n"
- "ld1rqh z7.h, p6/z, [a_ptr3]\n"
+ "ld1rqh z6.h, p6/z, [a_ptr2]\n"
"fmla z17.h, z9.h, z0.h[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqh z7.h, p6/z, [a_ptr3]\n"
"fmla z21.h, z9.h, z1.h[0]\n"
- "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z25.h, z9.h, z2.h[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
"fmla z29.h, z9.h, z3.h[0]\n"
"ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z18.h, z10.h, z0.h[0]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
+ "addvl %[a_ptr0], %[a_ptr0], #1\n"
"fmla z22.h, z10.h, z1.h[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
+ "addvl a_ptr1, a_ptr1, #1\n"
"fmla z26.h, z10.h, z2.h[0]\n"
- "addvl a_ptr3, a_ptr3, #1\n"
+ "addvl a_ptr2, a_ptr2, #1\n"
"fmla z30.h, z10.h, z3.h[0]\n"
"ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
"fmla z19.h, z11.h, z0.h[0]\n"
+ "addvl a_ptr3, a_ptr3, #1\n"
"fmla z23.h, z11.h, z1.h[0]\n"
"fmla z27.h, z11.h, z2.h[0]\n"
"fmla z31.h, z11.h, z3.h[0]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp
index 19e5fbd974..0abde56af1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,9 @@
#ifdef __ARM_FEATURE_SVE
+
+
+
namespace arm_gemm
{
@@ -75,7 +78,10 @@ public:
// Default to the generic kernel
kern_type kernel=sve_native_fp32_mla_4VLx4;
- native_fp32_mla_4VLx4(const CPUInfo *ci) { UNUSED(ci); }
+ native_fp32_mla_4VLx4(const CPUInfo *)
+ {
+
+ }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp
index 3fc0e5fa36..b05906e199 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -60,12 +60,23 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb,
break;
}
- for (int y=0; y<M; y+=4) {
+ int rows_to_compute;
+
+ for (int y=0; y<M; y+=rows_to_compute) {
const float * const a_ptr0_base = A + (y * lda);
const unsigned long ldab = lda * sizeof(float);
float *c_ptr0 = C + (y * ldc);
+ rows_to_compute = M-y;
+ if (rows_to_compute > 4) {
+ if (rows_to_compute % 4) {
+ rows_to_compute = 4 - 1;
+ } else {
+ rows_to_compute = 4;
+ }
+ }
+
for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) {
const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>()));
long loops = loops_count;
@@ -78,7 +89,7 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb,
const unsigned long ldcb = ldc * sizeof(float);
const float *biasptr = bias ? bias+x0 : nullbias;
- switch(M-y) {
+ switch(rows_to_compute) {
case 1:
__asm __volatile (
"whilelt p6.s, %[temp], %[leftovers]\n"
@@ -184,52 +195,51 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb,
"ld1w z12.s, p0/z, [%[b_ptr0]]\n"
"ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"cbz %[regs], 3f\n"
"fmla z16.s, z8.s, z0.s[0]\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z17.s, z9.s, z0.s[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
"fmla z18.s, z10.s, z0.s[0]\n"
- "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z19.s, z11.s, z0.s[0]\n"
- "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
"fmla z16.s, z12.s, z0.s[1]\n"
- "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z17.s, z13.s, z0.s[1]\n"
- "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
"fmla z18.s, z14.s, z0.s[1]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z19.s, z15.s, z0.s[1]\n"
- "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z16.s, z8.s, z0.s[2]\n"
- "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
"fmla z17.s, z9.s, z0.s[2]\n"
- "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z18.s, z10.s, z0.s[2]\n"
- "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
"fmla z19.s, z11.s, z0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z16.s, z12.s, z0.s[3]\n"
- "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z17.s, z13.s, z0.s[3]\n"
- "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
"fmla z18.s, z14.s, z0.s[3]\n"
- "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[0]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z17.s, z9.s, z4.s[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
"ld1w z12.s, p0/z, [%[b_ptr0]]\n"
- "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
"ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
"ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
"ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
"fmla z17.s, z13.s, z4.s[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z18.s, z14.s, z4.s[1]\n"
@@ -237,15 +247,16 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb,
"fmla z19.s, z15.s, z4.s[1]\n"
"ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "addvl %[a_ptr0], %[a_ptr0], #2\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
"ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
"ld1w z12.s, p0/z, [%[b_ptr0]]\n"
- "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
"ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z10.s, z4.s[2]\n"
"ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z4.s[2]\n"
"ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z16.s, z12.s, z4.s[3]\n"
"fmla z17.s, z13.s, z4.s[3]\n"
@@ -286,30 +297,31 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb,
"b 4f\n"
"3:\n"
"fmla z16.s, z8.s, z0.s[0]\n"
- "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z17.s, z9.s, z0.s[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
"fmla z18.s, z10.s, z0.s[0]\n"
- "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z19.s, z11.s, z0.s[0]\n"
- "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
"fmla z16.s, z12.s, z0.s[1]\n"
- "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z17.s, z13.s, z0.s[1]\n"
- "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
"fmla z18.s, z14.s, z0.s[1]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z19.s, z15.s, z0.s[1]\n"
- "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z16.s, z8.s, z0.s[2]\n"
- "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
"fmla z17.s, z9.s, z0.s[2]\n"
- "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z18.s, z10.s, z0.s[2]\n"
- "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
"fmla z19.s, z11.s, z0.s[2]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z16.s, z12.s, z0.s[3]\n"
+ "addvl %[a_ptr0], %[a_ptr0], #1\n"
"fmla z17.s, z13.s, z0.s[3]\n"
"fmla z18.s, z14.s, z0.s[3]\n"
"fmla z19.s, z15.s, z0.s[3]\n"
@@ -516,21 +528,21 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb,
"fmla z23.s, z15.s, z5.s[3]\n"
"b.ne 2b\n"
"1:\n"
- "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"cbz %[regs], 3f\n"
"fmla z16.s, z8.s, z0.s[0]\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z20.s, z8.s, z1.s[0]\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
"fmla z17.s, z9.s, z0.s[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
"fmla z21.s, z9.s, z1.s[0]\n"
- "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z18.s, z10.s, z0.s[0]\n"
- "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
"fmla z22.s, z10.s, z1.s[0]\n"
- "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z19.s, z11.s, z0.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
"fmla z23.s, z11.s, z1.s[0]\n"
"ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z16.s, z12.s, z0.s[1]\n"
@@ -665,19 +677,19 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb,
"b 4f\n"
"3:\n"
"fmla z16.s, z8.s, z0.s[0]\n"
- "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z20.s, z8.s, z1.s[0]\n"
- "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
"fmla z17.s, z9.s, z0.s[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqw z5.s, p6/z, [a_ptr1]\n"
"fmla z21.s, z9.s, z1.s[0]\n"
- "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z18.s, z10.s, z0.s[0]\n"
- "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
"fmla z22.s, z10.s, z1.s[0]\n"
- "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z19.s, z11.s, z0.s[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
"fmla z23.s, z11.s, z1.s[0]\n"
"ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z16.s, z12.s, z0.s[1]\n"
@@ -685,10 +697,11 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb,
"fmla z20.s, z12.s, z1.s[1]\n"
"ld1w z12.s, p0/z, [%[b_ptr0]]\n"
"fmla z17.s, z13.s, z0.s[1]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
+ "addvl %[a_ptr0], %[a_ptr0], #1\n"
"fmla z21.s, z13.s, z1.s[1]\n"
"ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z18.s, z14.s, z0.s[1]\n"
+ "addvl a_ptr1, a_ptr1, #1\n"
"fmla z22.s, z14.s, z1.s[1]\n"
"ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
"fmla z19.s, z15.s, z0.s[1]\n"
@@ -861,9 +874,9 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb,
"fmla z27.s, z11.s, z2.s[0]\n"
"ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z16.s, z12.s, z0.s[1]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla z20.s, z12.s, z1.s[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
"fmla z24.s, z12.s, z2.s[1]\n"
"ld1w z12.s, p0/z, [%[b_ptr0]]\n"
"fmla z17.s, z13.s, z0.s[1]\n"
@@ -984,21 +997,21 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb,
"fmla z27.s, z15.s, z6.s[3]\n"
"b.ne 2b\n"
"1:\n"
- "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"cbz %[regs], 3f\n"
"fmla z16.s, z8.s, z0.s[0]\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z20.s, z8.s, z1.s[0]\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
"fmla z24.s, z8.s, z2.s[0]\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
"fmla z17.s, z9.s, z0.s[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
"fmla z21.s, z9.s, z1.s[0]\n"
- "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z25.s, z9.s, z2.s[0]\n"
- "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
"fmla z18.s, z10.s, z0.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z22.s, z10.s, z1.s[0]\n"
"fmla z26.s, z10.s, z2.s[0]\n"
"ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
@@ -1180,26 +1193,27 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb,
"b 4f\n"
"3:\n"
"fmla z16.s, z8.s, z0.s[0]\n"
- "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z20.s, z8.s, z1.s[0]\n"
- "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
"fmla z24.s, z8.s, z2.s[0]\n"
- "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+ "ld1rqw z5.s, p6/z, [a_ptr1]\n"
"fmla z17.s, z9.s, z0.s[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqw z6.s, p6/z, [a_ptr2]\n"
"fmla z21.s, z9.s, z1.s[0]\n"
- "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z25.s, z9.s, z2.s[0]\n"
- "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
"fmla z18.s, z10.s, z0.s[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z22.s, z10.s, z1.s[0]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
+ "addvl %[a_ptr0], %[a_ptr0], #1\n"
"fmla z26.s, z10.s, z2.s[0]\n"
"ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
"fmla z19.s, z11.s, z0.s[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
+ "addvl a_ptr1, a_ptr1, #1\n"
"fmla z23.s, z11.s, z1.s[0]\n"
+ "addvl a_ptr2, a_ptr2, #1\n"
"fmla z27.s, z11.s, z2.s[0]\n"
"ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z16.s, z12.s, z0.s[1]\n"
@@ -1589,21 +1603,21 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb,
"fmla z31.s, z15.s, z7.s[3]\n"
"b.ne 2b\n"
"1:\n"
- "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"cbz %[regs], 3f\n"
"fmla z16.s, z8.s, z0.s[0]\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z20.s, z8.s, z1.s[0]\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
"fmla z24.s, z8.s, z2.s[0]\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
"fmla z28.s, z8.s, z3.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
"fmla z17.s, z9.s, z0.s[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqw z7.s, p7/z, [a_ptr3]\n"
"fmla z21.s, z9.s, z1.s[0]\n"
- "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z25.s, z9.s, z2.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
"fmla z29.s, z9.s, z3.s[0]\n"
"ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z18.s, z10.s, z0.s[0]\n"
@@ -1832,30 +1846,31 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb,
"b 4f\n"
"3:\n"
"fmla z16.s, z8.s, z0.s[0]\n"
- "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
"fmla z20.s, z8.s, z1.s[0]\n"
- "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
"fmla z24.s, z8.s, z2.s[0]\n"
- "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+ "ld1rqw z5.s, p6/z, [a_ptr1]\n"
"fmla z28.s, z8.s, z3.s[0]\n"
- "ld1rqw z7.s, p6/z, [a_ptr3]\n"
+ "ld1rqw z6.s, p6/z, [a_ptr2]\n"
"fmla z17.s, z9.s, z0.s[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqw z7.s, p6/z, [a_ptr3]\n"
"fmla z21.s, z9.s, z1.s[0]\n"
- "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"fmla z25.s, z9.s, z2.s[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
"fmla z29.s, z9.s, z3.s[0]\n"
"ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
"fmla z18.s, z10.s, z0.s[0]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
+ "addvl %[a_ptr0], %[a_ptr0], #1\n"
"fmla z22.s, z10.s, z1.s[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
+ "addvl a_ptr1, a_ptr1, #1\n"
"fmla z26.s, z10.s, z2.s[0]\n"
- "addvl a_ptr3, a_ptr3, #1\n"
+ "addvl a_ptr2, a_ptr2, #1\n"
"fmla z30.s, z10.s, z3.s[0]\n"
"ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
"fmla z19.s, z11.s, z0.s[0]\n"
+ "addvl a_ptr3, a_ptr3, #1\n"
"fmla z23.s, z11.s, z1.s[0]\n"
"fmla z27.s, z11.s, z2.s[0]\n"
"fmla z31.s, z11.s, z3.s[0]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp
index 1b9d1312b5..40a69b54ff 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -78,7 +78,10 @@ public:
// Default to the generic kernel
kern_type kernel=sve_native_s8s32_dot_4VLx4;
- native_s8s32_dot_4VLx4(const CPUInfo *ci) { UNUSED(ci); }
+ native_s8s32_dot_4VLx4(const CPUInfo *)
+ {
+
+ }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp
index 26736f597a..7c5d4dc280 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp
@@ -32,7 +32,7 @@
namespace arm_gemm {
-void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int ldb, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool append) {
+void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int ldb, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation , bool append) {
const long loops_count = ((K + 16) / 32) - 1;
K -= loops_count * 32;
const long regs_count = (K / 16) - 1;
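(Worked example for the arithmetic visible in this hunk, assuming nothing beyond the lines shown: with K = 100, loops_count = ((100 + 16) / 32) - 1 = 2, K is then reduced to 100 - 2*32 = 36, and regs_count = (36 / 16) - 1 = 1.)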
@@ -41,12 +41,23 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
const long blocks_count = K / 4;
const long odds_count = K - (blocks_count * 4);
- for (int y=0; y<M; y+=4) {
+ int rows_to_compute;
+
+ for (int y=0; y<M; y+=rows_to_compute) {
const int8_t * const a_ptr0_base = A + (y * lda);
const unsigned long ldab = lda * sizeof(int8_t);
int32_t *c_ptr0 = C + (y * ldc);
+ rows_to_compute = M-y;
+ if (rows_to_compute > 4) {
+ if (rows_to_compute % 4) {
+ rows_to_compute = 4 - 1;
+ } else {
+ rows_to_compute = 4;
+ }
+ }
+
for (int x0=0; x0<N; x0+=(4 * get_vector_length<int32_t>())) {
const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<int32_t>()));
long loops = loops_count;
@@ -62,7 +73,7 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
long ldbb = ldb * sizeof(int8_t) * 4;
const unsigned long ldcb = ldc * sizeof(int32_t);
- switch(M-y) {
+ switch(rows_to_compute) {
case 1:
__asm __volatile (
"mov z16.s, #0\n"
@@ -270,22 +281,22 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
- "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"cbz %[regs], 3f\n"
"sdot z16.s, z8.b, z0.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"sdot z19.s, z11.b, z0.b[0]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z13.b, z13.b, z14.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
@@ -635,33 +646,34 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"b 7f\n"
"3:\n"
"sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
- "zip2 z8.b, z14.b, z12.b\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "zip1 z14.b, z14.b, z12.b\n"
+ "zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z13.b, z13.b, z14.b\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "zip1 z14.b, z15.b, z8.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z16.s, z12.b, z0.b[1]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"addvl %[a_ptr0], %[a_ptr0], #1\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z14.b, z0.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z0.b[1]\n"
@@ -998,11 +1010,11 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z21.s, z9.b, z1.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"sdot z22.s, z10.b, z1.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z0.b[0]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z23.s, z11.b, z1.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
@@ -1176,34 +1188,34 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"1:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
- "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"cbz %[regs], 3f\n"
"sdot z16.s, z8.b, z0.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "zip2 z8.b, z14.b, z12.b\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "zip1 z14.b, z14.b, z12.b\n"
+ "zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip1 z12.b, z13.b, z14.b\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z0.b[0]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z23.s, z11.b, z1.b[0]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -1604,34 +1616,35 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"b 7f\n"
"3:\n"
"sdot z16.s, z8.b, z0.b[0]\n"
- "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z5.b, p6/z, [a_ptr1]\n"
- "zip2 z8.b, z14.b, z12.b\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "zip1 z14.b, z14.b, z12.b\n"
+ "zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip1 z12.b, z13.b, z14.b\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
- "addvl a_ptr1, a_ptr1, #1\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z0.b[0]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z23.s, z11.b, z1.b[0]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "addvl %[a_ptr0], %[a_ptr0], #1\n"
"zip2 z11.b, z8.b, z9.b\n"
+ "addvl a_ptr1, a_ptr1, #1\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
"sdot z16.s, z12.b, z0.b[1]\n"
@@ -2242,19 +2255,20 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"1:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
- "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"cbz %[regs], 3f\n"
"sdot z16.s, z8.b, z0.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
"sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z6.b, p7/z, [a_ptr2]\n"
"zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
@@ -2262,13 +2276,12 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z25.s, z9.b, z2.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"sdot z26.s, z10.b, z2.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z0.b[0]\n"
@@ -2733,16 +2746,18 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"b 7f\n"
"3:\n"
"sdot z16.s, z8.b, z0.b[0]\n"
- "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
"sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z6.b, p6/z, [a_ptr2]\n"
"zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
@@ -2752,19 +2767,18 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"addvl a_ptr1, a_ptr1, #1\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z25.s, z9.b, z2.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z22.s, z10.b, z1.b[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"sdot z26.s, z10.b, z2.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"sdot z23.s, z11.b, z1.b[0]\n"
+ "addvl a_ptr2, a_ptr2, #1\n"
"sdot z27.s, z11.b, z2.b[0]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -3469,25 +3483,25 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"1:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
- "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"cbz %[regs], 3f\n"
"sdot z16.s, z8.b, z0.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
"sdot z24.s, z8.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
"sdot z28.s, z8.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z17.s, z9.b, z0.b[0]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
@@ -4023,38 +4037,39 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"b 7f\n"
"3:\n"
"sdot z16.s, z8.b, z0.b[0]\n"
- "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
"sdot z24.s, z8.b, z2.b[0]\n"
- "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
"sdot z28.s, z8.b, z3.b[0]\n"
- "ld1rqb z7.b, p6/z, [a_ptr3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr2]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr3]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z17.s, z9.b, z0.b[0]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
- "addvl a_ptr1, a_ptr1, #1\n"
+ "addvl %[a_ptr0], %[a_ptr0], #1\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"sdot z21.s, z9.b, z1.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z25.s, z9.b, z2.b[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
+ "addvl a_ptr1, a_ptr1, #1\n"
"sdot z29.s, z9.b, z3.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"sdot z22.s, z10.b, z1.b[0]\n"
- "addvl a_ptr3, a_ptr3, #1\n"
+ "addvl a_ptr2, a_ptr2, #1\n"
"sdot z26.s, z10.b, z2.b[0]\n"
+ "addvl a_ptr3, a_ptr3, #1\n"
"sdot z30.s, z10.b, z3.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z0.b[0]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp
index 33e3ac6c23..043fa7484a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -78,7 +78,10 @@ public:
// Default to the generic kernel
kern_type kernel=sve_native_u8u32_dot_4VLx4;
- native_u8u32_dot_4VLx4(const CPUInfo *ci) { UNUSED(ci); }
+ native_u8u32_dot_4VLx4(const CPUInfo *)
+ {
+
+ }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp
index 639ca5765c..bbc1092e4e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp
@@ -32,7 +32,7 @@
namespace arm_gemm {
-void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int ldb, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool append) {
+void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int ldb, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool append) {
const long loops_count = ((K + 16) / 32) - 1;
K -= loops_count * 32;
const long regs_count = (K / 16) - 1;
@@ -41,12 +41,23 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
const long blocks_count = K / 4;
const long odds_count = K - (blocks_count * 4);
- for (int y=0; y<M; y+=4) {
+ int rows_to_compute;
+
+ for (int y=0; y<M; y+=rows_to_compute) {
const uint8_t * const a_ptr0_base = A + (y * lda);
const unsigned long ldab = lda * sizeof(uint8_t);
uint32_t *c_ptr0 = C + (y * ldc);
+ rows_to_compute = M-y;
+ if (rows_to_compute > 4) {
+ if (rows_to_compute % 4) {
+ rows_to_compute = 4 - 1;
+ } else {
+ rows_to_compute = 4;
+ }
+ }
+
for (int x0=0; x0<N; x0+=(4 * get_vector_length<uint32_t>())) {
const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<uint32_t>()));
long loops = loops_count;
@@ -62,7 +73,7 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
long ldbb = ldb * sizeof(uint8_t) * 4;
const unsigned long ldcb = ldc * sizeof(uint32_t);
- switch(M-y) {
+ switch(rows_to_compute) {
case 1:
__asm __volatile (
"mov z16.s, #0\n"
@@ -270,22 +281,22 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
- "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"cbz %[regs], 3f\n"
"udot z16.s, z8.b, z0.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"udot z19.s, z11.b, z0.b[0]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z13.b, z13.b, z14.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
@@ -635,33 +646,34 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"b 7f\n"
"3:\n"
"udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
- "zip2 z8.b, z14.b, z12.b\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "zip1 z14.b, z14.b, z12.b\n"
+ "zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z13.b, z13.b, z14.b\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "zip1 z14.b, z15.b, z8.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z16.s, z12.b, z0.b[1]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"addvl %[a_ptr0], %[a_ptr0], #1\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z14.b, z0.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z0.b[1]\n"
@@ -998,11 +1010,11 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z21.s, z9.b, z1.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"udot z22.s, z10.b, z1.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z0.b[0]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z23.s, z11.b, z1.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
@@ -1176,34 +1188,34 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"1:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
- "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"cbz %[regs], 3f\n"
"udot z16.s, z8.b, z0.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "zip2 z8.b, z14.b, z12.b\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "zip1 z14.b, z14.b, z12.b\n"
+ "zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip1 z12.b, z13.b, z14.b\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z0.b[0]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z23.s, z11.b, z1.b[0]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -1604,34 +1616,35 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"b 7f\n"
"3:\n"
"udot z16.s, z8.b, z0.b[0]\n"
- "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z5.b, p6/z, [a_ptr1]\n"
- "zip2 z8.b, z14.b, z12.b\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "zip1 z14.b, z14.b, z12.b\n"
+ "zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip1 z12.b, z13.b, z14.b\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
- "addvl a_ptr1, a_ptr1, #1\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z0.b[0]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z23.s, z11.b, z1.b[0]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "addvl %[a_ptr0], %[a_ptr0], #1\n"
"zip2 z11.b, z8.b, z9.b\n"
+ "addvl a_ptr1, a_ptr1, #1\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
"udot z16.s, z12.b, z0.b[1]\n"
@@ -2242,19 +2255,20 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"1:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
- "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"cbz %[regs], 3f\n"
"udot z16.s, z8.b, z0.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
"udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z6.b, p7/z, [a_ptr2]\n"
"zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
@@ -2262,13 +2276,12 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z25.s, z9.b, z2.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"udot z26.s, z10.b, z2.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z0.b[0]\n"
@@ -2733,16 +2746,18 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"b 7f\n"
"3:\n"
"udot z16.s, z8.b, z0.b[0]\n"
- "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
"udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z6.b, p6/z, [a_ptr2]\n"
"zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
@@ -2752,19 +2767,18 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"addvl a_ptr1, a_ptr1, #1\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z25.s, z9.b, z2.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z22.s, z10.b, z1.b[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"udot z26.s, z10.b, z2.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"udot z23.s, z11.b, z1.b[0]\n"
+ "addvl a_ptr2, a_ptr2, #1\n"
"udot z27.s, z11.b, z2.b[0]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -3469,25 +3483,25 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"1:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
- "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"cbz %[regs], 3f\n"
"udot z16.s, z8.b, z0.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
"udot z24.s, z8.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
"udot z28.s, z8.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z17.s, z9.b, z0.b[0]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
@@ -4023,38 +4037,39 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"b 7f\n"
"3:\n"
"udot z16.s, z8.b, z0.b[0]\n"
- "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
"udot z24.s, z8.b, z2.b[0]\n"
- "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
"udot z28.s, z8.b, z3.b[0]\n"
- "ld1rqb z7.b, p6/z, [a_ptr3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr2]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr3]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z17.s, z9.b, z0.b[0]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
- "addvl a_ptr1, a_ptr1, #1\n"
+ "addvl %[a_ptr0], %[a_ptr0], #1\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"udot z21.s, z9.b, z1.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z25.s, z9.b, z2.b[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
+ "addvl a_ptr1, a_ptr1, #1\n"
"udot z29.s, z9.b, z3.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"udot z22.s, z10.b, z1.b[0]\n"
- "addvl a_ptr3, a_ptr3, #1\n"
+ "addvl a_ptr2, a_ptr2, #1\n"
"udot z26.s, z10.b, z2.b[0]\n"
+ "addvl a_ptr3, a_ptr3, #1\n"
"udot z30.s, z10.b, z3.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z0.b[0]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp
index 9bee502236..6b070d6d71 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -77,7 +77,10 @@ public:
// Default to the generic kernel
kern_type kernel=sve_smallK_hybrid_fp32_mla_1VLx8;
- smallK_hybrid_fp32_mla_1VLx8(const CPUInfo *ci) { UNUSED(ci); }
+ smallK_hybrid_fp32_mla_1VLx8(const CPUInfo *ci)
+ {
+
+ }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp
index fc18cbdbbf..9bc0969bf2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp
@@ -77,7 +77,10 @@ public:
// Default to the generic kernel
kern_type kernel=sve_smallK_hybrid_s8s32_dot_1VLx8;
- smallK_hybrid_s8s32_dot_1VLx8(const CPUInfo *ci) { UNUSED(ci); }
+ smallK_hybrid_s8s32_dot_1VLx8(const CPUInfo *)
+ {
+
+ }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp
index 51d3e736ed..cc27c13533 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp
@@ -77,7 +77,10 @@ public:
// Default to the generic kernel
kern_type kernel=sve_smallK_hybrid_u8u32_dot_1VLx8;
- smallK_hybrid_u8u32_dot_1VLx8(const CPUInfo *ci) { UNUSED(ci); }
+ smallK_hybrid_u8u32_dot_1VLx8(const CPUInfo *)
+ {
+
+ }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp
index 5d8eae4866..a81d4504ae 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -1130,11 +1130,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
}
else
{
- const __fp16 *biasptr = nullbias;
- if (bias)
- {
- biasptr = bias + i;
- }
+ const __fp16 *biasptr = bias ? bias + i : nullbias;
switch(height)
{
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_12x8.hpp
index 088353e5f3..284f2dc1a0 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_12x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,8 +30,8 @@ void MergeResults<12, 8, false>(float *out, const float *in, const int ldout, co
{
const float *inptr = in;
float nullbias[12];
- float minval = - std::numeric_limits<float>::infinity();
- float maxval = std::numeric_limits<float>::infinity();
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
switch(act.type)
{
@@ -1106,11 +1106,7 @@ void MergeResults<12, 8, false>(float *out, const float *in, const int ldout, co
}
else
{
- const float *biasptr = nullbias;
- if (bias)
- {
- biasptr = bias + i;
- }
+ const float *biasptr = bias ? bias + i : nullbias;
switch(height)
{
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_s32_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_s32_12x8.hpp
index 2e45d8b5d1..fcf08e4e15 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_s32_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_s32_12x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,10 +26,8 @@
#ifdef __aarch64__
template<>
-void MergeResults<12, 8, false>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t *bias, Activation act, bool append)
+void MergeResults<12, 8, false>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t *bias, Activation , bool append)
{
- UNUSED(act);
-
const int32_t *inptr = in;
int32_t nullbias[12];
@@ -862,11 +860,7 @@ void MergeResults<12, 8, false>(int32_t *out, const int32_t *in, const int ldout
}
else
{
- const int32_t *biasptr = nullbias;
- if (bias)
- {
- biasptr = bias + i;
- }
+ const int32_t *biasptr = bias ? bias + i : nullbias;
switch(height)
{
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_s32_4x4.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_s32_4x4.hpp
index 6d869af803..88eaa5f07c 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_s32_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_s32_4x4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,10 +26,8 @@
#ifdef __aarch64__
template<>
-void MergeResults<4, 4, false>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t *bias, Activation act, bool append)
+void MergeResults<4, 4, false>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t *bias, Activation , bool append)
{
- UNUSED(act);
-
const int32_t *inptr = in;
int32_t nullbias[4];
@@ -240,11 +238,7 @@ void MergeResults<4, 4, false>(int32_t *out, const int32_t *in, const int ldout,
}
else
{
- const int32_t *biasptr = nullbias;
- if (bias)
- {
- biasptr = bias + i;
- }
+ const int32_t *biasptr = bias ? bias + i : nullbias;
switch(height)
{
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_u32_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_u32_12x8.hpp
index 0a05944102..adc02f19eb 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_u32_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_u32_12x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,10 +26,8 @@
#ifdef __aarch64__
template<>
-void MergeResults<12, 8, false>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t *bias, Activation act, bool append)
+void MergeResults<12, 8, false>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t *bias, Activation , bool append)
{
- UNUSED(act);
-
const uint32_t *inptr = in;
uint32_t nullbias[12];
@@ -862,11 +860,7 @@ void MergeResults<12, 8, false>(uint32_t *out, const uint32_t *in, const int ldo
}
else
{
- const uint32_t *biasptr = nullbias;
- if (bias)
- {
- biasptr = bias + i;
- }
+ const uint32_t *biasptr = bias ? bias + i : nullbias;
switch(height)
{
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_u32_4x4.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_u32_4x4.hpp
index efb17dc9e9..32e1eebaa4 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_u32_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_u32_4x4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,10 +26,8 @@
#ifdef __aarch64__
template<>
-void MergeResults<4, 4, false>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t *bias, Activation act, bool append)
+void MergeResults<4, 4, false>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t *bias, Activation , bool append)
{
- UNUSED(act);
-
const uint32_t *inptr = in;
uint32_t nullbias[4];
@@ -240,11 +238,7 @@ void MergeResults<4, 4, false>(uint32_t *out, const uint32_t *in, const int ldou
}
else
{
- const uint32_t *biasptr = nullbias;
- if (bias)
- {
- biasptr = bias + i;
- }
+ const uint32_t *biasptr = bias ? bias + i : nullbias;
switch(height)
{
diff --git a/src/core/NEON/kernels/arm_gemm/merges/list.hpp b/src/core/NEON/kernels/arm_gemm/merges/list.hpp
index 4edb497967..825c2fd020 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/list.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/list.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp16_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp16_3VLx8.hpp
index a44ef55a86..cf1d10329b 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp16_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp16_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -1010,11 +1010,7 @@ void MergeResults<3, 8, true>(__fp16 *out, const __fp16 *in, const int ldout, co
}
else
{
- const __fp16 *biasptr = nullbias;
- if (bias)
- {
- biasptr = bias + i;
- }
+ const __fp16 *biasptr = bias ? bias + i : nullbias;
switch(height)
{
diff --git a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp
index bb073e4868..b0d10c085d 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -1010,11 +1010,7 @@ void MergeResults<3, 8, true>(float *out, const float *in, const int ldout, cons
}
else
{
- const float *biasptr = nullbias;
- if (bias)
- {
- biasptr = bias + i;
- }
+ const float *biasptr = bias ? bias + i : nullbias;
switch(height)
{
diff --git a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp
index d4c5073f8d..34b6fe3ef5 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,13 +26,12 @@
#ifdef __ARM_FEATURE_SVE
template<>
-void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t *bias, Activation act, bool append)
+void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t *bias, Activation , bool append)
{
- UNUSED(act);
-
const int32_t *inptr = in;
int32_t nullbias[192];
+
if (!append && !bias)
{
memset(nullbias, 0, (3 * get_vector_length<int32_t>() * sizeof(int32_t)));
@@ -765,11 +764,7 @@ void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout,
}
else
{
- const int32_t *biasptr = nullbias;
- if (bias)
- {
- biasptr = bias + i;
- }
+ const int32_t *biasptr = bias ? bias + i : nullbias;
switch(height)
{
diff --git a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_u32_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_u32_3VLx8.hpp
index f2a28fa004..c4b2bb56d6 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_u32_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_u32_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,13 +26,12 @@
#ifdef __ARM_FEATURE_SVE
template<>
-void MergeResults<3, 8, true>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t *bias, Activation act, bool append)
+void MergeResults<3, 8, true>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t *bias, Activation , bool append)
{
- UNUSED(act);
-
const uint32_t *inptr = in;
uint32_t nullbias[192];
+
if (!append && !bias)
{
memset(nullbias, 0, (3 * get_vector_length<uint32_t>() * sizeof(uint32_t)));
@@ -765,11 +764,7 @@ void MergeResults<3, 8, true>(uint32_t *out, const uint32_t *in, const int ldout
}
else
{
- const uint32_t *biasptr = nullbias;
- if (bias)
- {
- biasptr = bias + i;
- }
+ const uint32_t *biasptr = bias ? bias + i : nullbias;
switch(height)
{
diff --git a/src/core/NEON/kernels/arm_gemm/misc.cpp b/src/core/NEON/kernels/arm_gemm/misc.cpp
index 6758a88c65..a1892dc8d5 100644
--- a/src/core/NEON/kernels/arm_gemm/misc.cpp
+++ b/src/core/NEON/kernels/arm_gemm/misc.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,6 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+
#ifndef NO_MULTI_THREADING
#include <mutex>
#endif
diff --git a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
index 995716575a..5c58c585d7 100644
--- a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
+++ b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -149,7 +149,7 @@ public:
}
ndrange_t get_window_size() const override {
- return _subgemm->get_window_size();
+ return { _subgemm->get_window_size() };
}
void set_nthreads(int nthreads) override {
@@ -158,8 +158,7 @@ public:
_args._maxthreads = nthreads;
}
- // Execute
- void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
+ void execute(const ndcoord_t &work_range, const ndcoord_t &thread_locator, int threadid) override {
_subgemm->execute(work_range, thread_locator, threadid);
if (!_args._pretransposed_hint) {
col_sums_runtime(threadid);
diff --git a/src/core/NEON/kernels/arm_gemm/quantized.cpp b/src/core/NEON/kernels/arm_gemm/quantized.cpp
index 53e5527a8d..fbf49c8a31 100644
--- a/src/core/NEON/kernels/arm_gemm/quantized.cpp
+++ b/src/core/NEON/kernels/arm_gemm/quantized.cpp
@@ -24,6 +24,7 @@
#ifdef __aarch64__
#include "arm_gemm.hpp"
+#include "utils.hpp"
#include <arm_neon.h>
@@ -283,7 +284,6 @@ void requantize_block_32_int(const Requantize32 &qp, unsigned int width, unsigne
v_mul0=v_mul;
v_shf0=v_shift;
}
-
// Load column pointers
int32x4_t v_col0 = vld1q_s32(colptr);
colptr += 4;
@@ -604,7 +604,6 @@ namespace {
* that the terms can simply be added in the requantize code.
* */
switch (rows) {
- default:
case 1:
/* If we only have one output, just use ADDV. Multiply
* the offset into all four components separately so it
@@ -646,6 +645,9 @@ namespace {
vst1q_s32(row_bias, t0);
break;
+ default:
+ UNREACHABLE("Impossible.");
+
}
}
@@ -836,7 +838,6 @@ void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int h
if (numcols==16) {
switch(numrows) {
- default:
case 1:
add_block<1>(input + row * in_stride + col, in_stride, col_bias + col);
break;
@@ -852,6 +853,9 @@ void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int h
case 4:
add_block<4>(input + row * in_stride + col, in_stride, col_bias + col);
break;
+
+ default:
+ UNREACHABLE("Impossible.");
}
} else {
for (; col<width; col++) {
diff --git a/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp b/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp
index c3c1e8d930..4b838c82a1 100644
--- a/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,7 +45,7 @@ class StdTransformsFixed
public:
template<typename TIn>
void PrepareA(TOperand *out, const TIn *in, const int stride, const int y0,
- const int ymax, const int k0, const int kmax, bool transposed) {
+ const int ymax, const int k0, const int kmax, bool transposed) const {
if (transposed) {
Transform<height, block, true>(out, in, stride, y0, ymax, k0, kmax);
} else {
@@ -55,7 +55,7 @@ public:
template<typename TIn>
void PrepareB(TOperand *out, const TIn *in, const int stride, const int x0,
- const int xmax, const int k0, const int kmax, bool transposed) {
+ const int xmax, const int k0, const int kmax, bool transposed) const {
if (transposed) {
Transform<width, block, false>(out, in, stride, x0, xmax, k0, kmax);
} else {
@@ -64,7 +64,7 @@ public:
}
template<typename TOut>
- void Merge(TOut *out, const TResult *in, int stride, int y0, int ymax, int x0, int xmax, const TOut *bias, const Activation act, bool append) {
+ void Merge(TOut *out, const TResult *in, int stride, int y0, int ymax, int x0, int xmax, const TOut *bias, const Activation act, bool append) const {
MergeResults<width, height>(out, in, stride, y0, ymax, x0, xmax, bias, act, append);
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/transform.hpp b/src/core/NEON/kernels/arm_gemm/transform.hpp
index bdae90300b..c6ea079882 100644
--- a/src/core/NEON/kernels/arm_gemm/transform.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transform.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
index 543664bb0e..5e5f65183c 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -59,7 +59,6 @@ inline void TransformImpl<6, 1, false, 4, 4, false>::Transform(T *out, const T *
/* 'first' forces this to always run at least once, needed if the total size is <=7. */
if ((y + 5) >= ymax) {
switch ((y + 5) - ymax) {
- /* Everything falls through in here */
case 4:
inptr1 = zerobuff;
// fall through
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp
index 6b742c8776..9b6f4de543 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -38,8 +38,8 @@ void TransformImpl<4, 16, false, 1, 1, false>::Transform(T *out, const T *in, in
uint8_t zerobuff[16] = { 0 };
- for (uint64_t y = y0 ; y < static_cast<uint64_t>(ymax) ; y+=4) {
- const uint8_t *inptr0 = inptr + y * ldin + k0;
+ for (int y=y0; y<ymax; y+=4) {
+ const uint8_t *inptr0 = inptr + static_cast<intptr_t>(y) * ldin + k0;
const uint8_t *inptr1 = inptr0 + ldin;
const uint8_t *inptr2 = inptr1 + ldin;
const uint8_t *inptr3 = inptr2 + ldin;
@@ -52,9 +52,8 @@ void TransformImpl<4, 16, false, 1, 1, false>::Transform(T *out, const T *in, in
int x=(kmax-k0);
for (;x>15;x-=16) {
/* Cope with ragged cases by copying from a buffer of zeroes instead */
- if ((y + 3) >= static_cast<uint64_t>(ymax)) {
+ if ((y + 3) >= ymax) {
switch ((y + 3) - ymax) {
- /* Everything falls through in here */
case 2:
inptr1 = zerobuff;
// fall through
@@ -90,9 +89,8 @@ void TransformImpl<4, 16, false, 1, 1, false>::Transform(T *out, const T *in, in
if (x>0) {
/* Need to duplicate this here, in case we didn't run the main loop. */
- if ((y + 3) >= static_cast<uint64_t>(ymax)) {
+ if ((y + 3) >= ymax) {
switch ((y + 3) - ymax) {
- /* Everything falls through in here */
case 2:
inptr1 = zerobuff;
// fall through
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp
index 80dd6c5e25..3d912c4675 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -63,7 +63,6 @@ void TransformImpl<8, 1, false, 2, 2, false>::Transform(T *out, const T *in, int
/* 'first' forces this to always run at least once, needed if the total size is <=7. */
if ((y + 7) >= ymax) {
switch ((y + 7) - ymax) {
- /* Everything falls through in here */
case 6:
inptr1 = zerobuff;
// fall through
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
index 9dfc1346e6..701d688af2 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -63,7 +63,6 @@ inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T *
/* 'first' forces this to always run at least once, needed if the total size is <=7. */
if ((y + 7) >= ymax) {
switch ((y + 7) - ymax) {
- /* Everything falls through in here */
case 6:
inptr1 = zerobuff;
// fall through
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_block4_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_block4_8bit.hpp
index 2bc7801b15..2546cc571a 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_block4_8bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_block4_8bit.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -60,7 +60,7 @@ inline void TransformImpl<8, 4, false, 1, 1, false>::Transform(T *out, const T *
}
};
- uint8_t zerobuff[64]; // 32 for asm loop plus up to 31 for overflow loop
+ uint8_t zerobuff[64] = { 0 }; // 32 for asm loop plus up to 31 for overflow loop
for (int y=y0; y<ymax; y+=8) {
const uint8_t *inptr0 = inptr + y * ldin + k0;
@@ -87,7 +87,6 @@ inline void TransformImpl<8, 4, false, 1, 1, false>::Transform(T *out, const T *
/* 'first' forces this to always run at least once, needed if the total size is <=32. */
if ((y + 7) >= ymax) {
switch ((y + 7) - ymax) {
- /* Everything falls through in here */
case 6:
inptr1 = zerobuff;
// fall through
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp
index bde3274926..a342d6c3d1 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -63,7 +63,6 @@ inline void TransformImpl<8, 1, false, 4, 2, false>::Transform(float *out, const
/* 'first' forces this to always run at least once, needed if the total size is <=7. */
if ((y + 7) >= ymax) {
switch ((y + 7) - ymax) {
- /* Everything falls through in here */
case 6:
inptr1 = zerobuff;
// fall through
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
index 8992c1010d..d7de9ff934 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
index 6d627334cd..a137f9360a 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
index be66cd42ff..2c698b2576 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,6 +34,7 @@
#include "a64_transpose_interleave_8way_32bit.hpp"
#include "sve_interleave_8way_32bit.hpp"
#include "sve_interleave_8way_block2_16bit.hpp"
+#include "sve_interleave_8way_block2_32bit.hpp"
#include "sve_interleave_8way_block4_16bit.hpp"
#include "sve_interleave_8way_block4_8bit.hpp"
#include "sve_interleave_8way_block8_8bit.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp
index 881dc7bb72..348d78e3f5 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_32bit.hpp
index 4cc4311cee..f21933b8de 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_32bit.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,12 +36,12 @@ inline void TransformImpl<8, 2, false, 4, 4, false>::Transform(T *out, const T *
{
const int height = ymax-y;
const long inwidth = (kmax - k0);
- const long outwidth = (inwidth * 8 + 1) / 2;
+ const long outwidth = ((inwidth + 1) / 2) * 16;
long inpos = 0;
long outpos = 0;
uint32_t *outptr = master_outptr;
- master_outptr += (outwidth * 2);
+ master_outptr += outwidth;
const uint32_t *inptr0 = inptr + y * ldin + k0;
const uint32_t *inptr1 = inptr0 + ldin;
@@ -60,571 +60,535 @@ inline void TransformImpl<8, 2, false, 4, 4, false>::Transform(T *out, const T *
"whilelt p0.s, %[inpos], %[inwidth]\n"
"b.none 2f\n"
"mov z4.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0]]\n"
- "zip1 z8.d, z0.d, z4.d\n"
+ "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
"incw %[inpos], all, mul #1\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip1 z8.d, z0.d, z4.d\n"
"zip2 z9.d, z0.d, z4.d\n"
- "addvl %[inptr0], %[inptr0], #1\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
"zip1 z0.d, z8.d, z4.d\n"
- "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z1.d, z8.d, z4.d\n"
- "incd %[outpos], all, mul #1\n"
"zip1 z2.d, z9.d, z4.d\n"
- "whilelt p1.d, %[outpos], %[outwidth]\n"
"zip2 z3.d, z9.d, z4.d\n"
- "incd %[outpos], all, mul #1\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
"zip1 z8.d, z0.d, z4.d\n"
- "st1d z8.d, p0, [%[outptr]]\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z9.d, z0.d, z4.d\n"
- "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
"zip1 z10.d, z1.d, z4.d\n"
- "whilelt p2.d, %[outpos], %[outwidth]\n"
+ "st1w z8.s, p0, [%[outptr]]\n"
"zip2 z11.d, z1.d, z4.d\n"
- "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
"zip1 z12.d, z2.d, z4.d\n"
- "incd %[outpos], all, mul #1\n"
+ "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
"zip2 z13.d, z2.d, z4.d\n"
- "whilelt p3.d, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z14.d, z3.d, z4.d\n"
- "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
+ "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
"zip2 z15.d, z3.d, z4.d\n"
- "incd %[outpos], all, mul #1\n"
- "whilelt p0.d, %[outpos], %[outwidth]\n"
- "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
- "incd %[outpos], all, mul #1\n"
- "whilelt p1.d, %[outpos], %[outwidth]\n"
- "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
- "incd %[outpos], all, mul #1\n"
- "whilelt p2.d, %[outpos], %[outwidth]\n"
- "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
- "incd %[outpos], all, mul #1\n"
- "whilelt p3.d, %[outpos], %[outwidth]\n"
- "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
- "incd %[outpos], all, mul #1\n"
+ "whilelt p4.s, %[outpos], %[outwidth]\n"
+ "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+ "whilelt p5.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+ "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+ "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
"addvl %[outptr], %[outptr], #8\n"
"b 1b\n"
"2:\n"
: [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0)
: [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
);
break;
-
+
case 2:
__asm __volatile(
"1:\n"
"whilelt p0.s, %[inpos], %[inwidth]\n"
"b.none 2f\n"
"mov z4.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0]]\n"
+ "mov z14.s, #0\n"
+ "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+ "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
+ "incw %[inpos], all, mul #1\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip1 z8.d, z0.d, z4.d\n"
- "ld1w z1.s, p0/z, [%[inptr1]]\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z9.d, z0.d, z4.d\n"
- "incw %[inpos], all, mul #1\n"
"zip1 z10.d, z1.d, z4.d\n"
- "addvl %[inptr0], %[inptr0], #1\n"
"zip2 z11.d, z1.d, z4.d\n"
- "addvl %[inptr1], %[inptr1], #1\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
"zip1 z0.d, z8.d, z4.d\n"
- "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z1.d, z8.d, z4.d\n"
- "incd %[outpos], all, mul #1\n"
"zip1 z2.d, z9.d, z4.d\n"
- "whilelt p1.d, %[outpos], %[outwidth]\n"
"zip2 z3.d, z9.d, z4.d\n"
- "incd %[outpos], all, mul #1\n"
- "mov z14.s, #0\n"
- "whilelt p2.d, %[outpos], %[outwidth]\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
"zip1 z4.d, z10.d, z14.d\n"
- "incd %[outpos], all, mul #1\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z5.d, z10.d, z14.d\n"
- "whilelt p3.d, %[outpos], %[outwidth]\n"
"zip1 z6.d, z11.d, z14.d\n"
- "incd %[outpos], all, mul #1\n"
"zip2 z7.d, z11.d, z14.d\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
"zip1 z8.d, z0.d, z4.d\n"
- "st1d z8.d, p0, [%[outptr]]\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z9.d, z0.d, z4.d\n"
- "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
"zip1 z10.d, z1.d, z5.d\n"
- "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+ "st1w z8.s, p0, [%[outptr]]\n"
"zip2 z11.d, z1.d, z5.d\n"
- "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
+ "whilelt p4.s, %[outpos], %[outwidth]\n"
"zip1 z12.d, z2.d, z6.d\n"
- "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
"zip2 z13.d, z2.d, z6.d\n"
- "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z14.d, z3.d, z7.d\n"
- "incd %[outpos], all, mul #1\n"
+ "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
"zip2 z15.d, z3.d, z7.d\n"
- "whilelt p1.d, %[outpos], %[outwidth]\n"
- "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
- "incd %[outpos], all, mul #1\n"
- "whilelt p2.d, %[outpos], %[outwidth]\n"
- "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
- "incd %[outpos], all, mul #1\n"
- "whilelt p3.d, %[outpos], %[outwidth]\n"
- "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
- "incd %[outpos], all, mul #1\n"
+ "whilelt p5.s, %[outpos], %[outwidth]\n"
+ "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+ "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+ "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+ "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
"addvl %[outptr], %[outptr], #8\n"
"b 1b\n"
"2:\n"
: [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1)
: [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
);
break;
-
+
case 3:
__asm __volatile(
"1:\n"
"whilelt p0.s, %[inpos], %[inwidth]\n"
"b.none 2f\n"
"mov z4.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0]]\n"
+ "mov z14.s, #0\n"
+ "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+ "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
+ "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
+ "incw %[inpos], all, mul #1\n"
"zip1 z8.d, z0.d, z4.d\n"
- "ld1w z1.s, p0/z, [%[inptr1]]\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip2 z9.d, z0.d, z4.d\n"
- "ld1w z2.s, p0/z, [%[inptr2]]\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z10.d, z1.d, z4.d\n"
- "incw %[inpos], all, mul #1\n"
"zip2 z11.d, z1.d, z4.d\n"
- "addvl %[inptr0], %[inptr0], #1\n"
"zip1 z12.d, z2.d, z4.d\n"
- "addvl %[inptr1], %[inptr1], #1\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
"zip2 z13.d, z2.d, z4.d\n"
- "addvl %[inptr2], %[inptr2], #1\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z0.d, z8.d, z12.d\n"
- "whilelt p0.d, %[outpos], %[outwidth]\n"
"zip2 z1.d, z8.d, z12.d\n"
- "incd %[outpos], all, mul #1\n"
"zip1 z2.d, z9.d, z13.d\n"
- "whilelt p1.d, %[outpos], %[outwidth]\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
"zip2 z3.d, z9.d, z13.d\n"
- "incd %[outpos], all, mul #1\n"
- "mov z14.s, #0\n"
- "whilelt p2.d, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z4.d, z10.d, z14.d\n"
- "incd %[outpos], all, mul #1\n"
"zip2 z5.d, z10.d, z14.d\n"
- "whilelt p3.d, %[outpos], %[outwidth]\n"
"zip1 z6.d, z11.d, z14.d\n"
- "incd %[outpos], all, mul #1\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
"zip2 z7.d, z11.d, z14.d\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z8.d, z0.d, z4.d\n"
- "st1d z8.d, p0, [%[outptr]]\n"
"zip2 z9.d, z0.d, z4.d\n"
- "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
"zip1 z10.d, z1.d, z5.d\n"
- "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+ "whilelt p4.s, %[outpos], %[outwidth]\n"
"zip2 z11.d, z1.d, z5.d\n"
- "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
+ "st1w z8.s, p0, [%[outptr]]\n"
"zip1 z12.d, z2.d, z6.d\n"
- "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z13.d, z2.d, z6.d\n"
- "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+ "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
"zip1 z14.d, z3.d, z7.d\n"
- "incd %[outpos], all, mul #1\n"
"zip2 z15.d, z3.d, z7.d\n"
- "whilelt p1.d, %[outpos], %[outwidth]\n"
- "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
- "incd %[outpos], all, mul #1\n"
- "whilelt p2.d, %[outpos], %[outwidth]\n"
- "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
- "incd %[outpos], all, mul #1\n"
- "whilelt p3.d, %[outpos], %[outwidth]\n"
- "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
- "incd %[outpos], all, mul #1\n"
+ "whilelt p5.s, %[outpos], %[outwidth]\n"
+ "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+ "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+ "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+ "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+ "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
"addvl %[outptr], %[outptr], #8\n"
"b 1b\n"
"2:\n"
: [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2)
: [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
);
break;
-
+
case 4:
__asm __volatile(
"1:\n"
"whilelt p0.s, %[inpos], %[inwidth]\n"
"b.none 2f\n"
"mov z4.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0]]\n"
+ "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+ "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
+ "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
+ "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
+ "incw %[inpos], all, mul #1\n"
"zip1 z8.d, z0.d, z4.d\n"
- "ld1w z1.s, p0/z, [%[inptr1]]\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip2 z9.d, z0.d, z4.d\n"
- "ld1w z2.s, p0/z, [%[inptr2]]\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z10.d, z1.d, z4.d\n"
- "ld1w z3.s, p0/z, [%[inptr3]]\n"
"zip2 z11.d, z1.d, z4.d\n"
- "incw %[inpos], all, mul #1\n"
"zip1 z12.d, z2.d, z4.d\n"
- "addvl %[inptr0], %[inptr0], #1\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
"zip2 z13.d, z2.d, z4.d\n"
- "addvl %[inptr1], %[inptr1], #1\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z14.d, z3.d, z4.d\n"
- "addvl %[inptr2], %[inptr2], #1\n"
"zip2 z15.d, z3.d, z4.d\n"
- "addvl %[inptr3], %[inptr3], #1\n"
"zip1 z0.d, z8.d, z12.d\n"
- "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
"zip2 z1.d, z8.d, z12.d\n"
- "incd %[outpos], all, mul #1\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z2.d, z9.d, z13.d\n"
- "whilelt p1.d, %[outpos], %[outwidth]\n"
"zip2 z3.d, z9.d, z13.d\n"
- "incd %[outpos], all, mul #1\n"
"zip1 z4.d, z10.d, z14.d\n"
- "whilelt p2.d, %[outpos], %[outwidth]\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
"zip2 z5.d, z10.d, z14.d\n"
- "incd %[outpos], all, mul #1\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z6.d, z11.d, z15.d\n"
- "whilelt p3.d, %[outpos], %[outwidth]\n"
"zip2 z7.d, z11.d, z15.d\n"
- "incd %[outpos], all, mul #1\n"
"zip1 z8.d, z0.d, z4.d\n"
- "st1d z8.d, p0, [%[outptr]]\n"
+ "whilelt p4.s, %[outpos], %[outwidth]\n"
"zip2 z9.d, z0.d, z4.d\n"
- "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z10.d, z1.d, z5.d\n"
- "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+ "st1w z8.s, p0, [%[outptr]]\n"
"zip2 z11.d, z1.d, z5.d\n"
- "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
"zip1 z12.d, z2.d, z6.d\n"
- "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "whilelt p5.s, %[outpos], %[outwidth]\n"
"zip2 z13.d, z2.d, z6.d\n"
- "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+ "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
"zip1 z14.d, z3.d, z7.d\n"
- "incd %[outpos], all, mul #1\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z15.d, z3.d, z7.d\n"
- "whilelt p1.d, %[outpos], %[outwidth]\n"
- "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
- "incd %[outpos], all, mul #1\n"
- "whilelt p2.d, %[outpos], %[outwidth]\n"
- "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
- "incd %[outpos], all, mul #1\n"
- "whilelt p3.d, %[outpos], %[outwidth]\n"
- "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
- "incd %[outpos], all, mul #1\n"
+ "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+ "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+ "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+ "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+ "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
"addvl %[outptr], %[outptr], #8\n"
"b 1b\n"
"2:\n"
: [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3)
: [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
);
break;
-
+
case 5:
__asm __volatile(
"1:\n"
"whilelt p0.s, %[inpos], %[inwidth]\n"
"b.none 2f\n"
"mov z5.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0]]\n"
- "ld1w z1.s, p0/z, [%[inptr1]]\n"
+ "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+ "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
+ "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
+ "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
+ "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
"incw %[inpos], all, mul #1\n"
"zip1 z10.d, z1.d, z5.d\n"
- "ld1w z2.s, p0/z, [%[inptr2]]\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "ld1w z3.s, p0/z, [%[inptr3]]\n"
- "zip1 z12.d, z2.d, z5.d\n"
- "ld1w z4.s, p0/z, [%[inptr4]]\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip1 z8.d, z0.d, z4.d\n"
- "addvl %[inptr0], %[inptr0], #1\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z9.d, z0.d, z4.d\n"
- "addvl %[inptr1], %[inptr1], #1\n"
+ "zip2 z11.d, z1.d, z5.d\n"
+ "zip1 z12.d, z2.d, z5.d\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
"zip2 z13.d, z2.d, z5.d\n"
- "addvl %[inptr2], %[inptr2], #1\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z14.d, z3.d, z5.d\n"
- "addvl %[inptr3], %[inptr3], #1\n"
"zip2 z15.d, z3.d, z5.d\n"
- "addvl %[inptr4], %[inptr4], #1\n"
"zip1 z0.d, z8.d, z12.d\n"
- "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
"zip2 z1.d, z8.d, z12.d\n"
- "incd %[outpos], all, mul #1\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z2.d, z9.d, z13.d\n"
- "whilelt p1.d, %[outpos], %[outwidth]\n"
"zip2 z3.d, z9.d, z13.d\n"
- "incd %[outpos], all, mul #1\n"
"zip1 z4.d, z10.d, z14.d\n"
- "whilelt p2.d, %[outpos], %[outwidth]\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
"zip2 z5.d, z10.d, z14.d\n"
- "incd %[outpos], all, mul #1\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z6.d, z11.d, z15.d\n"
- "whilelt p3.d, %[outpos], %[outwidth]\n"
"zip2 z7.d, z11.d, z15.d\n"
- "incd %[outpos], all, mul #1\n"
"zip1 z8.d, z0.d, z4.d\n"
- "st1d z8.d, p0, [%[outptr]]\n"
+ "whilelt p4.s, %[outpos], %[outwidth]\n"
"zip2 z9.d, z0.d, z4.d\n"
- "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z10.d, z1.d, z5.d\n"
- "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+ "st1w z8.s, p0, [%[outptr]]\n"
"zip2 z11.d, z1.d, z5.d\n"
- "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
"zip1 z12.d, z2.d, z6.d\n"
- "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "whilelt p5.s, %[outpos], %[outwidth]\n"
"zip2 z13.d, z2.d, z6.d\n"
- "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+ "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
"zip1 z14.d, z3.d, z7.d\n"
- "incd %[outpos], all, mul #1\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z15.d, z3.d, z7.d\n"
- "whilelt p1.d, %[outpos], %[outwidth]\n"
- "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
- "incd %[outpos], all, mul #1\n"
- "whilelt p2.d, %[outpos], %[outwidth]\n"
- "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
- "incd %[outpos], all, mul #1\n"
- "whilelt p3.d, %[outpos], %[outwidth]\n"
- "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
- "incd %[outpos], all, mul #1\n"
+ "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+ "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+ "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+ "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+ "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
"addvl %[outptr], %[outptr], #8\n"
"b 1b\n"
"2:\n"
: [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4)
: [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
);
break;
-
+
case 6:
__asm __volatile(
"1:\n"
"whilelt p0.s, %[inpos], %[inwidth]\n"
"b.none 2f\n"
"mov z6.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0]]\n"
- "ld1w z1.s, p0/z, [%[inptr1]]\n"
+ "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+ "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
+ "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
+ "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
+ "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
+ "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
"incw %[inpos], all, mul #1\n"
- "ld1w z2.s, p0/z, [%[inptr2]]\n"
- "addvl %[inptr0], %[inptr0], #1\n"
"zip1 z12.d, z2.d, z6.d\n"
- "ld1w z3.s, p0/z, [%[inptr3]]\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "ld1w z4.s, p0/z, [%[inptr4]]\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip1 z8.d, z0.d, z4.d\n"
- "ld1w z5.s, p0/z, [%[inptr5]]\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z9.d, z0.d, z4.d\n"
- "addvl %[inptr1], %[inptr1], #1\n"
"zip1 z10.d, z1.d, z5.d\n"
- "addvl %[inptr2], %[inptr2], #1\n"
"zip2 z11.d, z1.d, z5.d\n"
- "addvl %[inptr3], %[inptr3], #1\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "zip2 z13.d, z2.d, z6.d\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z14.d, z3.d, z6.d\n"
- "addvl %[inptr4], %[inptr4], #1\n"
"zip2 z15.d, z3.d, z6.d\n"
- "addvl %[inptr5], %[inptr5], #1\n"
"zip1 z0.d, z8.d, z12.d\n"
- "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
"zip2 z1.d, z8.d, z12.d\n"
- "incd %[outpos], all, mul #1\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z2.d, z9.d, z13.d\n"
- "whilelt p1.d, %[outpos], %[outwidth]\n"
"zip2 z3.d, z9.d, z13.d\n"
- "incd %[outpos], all, mul #1\n"
"zip1 z4.d, z10.d, z14.d\n"
- "whilelt p2.d, %[outpos], %[outwidth]\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
"zip2 z5.d, z10.d, z14.d\n"
- "incd %[outpos], all, mul #1\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z6.d, z11.d, z15.d\n"
- "whilelt p3.d, %[outpos], %[outwidth]\n"
"zip2 z7.d, z11.d, z15.d\n"
- "incd %[outpos], all, mul #1\n"
"zip1 z8.d, z0.d, z4.d\n"
- "st1d z8.d, p0, [%[outptr]]\n"
+ "whilelt p4.s, %[outpos], %[outwidth]\n"
"zip2 z9.d, z0.d, z4.d\n"
- "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z10.d, z1.d, z5.d\n"
- "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+ "st1w z8.s, p0, [%[outptr]]\n"
"zip2 z11.d, z1.d, z5.d\n"
- "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
"zip1 z12.d, z2.d, z6.d\n"
- "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "whilelt p5.s, %[outpos], %[outwidth]\n"
"zip2 z13.d, z2.d, z6.d\n"
- "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+ "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
"zip1 z14.d, z3.d, z7.d\n"
- "incd %[outpos], all, mul #1\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z15.d, z3.d, z7.d\n"
- "whilelt p1.d, %[outpos], %[outwidth]\n"
- "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
- "incd %[outpos], all, mul #1\n"
- "whilelt p2.d, %[outpos], %[outwidth]\n"
- "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
- "incd %[outpos], all, mul #1\n"
- "whilelt p3.d, %[outpos], %[outwidth]\n"
- "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
- "incd %[outpos], all, mul #1\n"
+ "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+ "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+ "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+ "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+ "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
"addvl %[outptr], %[outptr], #8\n"
"b 1b\n"
"2:\n"
: [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5)
: [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
);
break;
-
+
case 7:
__asm __volatile(
"1:\n"
"whilelt p0.s, %[inpos], %[inwidth]\n"
"b.none 2f\n"
"mov z7.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0]]\n"
- "ld1w z1.s, p0/z, [%[inptr1]]\n"
+ "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+ "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
+ "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
+ "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
+ "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
+ "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
+ "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n"
"incw %[inpos], all, mul #1\n"
- "ld1w z2.s, p0/z, [%[inptr2]]\n"
- "addvl %[inptr0], %[inptr0], #1\n"
- "ld1w z3.s, p0/z, [%[inptr3]]\n"
- "addvl %[inptr1], %[inptr1], #1\n"
- "zip1 z14.d, z3.d, z7.d\n"
- "ld1w z4.s, p0/z, [%[inptr4]]\n"
"zip1 z8.d, z0.d, z4.d\n"
- "ld1w z5.s, p0/z, [%[inptr5]]\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip2 z9.d, z0.d, z4.d\n"
- "ld1w z6.s, p0/z, [%[inptr6]]\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z10.d, z1.d, z5.d\n"
- "addvl %[inptr2], %[inptr2], #1\n"
"zip2 z11.d, z1.d, z5.d\n"
- "addvl %[inptr3], %[inptr3], #1\n"
"zip1 z12.d, z2.d, z6.d\n"
- "addvl %[inptr4], %[inptr4], #1\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
"zip2 z13.d, z2.d, z6.d\n"
- "addvl %[inptr5], %[inptr5], #1\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip1 z14.d, z3.d, z7.d\n"
"zip2 z15.d, z3.d, z7.d\n"
- "addvl %[inptr6], %[inptr6], #1\n"
"zip1 z0.d, z8.d, z12.d\n"
- "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
"zip2 z1.d, z8.d, z12.d\n"
- "incd %[outpos], all, mul #1\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z2.d, z9.d, z13.d\n"
- "whilelt p1.d, %[outpos], %[outwidth]\n"
"zip2 z3.d, z9.d, z13.d\n"
- "incd %[outpos], all, mul #1\n"
"zip1 z4.d, z10.d, z14.d\n"
- "whilelt p2.d, %[outpos], %[outwidth]\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
"zip2 z5.d, z10.d, z14.d\n"
- "incd %[outpos], all, mul #1\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z6.d, z11.d, z15.d\n"
- "whilelt p3.d, %[outpos], %[outwidth]\n"
"zip2 z7.d, z11.d, z15.d\n"
- "incd %[outpos], all, mul #1\n"
"zip1 z8.d, z0.d, z4.d\n"
- "st1d z8.d, p0, [%[outptr]]\n"
+ "whilelt p4.s, %[outpos], %[outwidth]\n"
"zip2 z9.d, z0.d, z4.d\n"
- "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z10.d, z1.d, z5.d\n"
- "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+ "st1w z8.s, p0, [%[outptr]]\n"
"zip2 z11.d, z1.d, z5.d\n"
- "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
"zip1 z12.d, z2.d, z6.d\n"
- "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "whilelt p5.s, %[outpos], %[outwidth]\n"
"zip2 z13.d, z2.d, z6.d\n"
- "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+ "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
"zip1 z14.d, z3.d, z7.d\n"
- "incd %[outpos], all, mul #1\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z15.d, z3.d, z7.d\n"
- "whilelt p1.d, %[outpos], %[outwidth]\n"
- "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
- "incd %[outpos], all, mul #1\n"
- "whilelt p2.d, %[outpos], %[outwidth]\n"
- "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
- "incd %[outpos], all, mul #1\n"
- "whilelt p3.d, %[outpos], %[outwidth]\n"
- "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
- "incd %[outpos], all, mul #1\n"
+ "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+ "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+ "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+ "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+ "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
"addvl %[outptr], %[outptr], #8\n"
"b 1b\n"
"2:\n"
: [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6)
: [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
);
break;
-
+
default:
case 8:
__asm __volatile(
"1:\n"
"whilelt p0.s, %[inpos], %[inwidth]\n"
"b.none 2f\n"
- "ld1w z0.s, p0/z, [%[inptr0]]\n"
+ "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+ "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
+ "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
+ "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
+ "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
+ "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
+ "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n"
+ "ld1w z7.s, p0/z, [%[inptr7], %[inpos], LSL #2]\n"
"incw %[inpos], all, mul #1\n"
- "ld1w z1.s, p0/z, [%[inptr1]]\n"
- "addvl %[inptr0], %[inptr0], #1\n"
- "ld1w z2.s, p0/z, [%[inptr2]]\n"
- "addvl %[inptr1], %[inptr1], #1\n"
- "ld1w z3.s, p0/z, [%[inptr3]]\n"
- "addvl %[inptr2], %[inptr2], #1\n"
- "ld1w z4.s, p0/z, [%[inptr4]]\n"
- "addvl %[inptr3], %[inptr3], #1\n"
"zip1 z8.d, z0.d, z4.d\n"
- "ld1w z5.s, p0/z, [%[inptr5]]\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip2 z9.d, z0.d, z4.d\n"
- "ld1w z6.s, p0/z, [%[inptr6]]\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z10.d, z1.d, z5.d\n"
- "ld1w z7.s, p0/z, [%[inptr7]]\n"
"zip2 z11.d, z1.d, z5.d\n"
- "addvl %[inptr4], %[inptr4], #1\n"
"zip1 z12.d, z2.d, z6.d\n"
- "addvl %[inptr5], %[inptr5], #1\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
"zip2 z13.d, z2.d, z6.d\n"
- "addvl %[inptr6], %[inptr6], #1\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z14.d, z3.d, z7.d\n"
- "addvl %[inptr7], %[inptr7], #1\n"
"zip2 z15.d, z3.d, z7.d\n"
- "whilelt p0.d, %[outpos], %[outwidth]\n"
"zip1 z0.d, z8.d, z12.d\n"
- "incd %[outpos], all, mul #1\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
"zip2 z1.d, z8.d, z12.d\n"
- "whilelt p1.d, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z2.d, z9.d, z13.d\n"
- "incd %[outpos], all, mul #1\n"
"zip2 z3.d, z9.d, z13.d\n"
- "whilelt p2.d, %[outpos], %[outwidth]\n"
"zip1 z4.d, z10.d, z14.d\n"
- "incd %[outpos], all, mul #1\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
"zip2 z5.d, z10.d, z14.d\n"
- "whilelt p3.d, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z6.d, z11.d, z15.d\n"
- "incd %[outpos], all, mul #1\n"
"zip2 z7.d, z11.d, z15.d\n"
"zip1 z8.d, z0.d, z4.d\n"
- "st1d z8.d, p0, [%[outptr]]\n"
+ "whilelt p4.s, %[outpos], %[outwidth]\n"
"zip2 z9.d, z0.d, z4.d\n"
- "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z10.d, z1.d, z5.d\n"
- "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+ "st1w z8.s, p0, [%[outptr]]\n"
"zip2 z11.d, z1.d, z5.d\n"
- "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
"zip1 z12.d, z2.d, z6.d\n"
- "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "whilelt p5.s, %[outpos], %[outwidth]\n"
"zip2 z13.d, z2.d, z6.d\n"
- "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+ "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
"zip1 z14.d, z3.d, z7.d\n"
- "incd %[outpos], all, mul #1\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z15.d, z3.d, z7.d\n"
- "whilelt p1.d, %[outpos], %[outwidth]\n"
- "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
- "incd %[outpos], all, mul #1\n"
- "whilelt p2.d, %[outpos], %[outwidth]\n"
- "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
- "incd %[outpos], all, mul #1\n"
- "whilelt p3.d, %[outpos], %[outwidth]\n"
- "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
- "incd %[outpos], all, mul #1\n"
+ "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+ "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+ "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+ "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+ "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
"addvl %[outptr], %[outptr], #8\n"
"b 1b\n"
"2:\n"
: [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7)
: [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
);
break;
-
-
+
+
}
}
}
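
Across cases 4-8 the rewrite follows a single pattern: the ld1w loads now index the row pointers directly with [inptrN, %[inpos], LSL #2] so the per-row addvl pointer bumps disappear, the output predication moves from doubleword granularity (whilelt pN.d / incd / st1d) to word granularity (whilelt pN.s / incw / st1w), and eight distinct predicates p0-p7 are computed once per iteration instead of recycling p0-p3 between the two store groups, which is why the clobber lists gain p4-p7. The zip tree itself is essentially unchanged, only rescheduled; it still interleaves up to eight rows at two-word (64-bit) granularity. A rough scalar equivalent, as a hedged sketch with illustrative names only (not part of the patch), is:

#include <cstddef>
#include <cstdint>

// Hypothetical reference routine: 'rows' corresponds to the case label in the
// switch above; absent rows and tail columns are zero-filled, matching the
// zeroing predicated loads (p0/z) in the assembly.
static void interleave_8way_block2_ref(uint32_t *out, const uint32_t *const in[8],
                                       size_t rows, size_t width)
{
    for (size_t x = 0; x < width; x += 2) {        // two-word blocks per row
        for (size_t r = 0; r < 8; r++) {           // cycle through 8 output rows
            for (size_t b = 0; b < 2; b++) {
                const size_t col = x + b;
                *out++ = (r < rows && col < width) ? in[r][col] : 0;
            }
        }
    }
}
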
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_8bit.hpp
index a96a43cbeb..ed0d58aa91 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_8bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_8bit.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 - 2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp
index 7dbbe91ba2..c49db2cb16 100644
--- a/src/core/NEON/kernels/arm_gemm/utils.hpp
+++ b/src/core/NEON/kernels/arm_gemm/utils.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,8 +32,6 @@
// Paranoid option for the above with assert
// #define UNREACHABLE(why) assert(0 && why)
-#define UNUSED(x) (void)(x)
-
template<typename T>
inline T iceildiv(const T a, const T b) {
return (a + b - 1) / b;
diff --git a/src/core/NEON/kernels/assembly/arm_gemm.hpp b/src/core/NEON/kernels/assembly/arm_gemm.hpp
index 7723224ec8..2df7132500 100644
--- a/src/core/NEON/kernels/assembly/arm_gemm.hpp
+++ b/src/core/NEON/kernels/assembly/arm_gemm.hpp
@@ -23,14 +23,14 @@
*/
#pragma once
-#include <memory>
#include <cstring>
+#include <memory>
#include "arm_gemm_local.hpp"
#include "gemm_common.hpp"
-namespace arm_gemm {
-
+namespace arm_gemm
+{
enum class GemmMethod
{
DEFAULT,
@@ -47,12 +47,17 @@ enum class GemmMethod
struct KernelDescription
{
- GemmMethod method = GemmMethod::DEFAULT;
- std::string name = "";
- bool is_default = false;
+ GemmMethod method = GemmMethod::DEFAULT;
+ std::string name = "";
+ bool is_default = false;
- KernelDescription(GemmMethod m, std::string n, bool d=false) : method(m), name(n), is_default(d) { }
- KernelDescription() noexcept { }
+ KernelDescription(GemmMethod m, std::string n, bool d = false)
+ : method(m), name(n), is_default(d)
+ {
+ }
+ KernelDescription() noexcept
+ {
+ }
};
struct GemmConfig
@@ -62,23 +67,32 @@ struct GemmConfig
unsigned int inner_block_size = 0;
unsigned int outer_block_size = 0;
- GemmConfig(GemmMethod method) : method(method) { }
- GemmConfig() { }
+ GemmConfig(GemmMethod method)
+ : method(method)
+ {
+ }
+ GemmConfig()
+ {
+ }
};
struct Activation
{
- enum class Type {
+ enum class Type
+ {
None,
ReLU,
BoundedReLU
};
- Type type;
- float param1;
- float param2;
+ Type type;
+ float param1;
+ float param2;
- Activation(Type type=Type::None, float p1=0.0f, float p2=0.0f) : type(type), param1(p1), param2(p2) { }
+ Activation(Type type = Type::None, float p1 = 0.0f, float p2 = 0.0f)
+ : type(type), param1(p1), param2(p2)
+ {
+ }
};
struct GemmArgs
@@ -101,10 +115,8 @@ public:
const unsigned int K, const unsigned int nbatches,
const unsigned int nmulti, const bool trA, const bool trB,
Activation act, const int maxthreads,
- const bool pretransposed_hint, const GemmConfig *cfg=nullptr ) :
- _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmulti(nmulti),
- _trA(trA), _trB(trB), _act(act), _maxthreads(maxthreads),
- _pretransposed_hint(pretransposed_hint), _cfg(cfg)
+ const bool pretransposed_hint, const GemmConfig *cfg = nullptr)
+ : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmulti(nmulti), _trA(trA), _trB(trB), _act(act), _maxthreads(maxthreads), _pretransposed_hint(pretransposed_hint), _cfg(cfg)
{
}
};
@@ -112,18 +124,18 @@ public:
struct Requantize32
{
public:
- const int32_t *bias = nullptr;
- size_t bias_multi_stride = 0;
- int32_t a_offset = 0;
- int32_t b_offset = 0;
- int32_t c_offset = 0;
- bool per_channel_requant = false;
- int32_t per_layer_shift = 0;
- int32_t per_layer_mul = 0;
- const int32_t *per_channel_shifts = nullptr;
- const int32_t *per_channel_muls = nullptr;
- int32_t minval = 0;
- int32_t maxval = 0;
+ const int32_t *bias = nullptr;
+ size_t bias_multi_stride = 0;
+ int32_t a_offset = 0;
+ int32_t b_offset = 0;
+ int32_t c_offset = 0;
+ bool per_channel_requant = false;
+ int32_t per_layer_shift = 0;
+ int32_t per_layer_mul = 0;
+ const int32_t *per_channel_shifts = nullptr;
+ const int32_t *per_channel_muls = nullptr;
+ int32_t minval = 0;
+ int32_t maxval = 0;
Requantize32() = default;
@@ -131,11 +143,9 @@ public:
Requantize32(const int32_t *bias, size_t bias_multi_stride,
int32_t a_offset, int32_t b_offset, int32_t c_offset,
int32_t requant_shift, int32_t requant_mul,
- int32_t minv, int32_t maxv) :
- bias(bias), bias_multi_stride(bias_multi_stride),
- a_offset(a_offset), b_offset(b_offset), c_offset(c_offset),
- per_channel_requant(false), per_layer_shift(requant_shift), per_layer_mul(requant_mul),
- minval(minv), maxval(maxv)
+ int32_t minv, int32_t maxv)
+ : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(false), per_layer_shift(requant_shift), per_layer_mul(requant_mul),
+ minval(minv), maxval(maxv)
{
}
@@ -143,11 +153,9 @@ public:
Requantize32(const int32_t *bias, size_t bias_multi_stride,
int32_t a_offset, int32_t b_offset, int32_t c_offset,
const int32_t *requant_shifts, const int32_t *requant_muls,
- int32_t minv, int32_t maxv) :
- bias(bias), bias_multi_stride(bias_multi_stride),
- a_offset(a_offset), b_offset(b_offset), c_offset(c_offset),
- per_channel_requant(true), per_channel_shifts(requant_shifts), per_channel_muls(requant_muls),
- minval(minv), maxval(maxv)
+ int32_t minv, int32_t maxv)
+ : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(true), per_channel_shifts(requant_shifts),
+ per_channel_muls(requant_muls), minval(minv), maxval(maxv)
{
}
};
@@ -156,21 +164,21 @@ struct Nothing
{
};
-template<typename Top, typename Tret>
-using UniqueGemmCommon = std::unique_ptr<GemmCommon<Top, Tret> >;
+template <typename Top, typename Tret>
+using UniqueGemmCommon = std::unique_ptr<GemmCommon<Top, Tret>>;
/* Low level API calls.
* These are implemented as 'GemmArgs' versions, or with the arguments explicitly listed. */
/* get_gemm_method(): Given the templated types and provided parameters,
* which is the preferred method to implement this GEMM? */
-template<typename Top, typename Tret, class OutputStage = Nothing>
-KernelDescription get_gemm_method(const GemmArgs &args, const OutputStage & ={});
+template <typename Top, typename Tret, class OutputStage = Nothing>
+KernelDescription get_gemm_method(const GemmArgs &args, const OutputStage & = {});
-template<typename Top, typename Tret, class OutputStage = Nothing>
-UniqueGemmCommon<Top, Tret> gemm(const GemmArgs &args, const OutputStage & ={});
+template <typename Top, typename Tret, class OutputStage = Nothing>
+UniqueGemmCommon<Top, Tret> gemm(const GemmArgs &args, const OutputStage & = {});
-template<typename Top, typename Tret, class OutputStage = Nothing>
-std::vector<KernelDescription> get_compatible_kernels(const GemmArgs &args, const OutputStage & ={});
+template <typename Top, typename Tret, class OutputStage = Nothing>
+std::vector<KernelDescription> get_compatible_kernels(const GemmArgs &args, const OutputStage & = {});
} // namespace arm_gemm
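
The declarations above form the low-level selection API: build a GemmArgs, optionally ask get_gemm_method() which kernel would be chosen, then call gemm<Top, Tret>() to obtain a UniqueGemmCommon. A minimal hedged sketch follows; the CPUInfo type, the include path and the GemmArgs parameters ahead of K are assumptions inferred from the member initializer list above (they sit outside this hunk).

#include "arm_gemm.hpp"

// Hedged sketch only, not part of the patch.
arm_gemm::UniqueGemmCommon<float, float> make_sgemm(const CPUInfo *ci,
                                                    unsigned int M, unsigned int N, unsigned int K)
{
    arm_gemm::GemmArgs args(ci, M, N, K,
                            /* nbatches */ 1, /* nmulti */ 1,
                            /* trA */ false, /* trB */ false,
                            arm_gemm::Activation(),          // no activation
                            /* maxthreads */ 1,
                            /* pretransposed_hint */ false); // cfg defaults to nullptr

    // Report which method/kernel would be selected for these arguments.
    arm_gemm::KernelDescription kd = arm_gemm::get_gemm_method<float, float>(args);
    (void)kd;

    return arm_gemm::gemm<float, float>(args);
}
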
diff --git a/src/core/NEON/kernels/assembly/gemm_common.hpp b/src/core/NEON/kernels/assembly/gemm_common.hpp
index a44b774b9d..3b4c025371 100644
--- a/src/core/NEON/kernels/assembly/gemm_common.hpp
+++ b/src/core/NEON/kernels/assembly/gemm_common.hpp
@@ -23,15 +23,12 @@
*/
#pragma once
-#include "arm_gemm_compute_iface.hpp"
+#include "ndrange.hpp"
#include <cstddef>
-#include <cassert>
-
-#define UNUSED(x) (void)(x)
-
-namespace arm_gemm {
+namespace arm_gemm
+{
// Abstract class for the GEMM/GEMV functions.
//
// GEMM implementations may be "native" (never require any input
@@ -41,7 +38,8 @@ namespace arm_gemm {
// The real GemmCommon class is templated based on the operand and return
// type. This is an interface class which is independent of those types.
-class IGemmCommon {
+class IGemmCommon
+{
public:
/* Pass in the pointers to the arrays to be operated on and their
* strides. This "generic" version uses void *s, the preferred version
@@ -50,9 +48,9 @@ public:
* the settings for B here are ignored.
*/
virtual void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride,
- const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
- void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
- const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) = 0;
+ const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
+ void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
+ const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) = 0;
/** @returns an ndrange containing ranges of the compute space which can be
* broken up and parallelised over
@@ -71,47 +69,64 @@ public:
* This has an empty default implementation, as GEMMs which don't care
* about thread count can safely ignore this.
*/
- virtual void set_nthreads(int) { };
+ virtual void set_nthreads(int) {};
/* Whether this GEMM can be dynamically scheduled or not. */
- virtual bool supports_dynamic_scheduling() const { return false; }
+ virtual bool supports_dynamic_scheduling() const
+ {
+ return false;
+ }
/** Main execute member function
* @param [in] work_range specifies the range of work we want to be computed, total range defined by get_window_size()
* @param [in] thread_locator where are we inside of the thread space
* @param [in] threadid a unique threadid
*/
- virtual void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) = 0;
+ virtual void execute(const ndcoord_t &work_range, const ndcoord_t &thread_locator, int threadid) = 0;
/*** Working space interface (optional) ***/
/* Total number of bytes of temporary working space needed. If zero, it's not necessary to call set_working_space(). */
- virtual size_t get_working_size() const { return 0; }
+ virtual size_t get_working_size() const
+ {
+ return 0;
+ }
/* Provide working space buffer - the void * passed in must remain allocated for the duration of any execute calls. */
- virtual void set_working_space(void *) { };
+ virtual void set_working_space(void *) {};
/*** "Pretransposed" interface (optional) ***/
/* Is this object set up for pretranspose? If so, pretranspose_array() needs to be called before execute(); */
- virtual bool B_is_pretransposed() const { return false; }
+ virtual bool B_is_pretransposed() const
+ {
+ return false;
+ }
/* Does pretranspose still need to be done? */
- virtual bool B_pretranspose_required() const { return false; }
+ virtual bool B_pretranspose_required() const
+ {
+ return false;
+ }
/* Total number of bytes of space needed for pretransposed arrays. */
- virtual size_t get_B_pretransposed_array_size() const { return 0; }
+ virtual size_t get_B_pretransposed_array_size() const
+ {
+ return 0;
+ }
/* Perform pretranspose - arguments are output, input, input row stride and input multi stride. */
/* The "real" version of this depends on the templated operand type (see below). */
virtual void pretranspose_B_array_generic(void *, const void *, const int, const int) = 0;
/* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */
- virtual void set_pretransposed_B_data(void *) { }
+ virtual void set_pretransposed_B_data(void *)
+ {
+ }
/*** "Quantized bias" interface (optional) ***/
/* Set the bias vector for quantized GEMMs */
- virtual void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride)
+ virtual void set_quantized_bias(const int32_t *, size_t)
{
- UNUSED(bias);
- UNUSED(bias_multi_stride);
}
// Destructor
- virtual ~IGemmCommon() { }
+ virtual ~IGemmCommon()
+ {
+ }
};
/* "Real" GemmCommon class which is templated on the operand and return types.
@@ -121,50 +136,53 @@ public:
* 'set_arrays' to capture the provided arguments in protected class
* members, as essentially any implementation will need these.
*/
-template<typename To, typename Tr>
-class GemmCommon : public IGemmCommon {
+template <typename To, typename Tr>
+class GemmCommon : public IGemmCommon
+{
protected:
- const To *_Aptr=nullptr;
- int _lda=0;
- int _A_batch_stride=0;
- int _A_multi_stride=0;
- const To *_Bptr=nullptr;
- int _ldb=0;
- int _B_multi_stride=0;
- Tr *_Cptr=nullptr;
- int _ldc=0;
- int _C_batch_stride=0;
- int _C_multi_stride=0;
- const Tr *_bias=nullptr;
- int _bias_multi_stride=0;
+ const To *_Aptr = nullptr;
+ int _lda = 0;
+ int _A_batch_stride = 0;
+ int _A_multi_stride = 0;
+ const To *_Bptr = nullptr;
+ int _ldb = 0;
+ int _B_multi_stride = 0;
+ Tr *_Cptr = nullptr;
+ int _ldc = 0;
+ int _C_batch_stride = 0;
+ int _C_multi_stride = 0;
+ const Tr *_bias = nullptr;
+ int _bias_multi_stride = 0;
public:
/* Pass in the pointers to the arrays to be operated on and their
* strides (templated version with appropriate types). */
virtual void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
- const To *B, const int ldb, /* batches share B */ const int B_multi_stride,
- Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
- const Tr *bias, /* no row or batch stride needed */ const int bias_multi_stride) {
- _Aptr = A;
- _lda = lda;
- _A_batch_stride = A_batch_stride;
- _A_multi_stride = A_multi_stride;
- _Bptr = B;
- _ldb = ldb;
- _B_multi_stride = B_multi_stride;
- _Cptr = C;
- _ldc = ldc;
- _C_batch_stride = C_batch_stride;
- _C_multi_stride = C_multi_stride;
- _bias = bias;
+ const To *B, const int ldb, /* batches share B */ const int B_multi_stride,
+ Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
+ const Tr *bias, /* no row or batch stride needed */ const int bias_multi_stride)
+ {
+ _Aptr = A;
+ _lda = lda;
+ _A_batch_stride = A_batch_stride;
+ _A_multi_stride = A_multi_stride;
+ _Bptr = B;
+ _ldb = ldb;
+ _B_multi_stride = B_multi_stride;
+ _Cptr = C;
+ _ldc = ldc;
+ _C_batch_stride = C_batch_stride;
+ _C_multi_stride = C_multi_stride;
+ _bias = bias;
_bias_multi_stride = bias_multi_stride;
}
/* Implementation of the void * overload which casts its arguments to the appropriate type. */
void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride,
- const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
- void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
- const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) override {
+ const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
+ void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
+ const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) override
+ {
set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride,
static_cast<const To *>(B), ldb, B_multi_stride,
static_cast<Tr *>(C), ldc, C_batch_stride, C_multi_stride,
@@ -175,27 +193,13 @@ public:
/* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */
/* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */
- virtual void pretranspose_B_array(void *, const To *, const int, const int) { };
+ virtual void pretranspose_B_array(void *, const To *, const int, const int) {};
/* Implementation of the void * overload which casts its arguments to the appropriate type. */
- void pretranspose_B_array_generic(void *out, const void *in, const int row_stride, const int multi_stride) override {
+ void pretranspose_B_array_generic(void *out, const void *in, const int row_stride, const int multi_stride) override
+ {
pretranspose_B_array(out, static_cast<const To *>(in), row_stride, multi_stride);
}
};
-template<typename GemmKernel>
-inline
-int unsigned get_total_window_size(const GemmKernel& kernel)
-{
- auto window=kernel.get_window_size();
-
- unsigned int total = 1;
- for(unsigned i = 0; i != arm_gemm::ndrange_max; ++i)
- {
- total *= window.get_size(i);
- }
-
- return total;
-}
-
} // namespace arm_gemm
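
The interface above is typically driven in a fixed sequence: set_arrays(), optional working space, optional B pretranspose, then execute() over the compute window. The following single-threaded sketch is hedged and not part of the patch; strides, buffer handling and the window construction are illustrative, and ndcoord_t/ndrange_t come from ndrange.hpp in the next hunk.

#include <vector>

template <typename To, typename Tr>
void run_gemm_once(arm_gemm::GemmCommon<To, Tr> &gemm,
                   const To *A, int lda, const To *B, int ldb,
                   Tr *C, int ldc)
{
    // Single batch / single multi, no bias: batch and multi strides are 0.
    gemm.set_arrays(A, lda, 0, 0, B, ldb, 0, C, ldc, 0, 0, nullptr, 0);

    std::vector<char> workspace(gemm.get_working_size());
    if (!workspace.empty()) {
        gemm.set_working_space(workspace.data());
    }

    std::vector<char> pretransposed(gemm.get_B_pretransposed_array_size());
    if (gemm.B_pretranspose_required()) {
        gemm.pretranspose_B_array(pretransposed.data(), B, ldb, 0);
    }

    // Cover the whole compute space in one call: position 0, full size per dimension.
    const auto window = gemm.get_window_size();
    arm_gemm::ndcoord_t work{ { 0, window.get_size(0) }, { 0, window.get_size(1) },
                              { 0, window.get_size(2) }, { 0, window.get_size(3) },
                              { 0, window.get_size(4) }, { 0, window.get_size(5) } };
    arm_gemm::ndcoord_t thread_locator{ { 0, 1 } };
    gemm.execute(work, thread_locator, /* threadid */ 0);
}
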
diff --git a/src/core/NEON/kernels/assembly/ndrange.hpp b/src/core/NEON/kernels/assembly/ndrange.hpp
index d082a3e9b8..86638298ab 100644
--- a/src/core/NEON/kernels/assembly/ndrange.hpp
+++ b/src/core/NEON/kernels/assembly/ndrange.hpp
@@ -23,104 +23,123 @@
*/
#pragma once
-#include <array>
#include <algorithm>
-#include <initializer_list>
-
+#include <array>
#include <cassert>
+#include <initializer_list>
-namespace arm_gemm {
-
-template<unsigned int D>
-class NDRange {
+namespace arm_gemm
+{
+template <unsigned int D>
+class NDRange
+{
private:
- std::array<unsigned int, D> m_sizes {};
- std::array<unsigned int, D> m_totalsizes {};
+ std::array<unsigned int, D> m_sizes{};
+ std::array<unsigned int, D> m_totalsizes{};
- class NDRangeIterator {
+ class NDRangeIterator
+ {
private:
const NDRange &m_parent;
- unsigned int m_pos = 0;
- unsigned int m_end = 0;
+ unsigned int m_pos = 0;
+ unsigned int m_end = 0;
public:
- NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) : m_parent(p), m_pos(s), m_end(e) { }
+ NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e)
+ : m_parent(p), m_pos(s), m_end(e)
+ {
+ }
- bool done() const {
+ bool done() const
+ {
return (m_pos >= m_end);
}
- unsigned int dim(unsigned int d) const {
+ unsigned int dim(unsigned int d) const
+ {
unsigned int r = m_pos;
- if (d < (D - 1)) {
+ if(d < (D - 1))
+ {
r %= m_parent.m_totalsizes[d];
}
- if (d > 0) {
- r /= m_parent.m_totalsizes[d-1];
+ if(d > 0)
+ {
+ r /= m_parent.m_totalsizes[d - 1];
}
return r;
}
- bool next_dim0() {
+ bool next_dim0()
+ {
m_pos++;
return !done();
}
- bool next_dim1() {
+ bool next_dim1()
+ {
m_pos += m_parent.m_sizes[0] - dim(0);
return !done();
}
- unsigned int dim0_max() const {
+ unsigned int dim0_max() const
+ {
unsigned int offset = std::min(m_end - m_pos, m_parent.m_sizes[0] - dim(0));
return dim(0) + offset;
}
};
-public:
- NDRange& operator=(const NDRange& rhs)=default;
- NDRange(const NDRange& rhs) =default;
-
- template <typename... T>
- NDRange(T... ts)
- : m_sizes{ts...}
+ void set_totalsizes()
{
- unsigned int t=1;
+ unsigned int t = 1;
+
+ for(unsigned int i = 0; i < D; i++)
+ {
+ if(m_sizes[i] == 0)
+ {
+ m_sizes[i] = 1;
+ }
- for (unsigned int i=0; i<D; i++) {
t *= m_sizes[i];
m_totalsizes[i] = t;
}
}
- NDRange(const std::array<unsigned int, D>& n)
- : m_sizes(n)
- {
- unsigned int t=1;
+public:
+ NDRange &operator=(const NDRange &rhs) = default;
+ NDRange(const NDRange &rhs) = default;
- for (unsigned int i=0; i<D; i++) {
- t *= m_sizes[i];
+ template <typename... T>
+ NDRange(T... ts)
+ : m_sizes{ ts... }
+ {
+ set_totalsizes();
+ }
- m_totalsizes[i] = t;
- }
+ NDRange(const std::array<unsigned int, D> &n)
+ : m_sizes(n)
+ {
+ set_totalsizes();
}
- NDRangeIterator iterator(unsigned int start, unsigned int end) const {
+ NDRangeIterator iterator(unsigned int start, unsigned int end) const
+ {
return NDRangeIterator(*this, start, end);
}
- unsigned int total_size() const {
+ unsigned int total_size() const
+ {
return m_totalsizes[D - 1];
}
- unsigned int get_size(unsigned int v) const {
+ unsigned int get_size(unsigned int v) const
+ {
return m_sizes[v];
}
};
@@ -128,58 +147,53 @@ public:
/** NDCoordinate builds upon a range, but specifies a starting position
* in addition to a size which it inherits from NDRange
*/
-template<unsigned int N>
-class NDCoordinate : public NDRange<N> {
- using int_t =unsigned int;
+template <unsigned int N>
+class NDCoordinate : public NDRange<N>
+{
+ using int_t = unsigned int;
using ndrange_t = NDRange<N>;
- std::array<int_t, N> m_positions {};
+ std::array<int_t, N> m_positions{};
+
public:
- NDCoordinate& operator=(const NDCoordinate& rhs)=default;
- NDCoordinate(const NDCoordinate& rhs) =default;
- NDCoordinate(const std::initializer_list<std::pair<int_t, int_t>>& list)
+ NDCoordinate &operator=(const NDCoordinate &rhs) = default;
+ NDCoordinate(const NDCoordinate &rhs) = default;
+ NDCoordinate(const std::initializer_list<std::pair<int_t, int_t>> &list)
{
std::array<int_t, N> sizes{};
std::size_t i = 0;
- for(auto& p : list) {
- m_positions[i]= p.first;
- sizes[i++] = p.second;
+ for(auto &p : list)
+ {
+ m_positions[i] = p.first;
+ sizes[i++] = p.second;
}
//update the parent's sizes
- static_cast<ndrange_t&>(*this) = ndrange_t(sizes);
+ static_cast<ndrange_t &>(*this) = ndrange_t(sizes);
}
- int_t get_position(int_t d) const {
- assert(d < m_positions.size());
+ int_t get_position(int_t d) const
+ {
+ assert(d < N);
+
return m_positions[d];
}
- void set_position(int_t d, int_t v) {
- assert(d < size(m_positions));
- assert(v < ndrange_t::get_size(d));
+ void set_position(int_t d, int_t v)
+ {
+ assert(d < N);
m_positions[d] = v;
}
- int_t get_position_end(int_t d) const {
- return get_position(d) + NDRange<N>::get_size(d);
+ int_t get_position_end(int_t d) const
+ {
+ return get_position(d) + ndrange_t::get_size(d);
}
}; //class NDCoordinate
-/** @returns the number of dimensions in the NDRange which have none-1 values
- * IE there is actual work in these dimensions that can be broken up
- */
-template<unsigned int N>
-std::size_t ndrange_popcount(const NDRange<N>& ndr) {
- std::size_t count = 0;
-
- for(unsigned int d = 0; d != N; ++d) {
- if(ndr.get_size(d) != 1)
- ++count;
- }
- return count;
-}
+using ndrange_t = NDRange<6>;
+using ndcoord_t = NDCoordinate<6>;
} // namespace arm_gemm
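
As a small hedged illustration of the reworked helpers (not part of the patch): zero-sized trailing dimensions are now clamped to 1 by set_totalsizes(), total_size() is the running product of the extents, and an NDCoordinate is built from {position, size} pairs. The values below are arbitrary.

#include <cassert>
#include "ndrange.hpp"

void ndrange_example()
{
    // 6-D range; unspecified dimensions default to 0 and are clamped to 1,
    // so total_size() is simply the product of the real extents.
    arm_gemm::ndrange_t range(4u, 3u, 2u);
    assert(range.total_size() == 24);

    // A coordinate carries a start position plus a size per dimension.
    arm_gemm::ndcoord_t window{ { 1u, 4u }, { 0u, 3u } };
    assert(window.get_position(0) == 1);
    assert(window.get_position_end(0) == 5);   // position + size
    assert(window.total_size() == 12);
}

This is also why the NEGEMMAssemblyDispatch hunks below can replace the removed get_total_window_size() helper with a direct get_window_size().total_size() call.
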
diff --git a/src/runtime/CPUUtils.cpp b/src/runtime/CPUUtils.cpp
index d8f01a9066..e874f0f14b 100644
--- a/src/runtime/CPUUtils.cpp
+++ b/src/runtime/CPUUtils.cpp
@@ -73,6 +73,7 @@ bool model_supports_dot(CPUModel model)
{
case CPUModel::GENERIC_FP16_DOT:
case CPUModel::A55r1:
+ case CPUModel::X1:
return true;
default:
return false;
@@ -86,6 +87,7 @@ bool model_supports_fp16(CPUModel model)
case CPUModel::GENERIC_FP16:
case CPUModel::GENERIC_FP16_DOT:
case CPUModel::A55r1:
+ case CPUModel::X1:
return true;
default:
return false;
@@ -121,6 +123,9 @@ CPUModel midr_to_model(const unsigned int midr)
model = CPUModel::A55r0;
}
break;
+ case 0xd44: // X1
+ model = CPUModel::X1;
+ break;
case 0xd0a: // A75
if(variant != 0)
{
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index 1fcac58e10..8a2506f39a 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -284,7 +284,7 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensor *a, c
//if we disable this code below in brackets then ConvLayer deadlocks when threads > 1 and
//the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001
{
- const unsigned int window_size = get_total_window_size(*_gemm_kernel_asm);
+ const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
if(window_size < static_cast<unsigned int>(args._maxthreads))
{
_gemm_kernel_asm->set_nthreads(window_size);
@@ -408,7 +408,7 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run()
if(_workspace.buffer() != nullptr)
{
_gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(_workspace.buffer()));
- const unsigned int window_size = get_total_window_size(*_gemm_kernel_asm);
+ const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
unsigned int num_threads = NEScheduler::get().num_threads();
if(window_size < num_threads)
{