From 5aa1a0b7ca5eed010e4b297a95b1c4851f741328 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Thu, 2 Jul 2020 20:02:20 +0100 Subject: COMPID-3324: Clean GEMM kernels Signed-off-by: Georgios Pinitas Change-Id: I170de1671e061a78740caee31fb4a1b8642c1369 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3505 Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins Reviewed-by: Michele Di Giorgio --- Android.bp | 8 + SConstruct | 2 +- arm_compute/core/CPP/CPPTypes.h | 9 +- src/core/NEON/kernels/arm_gemm/barrier.hpp | 2 +- src/core/NEON/kernels/arm_gemm/bfloat.hpp | 3 +- src/core/NEON/kernels/arm_gemm/bias_adder.hpp | 2 +- src/core/NEON/kernels/arm_gemm/buffer_manager.hpp | 20 +- src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp | 6 +- src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp | 2 +- src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp | 30 +- src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp | 26 +- .../kernels/arm_gemm/gemm_hybrid_quantized.hpp | 23 +- .../NEON/kernels/arm_gemm/gemm_implementation.hpp | 4 +- src/core/NEON/kernels/arm_gemm/gemm_int16.cpp | 2 +- src/core/NEON/kernels/arm_gemm/gemm_int8.cpp | 6 +- .../NEON/kernels/arm_gemm/gemm_interleaved.hpp | 21 +- .../NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp | 8 +- .../arm_gemm/gemm_interleaved_pretransposed_2d.hpp | 9 +- src/core/NEON/kernels/arm_gemm/gemm_native.hpp | 16 +- src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp | 2 +- src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp | 6 +- src/core/NEON/kernels/arm_gemm/gemv_batched.hpp | 10 +- .../kernels/arm_gemm/gemv_native_transposed.hpp | 20 +- .../NEON/kernels/arm_gemm/gemv_pretransposed.hpp | 20 +- .../kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp | 4 +- .../kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp | 3 + .../arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp | 347 ++ .../kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp | 4 +- .../kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp | 4 +- .../kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp | 3 + .../arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp | 348 ++ .../kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp | 6 +- .../kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp | 5 +- .../kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp | 350 ++ .../arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp | 5 +- .../kernels/a64_hybrid_fp32_mla_16x4/a55.cpp | 17 +- .../kernels/a64_hybrid_fp32_mla_16x4/generic.cpp | 17 +- .../kernels/a64_hybrid_fp32_mla_16x4/x1.cpp | 1810 ++++++++++ .../arm_gemm/kernels/a64_hybrid_fp32_mla_4x8.hpp | 6 +- .../kernels/a64_hybrid_fp32_mla_4x8/generic.cpp | 17 +- .../arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp | 2 +- .../kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp | 21 +- .../kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp | 21 +- .../arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp | 2 +- .../kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp | 22 +- .../kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp | 22 +- .../kernels/a64_interleaved_bf16fp32_dot_12x8.hpp | 10 +- .../a64_interleaved_bf16fp32_dot_12x8/generic.cpp | 38 +- .../a64_interleaved_bf16fp32_dot_12x8/x1.cpp | 328 ++ .../kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp | 7 +- .../a64_interleaved_bf16fp32_mmla_12x8/generic.cpp | 53 +- .../kernels/a64_interleaved_s8s32_mmla_12x8.hpp | 6 +- .../a64_interleaved_s8s32_mmla_12x8/generic.cpp | 30 +- .../kernels/a64_interleaved_u8u32_mmla_12x8.hpp | 6 +- .../a64_interleaved_u8u32_mmla_12x8/generic.cpp | 30 +- .../arm_gemm/kernels/a64_native_fp32_mla_16x4.hpp | 8 +- .../kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp | 5 + .../kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp | 354 
++ .../arm_gemm/kernels/a64_sgemv_pretransposed.hpp | 4 +- .../kernels/arm_gemm/kernels/a64_sgemv_trans.hpp | 4 +- .../kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp | 5 +- .../kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp | 7 +- .../kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp | 7 +- .../sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp | 17 +- .../kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp | 7 +- .../sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp | 17 +- .../kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp | 7 +- .../sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp | 17 +- .../kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp | 7 +- .../sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp | 17 +- .../arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp | 7 +- .../kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp | 17 +- .../arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp | 7 +- .../kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp | 17 +- .../kernels/sve_hybrid_fp32_mmla_4VLx4.hpp | 89 + .../kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp | 3459 ++++++++++++++++++++ .../kernels/sve_hybrid_s8s32_dot_4VLx4.hpp | 7 +- .../kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp | 17 +- .../kernels/sve_hybrid_u8u32_dot_4VLx4.hpp | 7 +- .../kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp | 17 +- .../kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp | 7 +- .../sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp | 20 +- .../sve_interleaved_bf16fp32_mmla_3VLx8.hpp | 7 +- .../generic.cpp | 32 +- .../kernels/sve_interleaved_fp16_mla_3VLx8.hpp | 7 +- .../sve_interleaved_fp16_mla_3VLx8/generic.cpp | 33 +- .../kernels/sve_interleaved_fp32_mla_3VLx8.hpp | 7 +- .../sve_interleaved_fp32_mla_3VLx8/generic.cpp | 31 +- .../kernels/sve_interleaved_fp32_mmla_3VLx8.hpp | 72 + .../sve_interleaved_fp32_mmla_3VLx8/generic.cpp | 397 +++ .../kernels/sve_interleaved_s8s32_dot_3VLx8.hpp | 7 +- .../sve_interleaved_s8s32_dot_3VLx8/generic.cpp | 31 +- .../kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp | 6 +- .../sve_interleaved_s8s32_mmla_3VLx8/generic.cpp | 32 +- .../kernels/sve_interleaved_u8u32_dot_3VLx8.hpp | 7 +- .../sve_interleaved_u8u32_dot_3VLx8/generic.cpp | 31 +- .../kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp | 6 +- .../sve_interleaved_u8u32_mmla_3VLx8/generic.cpp | 32 +- .../kernels/sve_native_bf16fp32_dot_4VLx4.hpp | 7 +- .../sve_native_bf16fp32_dot_4VLx4/generic.cpp | 249 +- .../arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp | 10 +- .../kernels/sve_native_fp16_mla_4VLx4/generic.cpp | 295 +- .../arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp | 10 +- .../kernels/sve_native_fp32_mla_4VLx4/generic.cpp | 199 +- .../kernels/sve_native_s8s32_dot_4VLx4.hpp | 7 +- .../kernels/sve_native_s8s32_dot_4VLx4/generic.cpp | 183 +- .../kernels/sve_native_u8u32_dot_4VLx4.hpp | 7 +- .../kernels/sve_native_u8u32_dot_4VLx4/generic.cpp | 183 +- .../kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp | 7 +- .../kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp | 5 +- .../kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp | 5 +- .../arm_gemm/merges/a64_merge_fp16_24x8.hpp | 8 +- .../arm_gemm/merges/a64_merge_fp32_12x8.hpp | 12 +- .../kernels/arm_gemm/merges/a64_merge_s32_12x8.hpp | 12 +- .../kernels/arm_gemm/merges/a64_merge_s32_4x4.hpp | 12 +- .../kernels/arm_gemm/merges/a64_merge_u32_12x8.hpp | 12 +- .../kernels/arm_gemm/merges/a64_merge_u32_4x4.hpp | 12 +- src/core/NEON/kernels/arm_gemm/merges/list.hpp | 2 +- .../arm_gemm/merges/sve_merge_fp16_3VLx8.hpp | 8 +- .../arm_gemm/merges/sve_merge_fp32_3VLx8.hpp | 8 +- .../arm_gemm/merges/sve_merge_s32_3VLx8.hpp | 13 +- .../arm_gemm/merges/sve_merge_u32_3VLx8.hpp | 13 +- 
src/core/NEON/kernels/arm_gemm/misc.cpp | 3 +- .../NEON/kernels/arm_gemm/quantize_wrapper.hpp | 7 +- src/core/NEON/kernels/arm_gemm/quantized.cpp | 10 +- .../NEON/kernels/arm_gemm/std_transforms_fixed.hpp | 8 +- src/core/NEON/kernels/arm_gemm/transform.hpp | 2 +- .../transforms/a32_interleave_6way_32bit.hpp | 3 +- .../transforms/a64_block16_interleave4_8bit.hpp | 12 +- .../transforms/a64_interleave_8way_16bit.hpp | 3 +- .../transforms/a64_interleave_8way_32bit.hpp | 3 +- .../transforms/a64_interleave_8way_block4_8bit.hpp | 5 +- .../a64_interleave_8way_half_to_float.hpp | 3 +- ...64_transpose_interleave_12way_half_to_float.hpp | 2 +- .../a64_transpose_interleave_24way_16bit.hpp | 2 +- src/core/NEON/kernels/arm_gemm/transforms/list.hpp | 3 +- .../transforms/sve_interleave_8way_32bit.hpp | 2 +- .../sve_interleave_8way_block2_32bit.hpp | 552 ++-- .../transforms/sve_interleave_8way_block4_8bit.hpp | 2 +- src/core/NEON/kernels/arm_gemm/utils.hpp | 4 +- src/core/NEON/kernels/assembly/arm_gemm.hpp | 106 +- src/core/NEON/kernels/assembly/gemm_common.hpp | 150 +- src/core/NEON/kernels/assembly/ndrange.hpp | 158 +- src/runtime/CPUUtils.cpp | 5 + .../NEON/functions/NEGEMMAssemblyDispatch.cpp | 4 +- 145 files changed, 9423 insertions(+), 1598 deletions(-) create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp diff --git a/Android.bp b/Android.bp index 0c0f3b22ad..d19db113d5 100644 --- a/Android.bp +++ b/Android.bp @@ -776,21 +776,26 @@ cc_library_static { "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp", 
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp", @@ -799,6 +804,7 @@ cc_library_static { "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6/generic.cpp", @@ -817,12 +823,14 @@ cc_library_static { "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp", diff --git a/SConstruct b/SConstruct index 668e9a73d9..2832b90afe 100644 --- a/SConstruct +++ b/SConstruct @@ -206,7 +206,7 @@ elif 'v8' in env['arch']: env.Append(CXXFLAGS = ['-march=armv8-a']) if 'v8.6-a' in env['arch']: - env.Append(CPPDEFINES = ['V8P6', 'V8P6_BF', 'ARM_COMPUTE_FORCE_BF16']) + env.Append(CPPDEFINES = ['MMLA_INT8', 'MMLA_FP32', 'V8P6', 'V8P6_BF', 'ARM_COMPUTE_FORCE_BF16']) elif 'x86' in env['arch']: if env['estate'] == '32': diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h index d3f6fc944d..59aecd2176 100644 --- a/arm_compute/core/CPP/CPPTypes.h +++ b/arm_compute/core/CPP/CPPTypes.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -44,7 +44,8 @@ enum class CPUModel GENERIC_FP16_DOT, A53, A55r0, - A55r1 + A55r1, + X1 }; /** Global memory policy. 
@@ -94,6 +95,10 @@ inline std::string cpu_model_to_string(CPUModel val) { return std::string("A55r1"); } + case CPUModel::X1: + { + return std::string("X1"); + } default: { ARM_COMPUTE_ERROR("Invalid CPUModel."); diff --git a/src/core/NEON/kernels/arm_gemm/barrier.hpp b/src/core/NEON/kernels/arm_gemm/barrier.hpp index cfd1079f74..8fbcddfef8 100644 --- a/src/core/NEON/kernels/arm_gemm/barrier.hpp +++ b/src/core/NEON/kernels/arm_gemm/barrier.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/NEON/kernels/arm_gemm/bfloat.hpp b/src/core/NEON/kernels/arm_gemm/bfloat.hpp index 547c668157..e585e59890 100644 --- a/src/core/NEON/kernels/arm_gemm/bfloat.hpp +++ b/src/core/NEON/kernels/arm_gemm/bfloat.hpp @@ -29,5 +29,4 @@ namespace arm_gemm { using bfloat16 = arm_compute::bfloat16; -} // namespace arm_gemm - +} // namespace arm_gemm \ No newline at end of file diff --git a/src/core/NEON/kernels/arm_gemm/bias_adder.hpp b/src/core/NEON/kernels/arm_gemm/bias_adder.hpp index 745d00563b..5d363fd68b 100644 --- a/src/core/NEON/kernels/arm_gemm/bias_adder.hpp +++ b/src/core/NEON/kernels/arm_gemm/bias_adder.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp b/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp index 001cab7f09..268b9ba6c7 100644 --- a/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp +++ b/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp @@ -303,32 +303,22 @@ public: BufferManager(BufferManager &) = delete; BufferManager & operator=(BufferManager &) = delete; - BufferManager(const int maxthreads, const size_t buffersize, void *storage) : _storage(storage) { - UNUSED(maxthreads); - UNUSED(buffersize); - } + BufferManager(const int, const size_t, void *storage) : _storage(storage) { } ~BufferManager() { } // Say how much storage is needed. - static inline size_t get_storage_requirement(const int maxthreads, const size_t buffersize) { - UNUSED(maxthreads); + static inline size_t get_storage_requirement(const int, const size_t buffersize) { return buffersize; } template - void try_populate(const int index, T func) { - UNUSED(index); - UNUSED(func); - } + void try_populate(const int, T) { } - void release(const int index) { - UNUSED(index); - } + void release(const int) { } template - void *get(const int index, T func) { - UNUSED(index); + void *get(const int, T func) { func(_storage); return _storage; } diff --git a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp index 10fee472f4..fad0e84bbb 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_gemm.hpp" +#include "bfloat.hpp" #include "gemm_common.hpp" #include "gemm_hybrid.hpp" #include "gemm_implementation.hpp" @@ -43,11 +44,8 @@ #include "kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp" #include "kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp" -#include "bfloat.hpp" - namespace arm_gemm { - static const GemmImplementation gemm_bf16_methods[] = { #ifdef V8P6_BF diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp index b6671e8c85..6867a5f4b9 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp index 8bef2b7bae..1d5b97b41a 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -44,7 +44,9 @@ #include "kernels/a64_sgemv_trans.hpp" #include "kernels/sve_hybrid_fp32_mla_4VLx4.hpp" +#include "kernels/sve_hybrid_fp32_mmla_4VLx4.hpp" #include "kernels/sve_interleaved_fp32_mla_3VLx8.hpp" +#include "kernels/sve_interleaved_fp32_mmla_3VLx8.hpp" #include "kernels/sve_native_fp32_mla_4VLx4.hpp" #include "kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp" @@ -75,6 +77,23 @@ static const GemmImplementation gemm_fp32_methods[] = [](const GemmArgs &args) { return new GemvNativeTransposed(args); } }, +#if defined(__ARM_FEATURE_SVE) && defined(MMLA_FP32) +{ + GemmMethod::GEMM_HYBRID, + "hybrid_fp32_mmla_4VLx4", + [](const GemmArgs &args) { return (args._Ksize >= 4) && !args._trA && args._pretransposed_hint; }, + [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); }, + [](const GemmArgs &args) { return new GemmHybrid(args); } +}, +{ + GemmMethod::GEMM_INTERLEAVED, + "interleaved_fp32_mmla_3VLx8", + [](const GemmArgs &args) { return (args._Ksize>4); }, + nullptr, + [](const GemmArgs &args) { return new GemmInterleaved(args); } +}, +#endif // __ARM_FEATURE_SVE && MMLA_FP32 + #ifdef __ARM_FEATURE_SVE // SVE smallk / native / hybrid methods { @@ -124,7 +143,7 @@ static const GemmImplementation gemm_fp32_methods[] = }, { GemmMethod::GEMM_HYBRID, - "hybrid_fp32_mla_16x4_normal", + "hybrid_fp32_mla_16x4", [](const GemmArgs &args) { return (args._Ksize >= 4) && !args._trA && args._pretransposed_hint; }, [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || (args._Msize < 16) || (args._nmulti > 1); }, [](const GemmArgs &args) { return new GemmHybrid(args); } @@ -146,7 +165,7 @@ static const GemmImplementation gemm_fp32_methods[] = [](const GemmArgs &args) { return new GemmInterleaved(args); } }, #endif // __ARM_FEATURE_SVE -//Pretranpose, 2D split +// Pretranposed, 2D split { GemmMethod::GEMM_INTERLEAVED_2D, "sgemm_12x8_pretranspose_2d", @@ -154,7 +173,7 @@ static const GemmImplementation gemm_fp32_methods[] = [](const GemmArgs &args) { return args._maxthreads >= 8; }, [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d(args); } }, -//Tranpose, 2D split, no blockmanager +// Non-pretransposed, 2D split (no buffer manager) { GemmMethod::GEMM_INTERLEAVED_2D, "sgemm_12x8_2d", @@ -162,7 +181,7 @@ static const GemmImplementation 
gemm_fp32_methods[] = [](const GemmArgs &args) { return (!args._pretransposed_hint) && (args._maxthreads >= 8); }, [](const GemmArgs &args) { return new GemmInterleaved2d(args); } }, -//Tranpose, 1D split, with blockmanager +// 1D split (with pretransposed or not) { GemmMethod::GEMM_INTERLEAVED, "sgemm_12x8_1d", @@ -170,7 +189,6 @@ static const GemmImplementation gemm_fp32_methods[] = nullptr, [](const GemmArgs &args) { return new GemmInterleaved(args); } }, - #endif // __aarch64__ #ifdef __arm__ diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp index 0ce323e09d..2c666b63c2 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,9 +29,8 @@ #include "arm_gemm.hpp" #include "bias_adder.hpp" -#include "utils.hpp" - #include "ndrange.hpp" +#include "utils.hpp" #include "mergeresults.hpp" #include "transform.hpp" @@ -144,7 +143,7 @@ public: // Interface implementation - Compulsory functions ndrange_t get_window_size() const override { - return { _window_range.total_size(), 1u, 1u, 1u, 1u, 1u }; + return { _window_range.total_size() }; } // This kernel can always be dynamically scheduled. @@ -152,8 +151,8 @@ public: return true; } - void execute_1d(unsigned int start, unsigned int end, int threadid) { - UNUSED(threadid); + // Execute + void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override { #ifdef CYCLE_PROFILING profiler prof; #endif @@ -174,7 +173,7 @@ public: const bool first_pass = (k0 == 0); const bool last_pass = (kmax == _Ksize); - auto p = _window_range.iterator(start, end); + auto p = _window_range.iterator(work_range.get_position(0), work_range.get_position_end(0)); if (p.done()) { return; @@ -194,7 +193,7 @@ public: (n0 * kern_k); #ifdef CYCLE_PROFILING - auto p = prof.ScopedProfiler(PROFILE_KERNEL, (m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width())); + auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width())); #endif strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda) + k0, this->_lda, @@ -215,17 +214,6 @@ public: } } - // Execute - void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { - UNUSED(thread_locator); - - const auto start = work_range.get_position(0); - const auto size = work_range.get_size(0); - const auto stop = start + size; - - execute_1d(start, stop, threadid); - } - // Interface implementation - pretransposed bool B_is_pretransposed() const override { return true; diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp index 2b936d0b8f..36545c16ba 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. 
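Editorial sketch (not part of the patch): the entries added to gemm_fp32_methods above pair a GemmMethod and a name with two predicate lambdas (a hard "is supported" check and an optional "is recommended" heuristic, which may be nullptr) plus a factory lambda. The following simplified C++ is an assumed illustration of how such a table is consulted; GemmArgsLite, GemmImplLite and pick_method are hypothetical stand-ins, not the library's types, and the GemmMethod/instantiate fields are omitted for brevity.

// Simplified stand-ins; field names echo the predicates above, not the real layout.
#include <cstddef>

struct GemmArgsLite {
    unsigned int _Msize, _Nsize, _Ksize, _nmulti, _maxthreads;
    bool _trA, _pretransposed_hint;
};

struct GemmImplLite {
    const char *name;
    bool (*is_supported)(const GemmArgsLite &);   // hard requirement, e.g. Ksize >= 4 && !trA
    bool (*is_recommended)(const GemmArgsLite &); // heuristic; nullptr means "always fine"
};

// Hypothetical selection walk: the first supported-and-recommended entry wins,
// otherwise fall back to the first entry that is merely supported.
inline const GemmImplLite *pick_method(const GemmImplLite *table, std::size_t n, const GemmArgsLite &args) {
    const GemmImplLite *fallback = nullptr;
    for (std::size_t i = 0; i < n; i++) {
        if (!table[i].is_supported(args)) {
            continue;
        }
        if (!table[i].is_recommended || table[i].is_recommended(args)) {
            return &table[i];
        }
        if (!fallback) {
            fallback = &table[i];
        }
    }
    return fallback;
}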
* * SPDX-License-Identifier: MIT * @@ -28,9 +28,8 @@ #include #include "arm_gemm.hpp" -#include "utils.hpp" - #include "ndrange.hpp" +#include "utils.hpp" #include "mergeresults.hpp" #include "transform.hpp" @@ -151,7 +150,7 @@ public: // Interface implementation - Compulsory functions ndrange_t get_window_size() const override { - return { _window_range.total_size(), 1u, 1u, 1u, 1u, 1u }; + return { _window_range.total_size() }; } // This kernel can always be dynamically scheduled. @@ -159,7 +158,8 @@ public: return true; } - void execute_1d(unsigned int start, unsigned int end, int threadid) { + // Execute + void execute(const ndcoord_t &work_range, const ndcoord_t &, int threadid) override { #ifdef CYCLE_PROFILING profiler prof; #endif @@ -180,7 +180,7 @@ public: unsigned int kmax = std::min(k0 + _k_block, _Ksize); unsigned int kern_k = roundup(kmax-k0, strategy::k_unroll()); - auto p = _window_range.iterator(start, end); + auto p = _window_range.iterator(work_range.get_position(0), work_range.get_position_end(0)); if (p.done()) { return; @@ -234,17 +234,6 @@ public: } } - // Execute - void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { - UNUSED(thread_locator); - - const auto start = work_range.get_position(0); - const auto size = work_range.get_size(0); - const auto stop = start + size; - - execute_1d(start, stop, threadid); - } - // Working space needed for intermediate result buffers. size_t get_working_size() const override { return (_nthreads * strategy::out_height() * _Nsize * sizeof(Tri)); diff --git a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp index 569d1f44ca..c726d7b0aa 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,7 +22,7 @@ * SOFTWARE. */ -#include +#include "arm_gemm.hpp" #include diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp index e3b4416f68..da682330a0 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp index f7d8f65aea..8dd0df5603 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -47,7 +47,7 @@ namespace arm_gemm { static const GemmImplementation gemm_s8_methods[] = { #ifdef __ARM_FEATURE_SVE -#ifdef V8P6 +#ifdef MMLA_INT8 { GemmMethod::GEMM_INTERLEAVED, "interleaved_s8s32_mmla_3VLx8", @@ -85,7 +85,7 @@ static const GemmImplementation gemm_s8_methods[] = { [](const GemmArgs &args) { return new GemmInterleaved(args); } }, #endif -#ifdef V8P6 +#ifdef MMLA_INT8 { GemmMethod::GEMM_INTERLEAVED, "interleaved_s8s32_mmla_12x8", diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp index 4897bedf47..f572f7940b 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -386,8 +386,8 @@ public: // not multi for now (as this would cause problems with the buffer // manager). ndrange_t get_window_size() const override { - auto m_win_size = (_Mround / strategy::out_height()) * _nbatches; - return { m_win_size, 1u, 1u, 1u, 1u, 1u }; + // _Mround is a multiple of out_height by definition. + return { (_Mround / strategy::out_height()) * _nbatches }; } // set_nthreads: pass on to buffer manager to avoid it waiting for non-existant threads. @@ -399,7 +399,10 @@ public: } // Execute - void execute_1d(unsigned int start, unsigned int end, int threadid) { + void execute(const ndcoord_t &work_range, const ndcoord_t &, int threadid) override { + const auto start = work_range.get_position(0); + const auto end = work_range.get_position_end(0); + if (_pretransposed) { execute_internal(start, end, threadid); } else { @@ -407,16 +410,6 @@ public: } } - //Execute - void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { - UNUSED(thread_locator); - - const auto start = work_range.get_position(0); - const auto stop = work_range.get_position_end(0); - - execute_1d(start, stop, threadid); - } - // Interface implementation - working space size_t get_working_size() const override { // In all cases, we need one A buffer plus a C buffer per thread. diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp index 53f8e6c938..376d19cc65 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp @@ -170,9 +170,7 @@ class GemmInterleaved2d : public GemmCommon { return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height()); } - void execute_transpose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int mthreadid, int nthreadid) { - UNUSED(mthreadid); - + void execute_transpose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int, int nthreadid) { strategy strat(_ci); /* Translate 'start' and 'end' into a position within the batches and rows. */ @@ -382,7 +380,7 @@ public: unsigned m = (_Mround / strategy::out_height()) * _nbatches; unsigned n = _Nround_div; - return { m, n, 1u, 1u, 1u, 1u }; + return { m, n }; } // set_nthreads: pass on to buffer manager to avoid it waiting for non-existant threads. 
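Editorial sketch (not part of the patch): the GemmInterleaved2d / GemmInterleavedPretransposed2d changes nearby report a two-dimensional window ({ m, n }) and unpack it in execute() via get_position()/get_size(). The following is an assumed, simplified illustration of that unpacking; NDCoord2D and walk_2d_window are illustrative stand-ins for ndcoord_t and the real execute body, which additionally clamps the block indices and translates them into batches and rows.

// NDCoord2D: stand-in for ndcoord_t, reduced to the two dimensions used here.
struct NDCoord2D {
    unsigned int pos[2];   // first M-block / N-block assigned to this thread
    unsigned int size[2];  // number of blocks in each dimension
    unsigned int get_position(int d) const { return pos[d]; }
    unsigned int get_size(int d)     const { return size[d]; }
};

// Mirror of the unpacking done at the top of execute(): a rectangle of
// (M-block, N-block) pairs, walked row by row.
inline void walk_2d_window(const NDCoord2D &work_range) {
    const unsigned int m_start = work_range.get_position(0);
    const unsigned int n_start = work_range.get_position(1);
    const unsigned int m_end   = m_start + work_range.get_size(0);
    const unsigned int n_end   = n_start + work_range.get_size(1);

    for (unsigned int m = m_start; m < m_end; m++) {
        for (unsigned int n = n_start; n < n_end; n++) {
            // ... compute the output tile for block (m, n) ...
        }
    }
}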
@@ -395,8 +393,6 @@ public: * This particular GEMM implementation can only be broken up over the M & N * dimensions, we inform the frame work of this limitation via the get_window_size function */ - assert(ndrange_popcount(work_range) <= 2); - const auto m_start = work_range.get_position(0); const auto n_start = work_range.get_position(1); const auto m_size = work_range.get_size(0); diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp index eff4877198..38fb26370c 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp @@ -173,16 +173,13 @@ class GemmInterleavedPretransposed2d : public GemmCommon { // Internal execute function. // This supports both the "pretransposed" and "standard" interfaces via the template parameter. - void execute_pretranspose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int mthreadid, int nthreadid) { + void execute_pretranspose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int, int) { /* Make sure we've been set up correctly. */ assert(_B_transposed); assert(_working_space); assert(this->_Aptr); assert(this->_Cptr); - UNUSED(mthreadid); - UNUSED(nthreadid); - #ifdef CYCLE_PROFILING profiler prof; #endif @@ -389,7 +386,7 @@ public: unsigned m = (_Mround / strategy::out_height()) * _nbatches; unsigned n = _Nround_div; - return { m, n, 1u, 1u, 1u, 1u }; + return { m, n }; } // set_nthreads: pass on to buffer manager to avoid it waiting for non-existant threads. @@ -401,8 +398,6 @@ public: /* This particular GEMM implementation can only be broken up over the M & N * dimensions, we inform the frame work of this limitation via the get_window_size function */ - assert(ndrange_popcount(work_range) <= 2); - const auto m_start = work_range.get_position(0); const auto n_start = work_range.get_position(1); const auto m_size = work_range.get_size(0); diff --git a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp index c2f742b5cf..cddbd51e32 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp @@ -88,7 +88,7 @@ public: // Window is amount per multi multiplied by total number of multis. ndrange_t get_window_size() const override { - return { _window_range.total_size(), 1u, 1u, 1u, 1u, 1u }; + return { _window_range.total_size() }; } // Native GEMMs can always be dynamically scheduled (whether requested or not) @@ -97,7 +97,7 @@ public: } // Actually execute the GEMM. 
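Editorial sketch (not part of the patch): the recurring refactor in gemm_hybrid, gemm_native and the gemv kernels removes the separate execute_1d(start, end, threadid) helper and its thin execute() wrapper; execute() now pulls its one-dimensional bounds straight from the work range, and unused parameters are left unnamed instead of going through UNUSED(). A minimal sketch of the resulting shape, assuming a Range1D stand-in for ndcoord_t:

// Range1D: stand-in for ndcoord_t with only dimension 0 populated.
struct Range1D {
    unsigned int start, size;
    unsigned int get_position(int) const { return start; }
    unsigned int get_position_end(int) const { return start + size; }
};

struct KernelLite {
    // After the refactor there is no execute_1d; the bounds come from work_range.
    void execute(const Range1D &work_range, const Range1D & /*thread_locator*/, int /*threadid*/) {
        const unsigned int start = work_range.get_position(0);
        const unsigned int end   = work_range.get_position_end(0);
        for (unsigned int i = start; i < end; i++) {
            // ... process window element i, exactly as execute_1d used to ...
        }
    }
};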
- void execute_1d(unsigned int start, unsigned int end, int) { + void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override { #ifdef CYCLE_PROFILING profiler prof; #endif @@ -106,7 +106,7 @@ public: static_assert(std::is_same::value, "gemm_native: Operand types must be the same."); static_assert(std::is_same::value, "gemm_native: Result types must be the same."); - auto p = _window_range.iterator(start, end); + auto p = _window_range.iterator(work_range.get_position(0), work_range.get_position_end(0)); if (p.done()) { return; @@ -139,16 +139,6 @@ public: } } while (p.next_dim1()); } - - //Execute - void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { - UNUSED(thread_locator); - - const auto start = work_range.get_position(0); - const auto stop = work_range.get_position_end(0); - - execute_1d(start, stop, threadid); - } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp index 85a8a6720a..5e06443e19 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp index 430d35e06d..d74f335e38 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -47,7 +47,7 @@ namespace arm_gemm { static const GemmImplementation gemm_u8_methods[] = { #ifdef __ARM_FEATURE_SVE -#ifdef V8P6 +#ifdef MMLA_INT8 { GemmMethod::GEMM_INTERLEAVED, "interleaved_u8u32_mmla_3VLx8", @@ -85,7 +85,7 @@ static const GemmImplementation gemm_u8_methods[] = { [](const GemmArgs &args) { return new GemmInterleaved(args); } }, #endif -#ifdef V8P6 +#ifdef MMLA_INT8 { GemmMethod::GEMM_INTERLEAVED, "interleaved_u8u32_mmla_12x8", diff --git a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp index 939788ed8d..12216009d2 100644 --- a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -45,17 +45,15 @@ public: _subgemm = gemm(newargs); } - void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride, + void set_arrays(const To *A, const int, const int A_batch_stride, const int A_multi_stride, const To *B, const int ldb, const int B_multi_stride, - Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride, + Tr *C, const int, const int C_batch_stride, const int C_multi_stride, const Tr *bias, const int bias_multi_stride) override { /* A and C's batch stride becomes their new row stride. New batch stride is 0 as nbatches for subgemm is always 1. 
*/ _subgemm->set_arrays(A, A_batch_stride, 0, A_multi_stride, B, ldb, B_multi_stride, C, C_batch_stride, 0, C_multi_stride, bias, bias_multi_stride); - UNUSED(lda); - UNUSED(ldc); } ndrange_t get_window_size() const override { @@ -66,7 +64,7 @@ public: _subgemm->set_nthreads(nthreads); } - void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { + void execute(const ndcoord_t &work_range, const ndcoord_t &thread_locator, int threadid) override { _subgemm->execute(work_range, thread_locator, threadid); } diff --git a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp index 190f4aa643..9209d48bd9 100644 --- a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -73,16 +73,19 @@ public: // Window is number of out_width blocks times number of multis. ndrange_t get_window_size() const override { - return { iceildiv(_Nsize, strategy::out_width()) * _nmultis, 1u, 1u, 1u, 1u, 1u }; + return { iceildiv(_Nsize, strategy::out_width()) * _nmultis }; } // Actually execute the GEMV. - void execute_1d(unsigned int start, unsigned int end, int) { + void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override { #ifdef CYCLE_PROFILING profiler prof; #endif strategy strat(_ci); + const auto start = work_range.get_position(0); + const auto end = work_range.get_position_end(0); + const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width()); const unsigned int multi_0 = start / window_per_multi; const unsigned int multi_end = end / window_per_multi; @@ -127,17 +130,6 @@ public: } } } - - // Execute - void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { - UNUSED(thread_locator); - - const auto start = work_range.get_position(0); - const auto size = work_range.get_size(0); - const auto stop = start + size; - - execute_1d(start, stop, threadid); - } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp index 7f52ac5a14..945e363839 100644 --- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -87,16 +87,19 @@ public: // Window is number of out_width blocks, times number of multis. ndrange_t get_window_size() const override { - return { iceildiv(_Nsize, strategy::out_width()) * _nmultis, 1u, 1u, 1u, 1u, 1u }; + return { iceildiv(_Nsize, strategy::out_width()) * _nmultis }; } // Actually execute the GEMV. - void execute_1d(unsigned int start, unsigned int end, int) { + void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override { #ifdef CYCLE_PROFILING profiler prof; #endif strategy strat(_ci); + const auto start = work_range.get_position(0); + const auto end = work_range.get_position_end(0); + /* Break the window values down into multis of interest... 
*/ const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width()); const unsigned int multi_0 = start / window_per_multi; @@ -145,17 +148,6 @@ public: } } - // Execute - void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { - UNUSED(thread_locator); - - const auto start = work_range.get_position(0); - const auto size = work_range.get_size(0); - const auto stop = start + size; - - execute_1d(start, stop, threadid); - } - /* Pretransposed interface implementation */ bool B_is_pretransposed() const override { return true; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp index 8700c42f5d..0f0e5a7ed4 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -65,7 +65,7 @@ public: kern_type kernel = a64_gemm_s16_asimd_12x8; - gemm_s16_12x8(const CPUInfo *ci) { UNUSED(ci); } + gemm_s16_12x8(const CPUInfo *) { } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp index cc6c583b33..e5b295b640 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp @@ -34,6 +34,7 @@ namespace arm_gemm { // Load the actual kernel void a64_gemm_s8_12x8(const int8_t *, const int8_t *, int32_t *, int, int, int); void a64_gemm_s8_12x8_a55r1(const int8_t *, const int8_t *, int32_t *, int, int, int); +void a64_gemm_s8_12x8_x1(const int8_t *, const int8_t *, int32_t *, int, int, int); class gemm_s8_12x8 { public: @@ -65,6 +66,8 @@ public: if (mod == CPUModel::A55r1) { kernel = a64_gemm_s8_12x8_a55r1; + } else if (mod == CPUModel::X1) { + kernel = a64_gemm_s8_12x8_x1; } } }; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp new file mode 100644 index 0000000000..446fcf8707 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp @@ -0,0 +1,347 @@ +/* + * Copyright (c) 2017-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
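Editorial sketch (not part of the patch): the gemm_s8_12x8 strategy above now swaps in a Cortex-X1 tuned kernel at construction time, alongside the existing A55r1 variant. The following simplified C++ is an assumed illustration of that dispatch pattern; CPUModelLite, gemm_s8_12x8_lite and the kern_* functions are hypothetical stand-ins, while the real code keys off CPUInfo::get_cpu_model() and the assembly routines declared in the header.

#include <cstdint>

enum class CPUModelLite { GENERIC, A55r1, X1 };

using kern_type = void (*)(const int8_t *, const int8_t *, int32_t *, int, int, int);

// Empty illustrative bodies; the real variants are hand-written assembly.
inline void kern_generic(const int8_t *, const int8_t *, int32_t *, int, int, int) {}
inline void kern_a55r1  (const int8_t *, const int8_t *, int32_t *, int, int, int) {}
inline void kern_x1     (const int8_t *, const int8_t *, int32_t *, int, int, int) {}

struct gemm_s8_12x8_lite {
    kern_type kernel = kern_generic;       // default tuning

    explicit gemm_s8_12x8_lite(CPUModelLite mod) {
        if (mod == CPUModelLite::A55r1) {
            kernel = kern_a55r1;           // in-order Cortex-A55r1 variant
        } else if (mod == CPUModelLite::X1) {
            kernel = kern_x1;              // Cortex-X1 variant added by this patch
        }
    }
};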
+ */ +#ifdef __aarch64__ + +#include + +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void a64_gemm_s8_12x8_x1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { + const int8_t *a_ptr = Apanel; + int32_t *c_ptr = Cpanel; + // We divide K by 4 because the sdot instruction processes 4 elements at a time. + const int W = K/4; + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. + const int oddk = (W & 1); + const int init_value_k = ((W+1)/2) - 1; + for (int yb=0; yb + +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void a64_gemm_u8_12x8_x1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { + const uint8_t *a_ptr = Apanel; + uint32_t *c_ptr = Cpanel; + // We divide K by 4 because the udot instruction processes 4 elements at a time. + const int W = K/4; + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. + const int oddk = (W & 1); + const int init_value_k = ((W+1)/2) - 1; + for (int yb=0; yb + +#include "../../asmlib.hpp" + +// Kernel implementation. +// +// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. +// Assume that "Bpanel" points to a chunk of B blocks (each size 24xK) in read-order. +// Assume that "Cpanel" points to a chunk of C output blocks (each size +// 24x8), the chunks being arranged in a row major fashion. +// +// Note that the intent of this is that either ablocks or bblocks will be 1 +// - this construction allows the output loop to proceed in either order. + +namespace arm_gemm { + +void a64_hgemm_asimd_24x8_x1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) { + const __fp16 *a_ptr = Apanel; + __fp16 *c_ptr = Cpanel; + + for (int yb=0; ybget_cpu_model() == CPUModel::A55r1) { kernel = a64_hybrid_fp32_mla_16x4_a55; + } else if (ci->get_cpu_model() == CPUModel::X1) { + kernel = a64_hybrid_fp32_mla_16x4_x1; } } }; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp index 5bce632bc4..1b828ee503 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
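Editorial worked example (not part of the patch): the prologue of the new a64_gemm_s8_12x8_x1 / a64_gemm_u8_12x8_x1 kernels above divides K by 4 because each sdot/udot consumes four 8-bit values per lane, then sizes a main loop that handles two such blocks per iteration, rounding the count up when the number of blocks is odd. A short sketch of that arithmetic, under the assumed reading that the tail outside the counted loop handles the leftover one (odd W) or two (even W) blocks:

// For K = 20:
//   W            = K / 4             = 5   // 4-element dot-product blocks
//   oddk         = W & 1             = 1   // one block left over after pairing
//   init_value_k = ((W + 1) / 2) - 1 = 2   // counted main-loop iterations
// The counted loop covers 2 * init_value_k = 4 blocks (16 values); the tail
// covers the remaining block (4 values), so all 20 values of K are consumed.
static inline void dot_loop_counts(int K, int &W, int &oddk, int &init_value_k) {
    W            = K / 4;
    oddk         = W & 1;
    init_value_k = ((W + 1) / 2) - 1;
}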
* * SPDX-License-Identifier: MIT * @@ -61,12 +61,23 @@ void a64_hybrid_fp32_mla_16x4_a55(const float *A, int lda, const float *B, float break; } - for (int y=0; y 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0 + +#include "arm_gemm.hpp" + +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void a64_hybrid_fp32_mla_16x4_x1(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool append) { + const int K_stride = K; + const long loops_count = ((K + 4) / 8) - 1; + K -= loops_count * 8; + const long regs_count = (K / 4) - 1; + K -= (regs_count + 1) * 4; + const long blocks_count = K / 1; + float nullbias[16]; + if (!append && !bias) { + memset(nullbias, 0, (16 * sizeof(float))); + } + float minval = - static_cast(std::numeric_limits::infinity()); + float maxval = static_cast(std::numeric_limits::infinity()); + const float * const minptr = &minval; + const float * const maxptr = &maxval; + + switch(act.type) + { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + maxval = static_cast(act.param1); + /* fall through */ + case Activation::Type::ReLU: + minval = 0.0f; + break; + } + + int rows_to_compute; + + for (int y=0; y 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + + for (int x0=0; x0(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory" + ); + break; + case 2: + __asm __volatile ( + "a_ptr1 .req X0\n" + "c_ptr1 .req X1\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "cbnz %[append], 1f\n" + "ldr q16, [%[biasptr]]\n" + "ldr q17, [%[biasptr], #0x10]\n" + "ldr q18, [%[biasptr], #0x20]\n" + "ldr q19, [%[biasptr], #0x30]\n" + "mov v20.16b, v16.16b\n" + "ldr q0, [%[a_ptr0]]\n" + "mov v21.16b, v17.16b\n" + "ldr q1, [a_ptr1]\n" + "mov v22.16b, v18.16b\n" + "ldr q8, [%[b_ptr0]]\n" + "mov v23.16b, v19.16b\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "add a_ptr1, a_ptr1, #0x10\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ldr q16, [%[c_ptr0]]\n" + "ldr q17, [%[c_ptr0], #0x10]\n" + "ldr q18, [%[c_ptr0], #0x20]\n" + "ldr q19, [%[c_ptr0], #0x30]\n" + "ldr q20, [c_ptr1]\n" + "ldr q21, [c_ptr1, #0x10]\n" + "ldr q22, [c_ptr1, #0x20]\n" + "ldr q23, [c_ptr1, #0x30]\n" + "ldr q0, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr q1, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "ldr q8, [%[b_ptr0]]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" + "cbz %[loops], 2f\n" + "3:\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "ldr q11, [%[b_ptr0], #-0x10]\n" + "fmla v20.4s, v8.4s, v1.s[0]\n" + "ldr q4, [%[a_ptr0]]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "ldr q5, [a_ptr1]\n" + "fmla v21.4s, v9.4s, v1.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v18.4s, v10.4s, v0.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v22.4s, v10.4s, v1.s[0]\n" + "ldr q10, 
[%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v0.s[0]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla v23.4s, v11.4s, v1.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v0.s[1]\n" + "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" + "fmla v20.4s, v8.4s, v1.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v0.s[1]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "fmla v21.4s, v9.4s, v1.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v0.s[1]\n" + "add a_ptr1, a_ptr1, #0x20\n" + "fmla v22.4s, v10.4s, v1.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v0.s[1]\n" + "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" + "fmla v23.4s, v11.4s, v1.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v0.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v1.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v0.s[2]\n" + "fmla v21.4s, v9.4s, v1.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v0.s[2]\n" + "fmla v22.4s, v10.4s, v1.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v0.s[2]\n" + "fmla v23.4s, v11.4s, v1.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v0.s[3]\n" + "fmla v20.4s, v8.4s, v1.s[3]\n" + "ldr q8, [%[b_ptr0], #-0x40]\n" + "fmla v17.4s, v9.4s, v0.s[3]\n" + "fmla v21.4s, v9.4s, v1.s[3]\n" + "ldr q9, [%[b_ptr0], #-0x30]\n" + "fmla v18.4s, v10.4s, v0.s[3]\n" + "fmla v22.4s, v10.4s, v1.s[3]\n" + "ldr q10, [%[b_ptr0], #-0x20]\n" + "fmla v19.4s, v11.4s, v0.s[3]\n" + "ldr q0, [%[a_ptr0], #-0x10]\n" + "fmla v23.4s, v11.4s, v1.s[3]\n" + "ldr q11, [%[b_ptr0], #-0x10]\n" + "fmla v16.4s, v8.4s, v4.s[0]\n" + "ldr q1, [a_ptr1, #-0x10]\n" + "fmla v20.4s, v8.4s, v5.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v17.4s, v9.4s, v4.s[0]\n" + "fmla v21.4s, v9.4s, v5.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v4.s[0]\n" + "fmla v22.4s, v10.4s, v5.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v4.s[0]\n" + "fmla v23.4s, v11.4s, v5.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v4.s[1]\n" + "fmla v20.4s, v8.4s, v5.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v4.s[1]\n" + "fmla v21.4s, v9.4s, v5.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v4.s[1]\n" + "fmla v22.4s, v10.4s, v5.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v4.s[1]\n" + "fmla v23.4s, v11.4s, v5.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v4.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v5.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v4.s[2]\n" + "fmla v21.4s, v9.4s, v5.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v4.s[2]\n" + "fmla v22.4s, v10.4s, v5.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v4.s[2]\n" + "fmla v23.4s, v11.4s, v5.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v4.s[3]\n" + "fmla v20.4s, v8.4s, v5.s[3]\n" + "ldr q8, [%[b_ptr0], #-0x40]\n" + "fmla v17.4s, v9.4s, v4.s[3]\n" + "fmla v21.4s, v9.4s, v5.s[3]\n" + "ldr q9, [%[b_ptr0], #-0x30]\n" + "fmla v18.4s, v10.4s, v4.s[3]\n" + "fmla v22.4s, v10.4s, v5.s[3]\n" + "ldr q10, [%[b_ptr0], #-0x20]\n" + "fmla v19.4s, v11.4s, v4.s[3]\n" + "fmla v23.4s, v11.4s, v5.s[3]\n" + "b.ne 3b\n" + "2:\n" + "ldr q11, [%[b_ptr0], #-0x10]\n" + "prfm PSTL1KEEP, [%[c_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr1]\n" + "cbz %[regs], 4f\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "ldr q4, [%[a_ptr0]]\n" + "fmla v20.4s, v8.4s, v1.s[0]\n" + "ldr q5, 
[a_ptr1]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v21.4s, v9.4s, v1.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v0.s[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "fmla v22.4s, v10.4s, v1.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v0.s[0]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "fmla v23.4s, v11.4s, v1.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v0.s[1]\n" + "fmla v20.4s, v8.4s, v1.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v0.s[1]\n" + "fmla v21.4s, v9.4s, v1.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v0.s[1]\n" + "fmla v22.4s, v10.4s, v1.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v0.s[1]\n" + "fmla v23.4s, v11.4s, v1.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v0.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v1.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v0.s[2]\n" + "fmla v21.4s, v9.4s, v1.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v0.s[2]\n" + "fmla v22.4s, v10.4s, v1.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v0.s[2]\n" + "fmla v23.4s, v11.4s, v1.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v0.s[3]\n" + "fmla v20.4s, v8.4s, v1.s[3]\n" + "ldr q8, [%[b_ptr0], #-0x40]\n" + "fmla v17.4s, v9.4s, v0.s[3]\n" + "fmla v21.4s, v9.4s, v1.s[3]\n" + "ldr q9, [%[b_ptr0], #-0x30]\n" + "fmla v18.4s, v10.4s, v0.s[3]\n" + "fmla v22.4s, v10.4s, v1.s[3]\n" + "ldr q10, [%[b_ptr0], #-0x20]\n" + "fmla v19.4s, v11.4s, v0.s[3]\n" + "fmla v23.4s, v11.4s, v1.s[3]\n" + "ldr q11, [%[b_ptr0], #-0x10]\n" + "fmla v16.4s, v8.4s, v4.s[0]\n" + "fmla v20.4s, v8.4s, v5.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v17.4s, v9.4s, v4.s[0]\n" + "fmla v21.4s, v9.4s, v5.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v4.s[0]\n" + "fmla v22.4s, v10.4s, v5.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v4.s[0]\n" + "fmla v23.4s, v11.4s, v5.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v4.s[1]\n" + "fmla v20.4s, v8.4s, v5.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v4.s[1]\n" + "fmla v21.4s, v9.4s, v5.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v4.s[1]\n" + "fmla v22.4s, v10.4s, v5.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v4.s[1]\n" + "fmla v23.4s, v11.4s, v5.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v4.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v5.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v4.s[2]\n" + "fmla v21.4s, v9.4s, v5.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v4.s[2]\n" + "fmla v22.4s, v10.4s, v5.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v4.s[2]\n" + "fmla v23.4s, v11.4s, v5.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v4.s[3]\n" + "add %[b_ptr0], %[b_ptr0], #-0x40\n" + "fmla v20.4s, v8.4s, v5.s[3]\n" + "fmla v17.4s, v9.4s, v4.s[3]\n" + "fmla v21.4s, v9.4s, v5.s[3]\n" + "fmla v18.4s, v10.4s, v4.s[3]\n" + "fmla v22.4s, v10.4s, v5.s[3]\n" + "fmla v19.4s, v11.4s, v4.s[3]\n" + "fmla v23.4s, v11.4s, v5.s[3]\n" + "b 5f\n" + "4:\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "fmla v20.4s, v8.4s, v1.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "fmla v21.4s, v9.4s, v1.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v0.s[0]\n" + "fmla v22.4s, v10.4s, 
v1.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v0.s[0]\n" + "fmla v23.4s, v11.4s, v1.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v0.s[1]\n" + "fmla v20.4s, v8.4s, v1.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v0.s[1]\n" + "fmla v21.4s, v9.4s, v1.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v0.s[1]\n" + "fmla v22.4s, v10.4s, v1.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v0.s[1]\n" + "fmla v23.4s, v11.4s, v1.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v0.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v1.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v0.s[2]\n" + "fmla v21.4s, v9.4s, v1.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v0.s[2]\n" + "fmla v22.4s, v10.4s, v1.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v0.s[2]\n" + "fmla v23.4s, v11.4s, v1.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v0.s[3]\n" + "add %[b_ptr0], %[b_ptr0], #-0x40\n" + "fmla v20.4s, v8.4s, v1.s[3]\n" + "fmla v17.4s, v9.4s, v0.s[3]\n" + "fmla v21.4s, v9.4s, v1.s[3]\n" + "fmla v18.4s, v10.4s, v0.s[3]\n" + "fmla v22.4s, v10.4s, v1.s[3]\n" + "fmla v19.4s, v11.4s, v0.s[3]\n" + "fmla v23.4s, v11.4s, v1.s[3]\n" + "5:\n" + "cbz %[blocks], 6f\n" + "7:\n" + "ldr q8, [%[b_ptr0]]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "ldr s0, [%[a_ptr0]]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], #0x4\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "ldr s1, [a_ptr1]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "add a_ptr1, a_ptr1, #0x4\n" + "fmla v18.4s, v10.4s, v0.s[0]\n" + "fmla v20.4s, v8.4s, v1.s[0]\n" + "fmla v21.4s, v9.4s, v1.s[0]\n" + "fmla v22.4s, v10.4s, v1.s[0]\n" + "fmla v19.4s, v11.4s, v0.s[0]\n" + "fmla v23.4s, v11.4s, v1.s[0]\n" + "b.ne 7b\n" + "6:\n" + "ld1r {v14.4s}, [%[minptr]]\n" + "ld1r {v15.4s}, [%[maxptr]]\n" + "fmax v16.4s, v16.4s, v14.4s\n" + "fmax v17.4s, v17.4s, v14.4s\n" + "fmax v18.4s, v18.4s, v14.4s\n" + "fmax v19.4s, v19.4s, v14.4s\n" + "fmin v16.4s, v16.4s, v15.4s\n" + "fmin v17.4s, v17.4s, v15.4s\n" + "fmin v18.4s, v18.4s, v15.4s\n" + "fmin v19.4s, v19.4s, v15.4s\n" + "str q16, [%[c_ptr0]]\n" + "fmax v20.4s, v20.4s, v14.4s\n" + "fmax v21.4s, v21.4s, v14.4s\n" + "fmax v22.4s, v22.4s, v14.4s\n" + "str q17, [%[c_ptr0], #0x10]\n" + "fmax v23.4s, v23.4s, v14.4s\n" + "fmin v20.4s, v20.4s, v15.4s\n" + "fmin v21.4s, v21.4s, v15.4s\n" + "str q18, [%[c_ptr0], #0x20]\n" + "fmin v22.4s, v22.4s, v15.4s\n" + "fmin v23.4s, v23.4s, v15.4s\n" + "str q19, [%[c_ptr0], #0x30]\n" + "add %[c_ptr0], %[c_ptr0], #0x40\n" + "str q20, [c_ptr1]\n" + "str q21, [c_ptr1, #0x10]\n" + "str q22, [c_ptr1, #0x20]\n" + "str q23, [c_ptr1, #0x30]\n" + ".unreq a_ptr1\n" + ".unreq c_ptr1\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) + : [width] "r" (width), [append] "r" (static_cast(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory" + ); + break; + case 3: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 
.req X1\n" + "c_ptr1 .req X2\n" + "c_ptr2 .req X3\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "cbnz %[append], 1f\n" + "ldr q16, [%[biasptr]]\n" + "ldr q17, [%[biasptr], #0x10]\n" + "ldr q18, [%[biasptr], #0x20]\n" + "ldr q19, [%[biasptr], #0x30]\n" + "mov v20.16b, v16.16b\n" + "ldr q0, [%[a_ptr0]]\n" + "mov v21.16b, v17.16b\n" + "ldr q1, [a_ptr1]\n" + "mov v22.16b, v18.16b\n" + "ldr q2, [a_ptr2]\n" + "mov v23.16b, v19.16b\n" + "ldr q8, [%[b_ptr0]]\n" + "mov v24.16b, v16.16b\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "mov v25.16b, v17.16b\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "mov v26.16b, v18.16b\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "mov v27.16b, v19.16b\n" + "add a_ptr1, a_ptr1, #0x10\n" + "add a_ptr2, a_ptr2, #0x10\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ldr q16, [%[c_ptr0]]\n" + "ldr q17, [%[c_ptr0], #0x10]\n" + "ldr q18, [%[c_ptr0], #0x20]\n" + "ldr q19, [%[c_ptr0], #0x30]\n" + "ldr q20, [c_ptr1]\n" + "ldr q21, [c_ptr1, #0x10]\n" + "ldr q22, [c_ptr1, #0x20]\n" + "ldr q23, [c_ptr1, #0x30]\n" + "ldr q24, [c_ptr2]\n" + "ldr q25, [c_ptr2, #0x10]\n" + "ldr q26, [c_ptr2, #0x20]\n" + "ldr q27, [c_ptr2, #0x30]\n" + "ldr q0, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr q1, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "ldr q2, [a_ptr2]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "ldr q8, [%[b_ptr0]]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" + "cbz %[loops], 2f\n" + "3:\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "ldr q11, [%[b_ptr0], #-0x10]\n" + "fmla v20.4s, v8.4s, v1.s[0]\n" + "ldr q4, [%[a_ptr0]]\n" + "fmla v24.4s, v8.4s, v2.s[0]\n" + "ldr q5, [a_ptr1]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "ldr q6, [a_ptr2]\n" + "fmla v21.4s, v9.4s, v1.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v25.4s, v9.4s, v2.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v0.s[0]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla v22.4s, v10.4s, v1.s[0]\n" + "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" + "fmla v26.4s, v10.4s, v2.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v0.s[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "fmla v23.4s, v11.4s, v1.s[0]\n" + "add a_ptr1, a_ptr1, #0x20\n" + "fmla v27.4s, v11.4s, v2.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v0.s[1]\n" + "add a_ptr2, a_ptr2, #0x20\n" + "fmla v20.4s, v8.4s, v1.s[1]\n" + "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" + "fmla v24.4s, v8.4s, v2.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v0.s[1]\n" + "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" + "fmla v21.4s, v9.4s, v1.s[1]\n" + "fmla v25.4s, v9.4s, v2.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v0.s[1]\n" + "fmla v22.4s, v10.4s, v1.s[1]\n" + "fmla v26.4s, v10.4s, v2.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v0.s[1]\n" + "fmla v23.4s, v11.4s, v1.s[1]\n" + "fmla v27.4s, v11.4s, v2.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v0.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v1.s[2]\n" + "fmla v24.4s, v8.4s, v2.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v0.s[2]\n" + "fmla v21.4s, v9.4s, v1.s[2]\n" + "fmla v25.4s, v9.4s, v2.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v0.s[2]\n" + "fmla v22.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v0.s[2]\n" + 
"fmla v23.4s, v11.4s, v1.s[2]\n" + "fmla v27.4s, v11.4s, v2.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v0.s[3]\n" + "fmla v20.4s, v8.4s, v1.s[3]\n" + "fmla v24.4s, v8.4s, v2.s[3]\n" + "ldr q8, [%[b_ptr0], #-0x40]\n" + "fmla v17.4s, v9.4s, v0.s[3]\n" + "fmla v21.4s, v9.4s, v1.s[3]\n" + "fmla v25.4s, v9.4s, v2.s[3]\n" + "ldr q9, [%[b_ptr0], #-0x30]\n" + "fmla v18.4s, v10.4s, v0.s[3]\n" + "fmla v22.4s, v10.4s, v1.s[3]\n" + "fmla v26.4s, v10.4s, v2.s[3]\n" + "ldr q10, [%[b_ptr0], #-0x20]\n" + "fmla v19.4s, v11.4s, v0.s[3]\n" + "ldr q0, [%[a_ptr0], #-0x10]\n" + "fmla v23.4s, v11.4s, v1.s[3]\n" + "ldr q1, [a_ptr1, #-0x10]\n" + "fmla v27.4s, v11.4s, v2.s[3]\n" + "ldr q11, [%[b_ptr0], #-0x10]\n" + "fmla v16.4s, v8.4s, v4.s[0]\n" + "ldr q2, [a_ptr2, #-0x10]\n" + "fmla v20.4s, v8.4s, v5.s[0]\n" + "fmla v24.4s, v8.4s, v6.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v17.4s, v9.4s, v4.s[0]\n" + "fmla v21.4s, v9.4s, v5.s[0]\n" + "fmla v25.4s, v9.4s, v6.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v4.s[0]\n" + "fmla v22.4s, v10.4s, v5.s[0]\n" + "fmla v26.4s, v10.4s, v6.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v4.s[0]\n" + "fmla v23.4s, v11.4s, v5.s[0]\n" + "fmla v27.4s, v11.4s, v6.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v4.s[1]\n" + "fmla v20.4s, v8.4s, v5.s[1]\n" + "fmla v24.4s, v8.4s, v6.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v4.s[1]\n" + "fmla v21.4s, v9.4s, v5.s[1]\n" + "fmla v25.4s, v9.4s, v6.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v4.s[1]\n" + "fmla v22.4s, v10.4s, v5.s[1]\n" + "fmla v26.4s, v10.4s, v6.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v4.s[1]\n" + "fmla v23.4s, v11.4s, v5.s[1]\n" + "fmla v27.4s, v11.4s, v6.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v4.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v5.s[2]\n" + "fmla v24.4s, v8.4s, v6.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v4.s[2]\n" + "fmla v21.4s, v9.4s, v5.s[2]\n" + "fmla v25.4s, v9.4s, v6.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v4.s[2]\n" + "fmla v22.4s, v10.4s, v5.s[2]\n" + "fmla v26.4s, v10.4s, v6.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v4.s[2]\n" + "fmla v23.4s, v11.4s, v5.s[2]\n" + "fmla v27.4s, v11.4s, v6.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v4.s[3]\n" + "fmla v20.4s, v8.4s, v5.s[3]\n" + "fmla v24.4s, v8.4s, v6.s[3]\n" + "ldr q8, [%[b_ptr0], #-0x40]\n" + "fmla v17.4s, v9.4s, v4.s[3]\n" + "fmla v21.4s, v9.4s, v5.s[3]\n" + "fmla v25.4s, v9.4s, v6.s[3]\n" + "ldr q9, [%[b_ptr0], #-0x30]\n" + "fmla v18.4s, v10.4s, v4.s[3]\n" + "fmla v22.4s, v10.4s, v5.s[3]\n" + "fmla v26.4s, v10.4s, v6.s[3]\n" + "ldr q10, [%[b_ptr0], #-0x20]\n" + "fmla v19.4s, v11.4s, v4.s[3]\n" + "fmla v23.4s, v11.4s, v5.s[3]\n" + "fmla v27.4s, v11.4s, v6.s[3]\n" + "b.ne 3b\n" + "2:\n" + "ldr q11, [%[b_ptr0], #-0x10]\n" + "prfm PSTL1KEEP, [%[c_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr1]\n" + "prfm PSTL1KEEP, [c_ptr2]\n" + "cbz %[regs], 4f\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "ldr q4, [%[a_ptr0]]\n" + "fmla v20.4s, v8.4s, v1.s[0]\n" + "ldr q5, [a_ptr1]\n" + "fmla v24.4s, v8.4s, v2.s[0]\n" + "ldr q6, [a_ptr2]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v21.4s, v9.4s, v1.s[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "fmla v25.4s, v9.4s, v2.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v0.s[0]\n" + "add a_ptr1, a_ptr1, 
#0x10\n" + "fmla v22.4s, v10.4s, v1.s[0]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "fmla v26.4s, v10.4s, v2.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v0.s[0]\n" + "fmla v23.4s, v11.4s, v1.s[0]\n" + "fmla v27.4s, v11.4s, v2.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v0.s[1]\n" + "fmla v20.4s, v8.4s, v1.s[1]\n" + "fmla v24.4s, v8.4s, v2.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v0.s[1]\n" + "fmla v21.4s, v9.4s, v1.s[1]\n" + "fmla v25.4s, v9.4s, v2.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v0.s[1]\n" + "fmla v22.4s, v10.4s, v1.s[1]\n" + "fmla v26.4s, v10.4s, v2.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v0.s[1]\n" + "fmla v23.4s, v11.4s, v1.s[1]\n" + "fmla v27.4s, v11.4s, v2.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v0.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v1.s[2]\n" + "fmla v24.4s, v8.4s, v2.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v0.s[2]\n" + "fmla v21.4s, v9.4s, v1.s[2]\n" + "fmla v25.4s, v9.4s, v2.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v0.s[2]\n" + "fmla v22.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v0.s[2]\n" + "fmla v23.4s, v11.4s, v1.s[2]\n" + "fmla v27.4s, v11.4s, v2.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v0.s[3]\n" + "fmla v20.4s, v8.4s, v1.s[3]\n" + "fmla v24.4s, v8.4s, v2.s[3]\n" + "ldr q8, [%[b_ptr0], #-0x40]\n" + "fmla v17.4s, v9.4s, v0.s[3]\n" + "fmla v21.4s, v9.4s, v1.s[3]\n" + "fmla v25.4s, v9.4s, v2.s[3]\n" + "ldr q9, [%[b_ptr0], #-0x30]\n" + "fmla v18.4s, v10.4s, v0.s[3]\n" + "fmla v22.4s, v10.4s, v1.s[3]\n" + "fmla v26.4s, v10.4s, v2.s[3]\n" + "ldr q10, [%[b_ptr0], #-0x20]\n" + "fmla v19.4s, v11.4s, v0.s[3]\n" + "fmla v23.4s, v11.4s, v1.s[3]\n" + "fmla v27.4s, v11.4s, v2.s[3]\n" + "ldr q11, [%[b_ptr0], #-0x10]\n" + "fmla v16.4s, v8.4s, v4.s[0]\n" + "fmla v20.4s, v8.4s, v5.s[0]\n" + "fmla v24.4s, v8.4s, v6.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v17.4s, v9.4s, v4.s[0]\n" + "fmla v21.4s, v9.4s, v5.s[0]\n" + "fmla v25.4s, v9.4s, v6.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v4.s[0]\n" + "fmla v22.4s, v10.4s, v5.s[0]\n" + "fmla v26.4s, v10.4s, v6.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v4.s[0]\n" + "fmla v23.4s, v11.4s, v5.s[0]\n" + "fmla v27.4s, v11.4s, v6.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v4.s[1]\n" + "fmla v20.4s, v8.4s, v5.s[1]\n" + "fmla v24.4s, v8.4s, v6.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v4.s[1]\n" + "fmla v21.4s, v9.4s, v5.s[1]\n" + "fmla v25.4s, v9.4s, v6.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v4.s[1]\n" + "fmla v22.4s, v10.4s, v5.s[1]\n" + "fmla v26.4s, v10.4s, v6.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v4.s[1]\n" + "fmla v23.4s, v11.4s, v5.s[1]\n" + "fmla v27.4s, v11.4s, v6.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v4.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v5.s[2]\n" + "fmla v24.4s, v8.4s, v6.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v4.s[2]\n" + "fmla v21.4s, v9.4s, v5.s[2]\n" + "fmla v25.4s, v9.4s, v6.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v4.s[2]\n" + "fmla v22.4s, v10.4s, v5.s[2]\n" + "fmla v26.4s, v10.4s, v6.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v4.s[2]\n" + 
"fmla v23.4s, v11.4s, v5.s[2]\n" + "fmla v27.4s, v11.4s, v6.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v4.s[3]\n" + "add %[b_ptr0], %[b_ptr0], #-0x40\n" + "fmla v20.4s, v8.4s, v5.s[3]\n" + "fmla v24.4s, v8.4s, v6.s[3]\n" + "fmla v17.4s, v9.4s, v4.s[3]\n" + "fmla v21.4s, v9.4s, v5.s[3]\n" + "fmla v25.4s, v9.4s, v6.s[3]\n" + "fmla v18.4s, v10.4s, v4.s[3]\n" + "fmla v22.4s, v10.4s, v5.s[3]\n" + "fmla v26.4s, v10.4s, v6.s[3]\n" + "fmla v19.4s, v11.4s, v4.s[3]\n" + "fmla v23.4s, v11.4s, v5.s[3]\n" + "fmla v27.4s, v11.4s, v6.s[3]\n" + "b 5f\n" + "4:\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "fmla v20.4s, v8.4s, v1.s[0]\n" + "fmla v24.4s, v8.4s, v2.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "fmla v21.4s, v9.4s, v1.s[0]\n" + "fmla v25.4s, v9.4s, v2.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v0.s[0]\n" + "fmla v22.4s, v10.4s, v1.s[0]\n" + "fmla v26.4s, v10.4s, v2.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v0.s[0]\n" + "fmla v23.4s, v11.4s, v1.s[0]\n" + "fmla v27.4s, v11.4s, v2.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v0.s[1]\n" + "fmla v20.4s, v8.4s, v1.s[1]\n" + "fmla v24.4s, v8.4s, v2.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v0.s[1]\n" + "fmla v21.4s, v9.4s, v1.s[1]\n" + "fmla v25.4s, v9.4s, v2.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v0.s[1]\n" + "fmla v22.4s, v10.4s, v1.s[1]\n" + "fmla v26.4s, v10.4s, v2.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v0.s[1]\n" + "fmla v23.4s, v11.4s, v1.s[1]\n" + "fmla v27.4s, v11.4s, v2.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v0.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v1.s[2]\n" + "fmla v24.4s, v8.4s, v2.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v0.s[2]\n" + "fmla v21.4s, v9.4s, v1.s[2]\n" + "fmla v25.4s, v9.4s, v2.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v0.s[2]\n" + "fmla v22.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v0.s[2]\n" + "fmla v23.4s, v11.4s, v1.s[2]\n" + "fmla v27.4s, v11.4s, v2.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v0.s[3]\n" + "add %[b_ptr0], %[b_ptr0], #-0x40\n" + "fmla v20.4s, v8.4s, v1.s[3]\n" + "fmla v24.4s, v8.4s, v2.s[3]\n" + "fmla v17.4s, v9.4s, v0.s[3]\n" + "fmla v21.4s, v9.4s, v1.s[3]\n" + "fmla v25.4s, v9.4s, v2.s[3]\n" + "fmla v18.4s, v10.4s, v0.s[3]\n" + "fmla v22.4s, v10.4s, v1.s[3]\n" + "fmla v26.4s, v10.4s, v2.s[3]\n" + "fmla v19.4s, v11.4s, v0.s[3]\n" + "fmla v23.4s, v11.4s, v1.s[3]\n" + "fmla v27.4s, v11.4s, v2.s[3]\n" + "5:\n" + "cbz %[blocks], 6f\n" + "7:\n" + "ldr q8, [%[b_ptr0]]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "ldr s0, [%[a_ptr0]]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], #0x4\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "ldr s1, [a_ptr1]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "add a_ptr1, a_ptr1, #0x4\n" + "fmla v18.4s, v10.4s, v0.s[0]\n" + "ldr s2, [a_ptr2]\n" + "fmla v20.4s, v8.4s, v1.s[0]\n" + "add a_ptr2, a_ptr2, #0x4\n" + "fmla v21.4s, v9.4s, v1.s[0]\n" + "fmla v24.4s, v8.4s, v2.s[0]\n" + "fmla v25.4s, v9.4s, v2.s[0]\n" + "fmla v22.4s, v10.4s, v1.s[0]\n" + "fmla v26.4s, v10.4s, v2.s[0]\n" + "fmla v19.4s, v11.4s, v0.s[0]\n" + "fmla v23.4s, v11.4s, v1.s[0]\n" + "fmla v27.4s, v11.4s, v2.s[0]\n" + "b.ne 7b\n" + 
"6:\n" + "ld1r {v14.4s}, [%[minptr]]\n" + "ld1r {v15.4s}, [%[maxptr]]\n" + "fmax v16.4s, v16.4s, v14.4s\n" + "fmax v17.4s, v17.4s, v14.4s\n" + "fmax v18.4s, v18.4s, v14.4s\n" + "fmax v19.4s, v19.4s, v14.4s\n" + "fmin v16.4s, v16.4s, v15.4s\n" + "fmin v17.4s, v17.4s, v15.4s\n" + "fmin v18.4s, v18.4s, v15.4s\n" + "fmin v19.4s, v19.4s, v15.4s\n" + "str q16, [%[c_ptr0]]\n" + "fmax v20.4s, v20.4s, v14.4s\n" + "fmax v21.4s, v21.4s, v14.4s\n" + "fmax v22.4s, v22.4s, v14.4s\n" + "str q17, [%[c_ptr0], #0x10]\n" + "fmax v23.4s, v23.4s, v14.4s\n" + "fmin v20.4s, v20.4s, v15.4s\n" + "fmin v21.4s, v21.4s, v15.4s\n" + "str q18, [%[c_ptr0], #0x20]\n" + "fmin v22.4s, v22.4s, v15.4s\n" + "fmin v23.4s, v23.4s, v15.4s\n" + "fmax v24.4s, v24.4s, v14.4s\n" + "str q19, [%[c_ptr0], #0x30]\n" + "fmax v25.4s, v25.4s, v14.4s\n" + "add %[c_ptr0], %[c_ptr0], #0x40\n" + "fmax v26.4s, v26.4s, v14.4s\n" + "str q20, [c_ptr1]\n" + "fmin v24.4s, v24.4s, v15.4s\n" + "fmin v25.4s, v25.4s, v15.4s\n" + "fmax v27.4s, v27.4s, v14.4s\n" + "str q21, [c_ptr1, #0x10]\n" + "fmin v26.4s, v26.4s, v15.4s\n" + "fmin v27.4s, v27.4s, v15.4s\n" + "str q22, [c_ptr1, #0x20]\n" + "str q23, [c_ptr1, #0x30]\n" + "str q24, [c_ptr2]\n" + "str q25, [c_ptr2, #0x10]\n" + "str q26, [c_ptr2, #0x20]\n" + "str q27, [c_ptr2, #0x30]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) + : [width] "r" (width), [append] "r" (static_cast(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory" + ); + break; + default: + case 4: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "cbnz %[append], 1f\n" + "ldr q16, [%[biasptr]]\n" + "ldr q17, [%[biasptr], #0x10]\n" + "ldr q18, [%[biasptr], #0x20]\n" + "ldr q19, [%[biasptr], #0x30]\n" + "mov v20.16b, v16.16b\n" + "ldr q0, [%[a_ptr0]]\n" + "mov v21.16b, v17.16b\n" + "ldr q1, [a_ptr1]\n" + "mov v22.16b, v18.16b\n" + "ldr q2, [a_ptr2]\n" + "mov v23.16b, v19.16b\n" + "ldr q3, [a_ptr3]\n" + "mov v24.16b, v16.16b\n" + "ldr q8, [%[b_ptr0]]\n" + "mov v25.16b, v17.16b\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "mov v26.16b, v18.16b\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "mov v27.16b, v19.16b\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "mov v28.16b, v16.16b\n" + "add a_ptr1, a_ptr1, #0x10\n" + "mov v29.16b, v17.16b\n" + "add a_ptr2, a_ptr2, #0x10\n" + "mov v30.16b, v18.16b\n" + "add a_ptr3, a_ptr3, #0x10\n" + "mov v31.16b, v19.16b\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ldr q16, [%[c_ptr0]]\n" + "ldr q17, [%[c_ptr0], #0x10]\n" + "ldr q18, [%[c_ptr0], #0x20]\n" + "ldr q19, [%[c_ptr0], #0x30]\n" + "ldr q20, [c_ptr1]\n" + "ldr q21, [c_ptr1, #0x10]\n" + "ldr q22, [c_ptr1, #0x20]\n" + "ldr q23, [c_ptr1, #0x30]\n" + "ldr q24, [c_ptr2]\n" + "ldr q25, [c_ptr2, #0x10]\n" + "ldr q26, [c_ptr2, #0x20]\n" + "ldr q27, 
[c_ptr2, #0x30]\n" + "ldr q28, [c_ptr3]\n" + "ldr q29, [c_ptr3, #0x10]\n" + "ldr q30, [c_ptr3, #0x20]\n" + "ldr q31, [c_ptr3, #0x30]\n" + "ldr q0, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr q1, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "ldr q2, [a_ptr2]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "ldr q3, [a_ptr3]\n" + "add a_ptr3, a_ptr3, #0x10\n" + "ldr q8, [%[b_ptr0]]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" + "cbz %[loops], 2f\n" + "3:\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "ldr q11, [%[b_ptr0], #-0x10]\n" + "fmla v20.4s, v8.4s, v1.s[0]\n" + "ldr q4, [%[a_ptr0]]\n" + "fmla v24.4s, v8.4s, v2.s[0]\n" + "ldr q5, [a_ptr1]\n" + "fmla v28.4s, v8.4s, v3.s[0]\n" + "ldr q6, [a_ptr2]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "ldr q7, [a_ptr3]\n" + "fmla v21.4s, v9.4s, v1.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v25.4s, v9.4s, v2.s[0]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla v29.4s, v9.4s, v3.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v0.s[0]\n" + "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" + "fmla v22.4s, v10.4s, v1.s[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "fmla v26.4s, v10.4s, v2.s[0]\n" + "add a_ptr1, a_ptr1, #0x20\n" + "fmla v30.4s, v10.4s, v3.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v0.s[0]\n" + "add a_ptr2, a_ptr2, #0x20\n" + "fmla v23.4s, v11.4s, v1.s[0]\n" + "add a_ptr3, a_ptr3, #0x20\n" + "fmla v27.4s, v11.4s, v2.s[0]\n" + "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" + "fmla v31.4s, v11.4s, v3.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v0.s[1]\n" + "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" + "fmla v20.4s, v8.4s, v1.s[1]\n" + "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" + "fmla v24.4s, v8.4s, v2.s[1]\n" + "fmla v28.4s, v8.4s, v3.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v0.s[1]\n" + "fmla v21.4s, v9.4s, v1.s[1]\n" + "fmla v25.4s, v9.4s, v2.s[1]\n" + "fmla v29.4s, v9.4s, v3.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v0.s[1]\n" + "fmla v22.4s, v10.4s, v1.s[1]\n" + "fmla v26.4s, v10.4s, v2.s[1]\n" + "fmla v30.4s, v10.4s, v3.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v0.s[1]\n" + "fmla v23.4s, v11.4s, v1.s[1]\n" + "fmla v27.4s, v11.4s, v2.s[1]\n" + "fmla v31.4s, v11.4s, v3.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v0.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v1.s[2]\n" + "fmla v24.4s, v8.4s, v2.s[2]\n" + "fmla v28.4s, v8.4s, v3.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v0.s[2]\n" + "fmla v21.4s, v9.4s, v1.s[2]\n" + "fmla v25.4s, v9.4s, v2.s[2]\n" + "fmla v29.4s, v9.4s, v3.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v0.s[2]\n" + "fmla v22.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "fmla v30.4s, v10.4s, v3.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v0.s[2]\n" + "fmla v23.4s, v11.4s, v1.s[2]\n" + "fmla v27.4s, v11.4s, v2.s[2]\n" + "fmla v31.4s, v11.4s, v3.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v0.s[3]\n" + "fmla v20.4s, v8.4s, v1.s[3]\n" + "fmla v24.4s, v8.4s, v2.s[3]\n" + "fmla v28.4s, v8.4s, v3.s[3]\n" + "ldr q8, [%[b_ptr0], #-0x40]\n" + "fmla v17.4s, v9.4s, v0.s[3]\n" + "fmla v21.4s, v9.4s, v1.s[3]\n" + "fmla v25.4s, v9.4s, v2.s[3]\n" + "fmla v29.4s, v9.4s, v3.s[3]\n" + "ldr q9, [%[b_ptr0], #-0x30]\n" + "fmla v18.4s, v10.4s, v0.s[3]\n" + "fmla v22.4s, v10.4s, v1.s[3]\n" + "fmla v26.4s, v10.4s, v2.s[3]\n" + "fmla v30.4s, v10.4s, 
v3.s[3]\n" + "ldr q10, [%[b_ptr0], #-0x20]\n" + "fmla v19.4s, v11.4s, v0.s[3]\n" + "ldr q0, [%[a_ptr0], #-0x10]\n" + "fmla v23.4s, v11.4s, v1.s[3]\n" + "ldr q1, [a_ptr1, #-0x10]\n" + "fmla v27.4s, v11.4s, v2.s[3]\n" + "ldr q2, [a_ptr2, #-0x10]\n" + "fmla v31.4s, v11.4s, v3.s[3]\n" + "ldr q11, [%[b_ptr0], #-0x10]\n" + "fmla v16.4s, v8.4s, v4.s[0]\n" + "ldr q3, [a_ptr3, #-0x10]\n" + "fmla v20.4s, v8.4s, v5.s[0]\n" + "fmla v24.4s, v8.4s, v6.s[0]\n" + "fmla v28.4s, v8.4s, v7.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v17.4s, v9.4s, v4.s[0]\n" + "fmla v21.4s, v9.4s, v5.s[0]\n" + "fmla v25.4s, v9.4s, v6.s[0]\n" + "fmla v29.4s, v9.4s, v7.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v4.s[0]\n" + "fmla v22.4s, v10.4s, v5.s[0]\n" + "fmla v26.4s, v10.4s, v6.s[0]\n" + "fmla v30.4s, v10.4s, v7.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v4.s[0]\n" + "fmla v23.4s, v11.4s, v5.s[0]\n" + "fmla v27.4s, v11.4s, v6.s[0]\n" + "fmla v31.4s, v11.4s, v7.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v4.s[1]\n" + "fmla v20.4s, v8.4s, v5.s[1]\n" + "fmla v24.4s, v8.4s, v6.s[1]\n" + "fmla v28.4s, v8.4s, v7.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v4.s[1]\n" + "fmla v21.4s, v9.4s, v5.s[1]\n" + "fmla v25.4s, v9.4s, v6.s[1]\n" + "fmla v29.4s, v9.4s, v7.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v4.s[1]\n" + "fmla v22.4s, v10.4s, v5.s[1]\n" + "fmla v26.4s, v10.4s, v6.s[1]\n" + "fmla v30.4s, v10.4s, v7.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v4.s[1]\n" + "fmla v23.4s, v11.4s, v5.s[1]\n" + "fmla v27.4s, v11.4s, v6.s[1]\n" + "fmla v31.4s, v11.4s, v7.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v4.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v5.s[2]\n" + "fmla v24.4s, v8.4s, v6.s[2]\n" + "fmla v28.4s, v8.4s, v7.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v4.s[2]\n" + "fmla v21.4s, v9.4s, v5.s[2]\n" + "fmla v25.4s, v9.4s, v6.s[2]\n" + "fmla v29.4s, v9.4s, v7.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v4.s[2]\n" + "fmla v22.4s, v10.4s, v5.s[2]\n" + "fmla v26.4s, v10.4s, v6.s[2]\n" + "fmla v30.4s, v10.4s, v7.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v4.s[2]\n" + "fmla v23.4s, v11.4s, v5.s[2]\n" + "fmla v27.4s, v11.4s, v6.s[2]\n" + "fmla v31.4s, v11.4s, v7.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v4.s[3]\n" + "fmla v20.4s, v8.4s, v5.s[3]\n" + "fmla v24.4s, v8.4s, v6.s[3]\n" + "fmla v28.4s, v8.4s, v7.s[3]\n" + "ldr q8, [%[b_ptr0], #-0x40]\n" + "fmla v17.4s, v9.4s, v4.s[3]\n" + "fmla v21.4s, v9.4s, v5.s[3]\n" + "fmla v25.4s, v9.4s, v6.s[3]\n" + "fmla v29.4s, v9.4s, v7.s[3]\n" + "ldr q9, [%[b_ptr0], #-0x30]\n" + "fmla v18.4s, v10.4s, v4.s[3]\n" + "fmla v22.4s, v10.4s, v5.s[3]\n" + "fmla v26.4s, v10.4s, v6.s[3]\n" + "fmla v30.4s, v10.4s, v7.s[3]\n" + "ldr q10, [%[b_ptr0], #-0x20]\n" + "fmla v19.4s, v11.4s, v4.s[3]\n" + "fmla v23.4s, v11.4s, v5.s[3]\n" + "fmla v27.4s, v11.4s, v6.s[3]\n" + "fmla v31.4s, v11.4s, v7.s[3]\n" + "b.ne 3b\n" + "2:\n" + "ldr q11, [%[b_ptr0], #-0x10]\n" + "prfm PSTL1KEEP, [%[c_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr1]\n" + "prfm PSTL1KEEP, [c_ptr2]\n" + "prfm PSTL1KEEP, [c_ptr3]\n" + "cbz %[regs], 4f\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "ldr q4, [%[a_ptr0]]\n" + "fmla v20.4s, v8.4s, v1.s[0]\n" + "ldr q5, [a_ptr1]\n" + "fmla v24.4s, v8.4s, v2.s[0]\n" + "ldr q6, [a_ptr2]\n" + "fmla v28.4s, v8.4s, v3.s[0]\n" + "ldr q7, [a_ptr3]\n" + "fmla v17.4s, 
v9.4s, v0.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v21.4s, v9.4s, v1.s[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "fmla v25.4s, v9.4s, v2.s[0]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "fmla v29.4s, v9.4s, v3.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v0.s[0]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "fmla v22.4s, v10.4s, v1.s[0]\n" + "add a_ptr3, a_ptr3, #0x10\n" + "fmla v26.4s, v10.4s, v2.s[0]\n" + "fmla v30.4s, v10.4s, v3.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v0.s[0]\n" + "fmla v23.4s, v11.4s, v1.s[0]\n" + "fmla v27.4s, v11.4s, v2.s[0]\n" + "fmla v31.4s, v11.4s, v3.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v0.s[1]\n" + "fmla v20.4s, v8.4s, v1.s[1]\n" + "fmla v24.4s, v8.4s, v2.s[1]\n" + "fmla v28.4s, v8.4s, v3.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v0.s[1]\n" + "fmla v21.4s, v9.4s, v1.s[1]\n" + "fmla v25.4s, v9.4s, v2.s[1]\n" + "fmla v29.4s, v9.4s, v3.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v0.s[1]\n" + "fmla v22.4s, v10.4s, v1.s[1]\n" + "fmla v26.4s, v10.4s, v2.s[1]\n" + "fmla v30.4s, v10.4s, v3.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v0.s[1]\n" + "fmla v23.4s, v11.4s, v1.s[1]\n" + "fmla v27.4s, v11.4s, v2.s[1]\n" + "fmla v31.4s, v11.4s, v3.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v0.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v1.s[2]\n" + "fmla v24.4s, v8.4s, v2.s[2]\n" + "fmla v28.4s, v8.4s, v3.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v0.s[2]\n" + "fmla v21.4s, v9.4s, v1.s[2]\n" + "fmla v25.4s, v9.4s, v2.s[2]\n" + "fmla v29.4s, v9.4s, v3.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v0.s[2]\n" + "fmla v22.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "fmla v30.4s, v10.4s, v3.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v0.s[2]\n" + "fmla v23.4s, v11.4s, v1.s[2]\n" + "fmla v27.4s, v11.4s, v2.s[2]\n" + "fmla v31.4s, v11.4s, v3.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v0.s[3]\n" + "fmla v20.4s, v8.4s, v1.s[3]\n" + "fmla v24.4s, v8.4s, v2.s[3]\n" + "fmla v28.4s, v8.4s, v3.s[3]\n" + "ldr q8, [%[b_ptr0], #-0x40]\n" + "fmla v17.4s, v9.4s, v0.s[3]\n" + "fmla v21.4s, v9.4s, v1.s[3]\n" + "fmla v25.4s, v9.4s, v2.s[3]\n" + "fmla v29.4s, v9.4s, v3.s[3]\n" + "ldr q9, [%[b_ptr0], #-0x30]\n" + "fmla v18.4s, v10.4s, v0.s[3]\n" + "fmla v22.4s, v10.4s, v1.s[3]\n" + "fmla v26.4s, v10.4s, v2.s[3]\n" + "fmla v30.4s, v10.4s, v3.s[3]\n" + "ldr q10, [%[b_ptr0], #-0x20]\n" + "fmla v19.4s, v11.4s, v0.s[3]\n" + "fmla v23.4s, v11.4s, v1.s[3]\n" + "fmla v27.4s, v11.4s, v2.s[3]\n" + "fmla v31.4s, v11.4s, v3.s[3]\n" + "ldr q11, [%[b_ptr0], #-0x10]\n" + "fmla v16.4s, v8.4s, v4.s[0]\n" + "fmla v20.4s, v8.4s, v5.s[0]\n" + "fmla v24.4s, v8.4s, v6.s[0]\n" + "fmla v28.4s, v8.4s, v7.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v17.4s, v9.4s, v4.s[0]\n" + "fmla v21.4s, v9.4s, v5.s[0]\n" + "fmla v25.4s, v9.4s, v6.s[0]\n" + "fmla v29.4s, v9.4s, v7.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v4.s[0]\n" + "fmla v22.4s, v10.4s, v5.s[0]\n" + "fmla v26.4s, v10.4s, v6.s[0]\n" + "fmla v30.4s, v10.4s, v7.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v4.s[0]\n" + "fmla v23.4s, v11.4s, v5.s[0]\n" + "fmla v27.4s, v11.4s, v6.s[0]\n" + "fmla v31.4s, v11.4s, v7.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v4.s[1]\n" + "fmla v20.4s, v8.4s, v5.s[1]\n" + "fmla v24.4s, v8.4s, v6.s[1]\n" + "fmla 
v28.4s, v8.4s, v7.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v4.s[1]\n" + "fmla v21.4s, v9.4s, v5.s[1]\n" + "fmla v25.4s, v9.4s, v6.s[1]\n" + "fmla v29.4s, v9.4s, v7.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v4.s[1]\n" + "fmla v22.4s, v10.4s, v5.s[1]\n" + "fmla v26.4s, v10.4s, v6.s[1]\n" + "fmla v30.4s, v10.4s, v7.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v4.s[1]\n" + "fmla v23.4s, v11.4s, v5.s[1]\n" + "fmla v27.4s, v11.4s, v6.s[1]\n" + "fmla v31.4s, v11.4s, v7.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v4.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v5.s[2]\n" + "fmla v24.4s, v8.4s, v6.s[2]\n" + "fmla v28.4s, v8.4s, v7.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v4.s[2]\n" + "fmla v21.4s, v9.4s, v5.s[2]\n" + "fmla v25.4s, v9.4s, v6.s[2]\n" + "fmla v29.4s, v9.4s, v7.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v4.s[2]\n" + "fmla v22.4s, v10.4s, v5.s[2]\n" + "fmla v26.4s, v10.4s, v6.s[2]\n" + "fmla v30.4s, v10.4s, v7.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v4.s[2]\n" + "fmla v23.4s, v11.4s, v5.s[2]\n" + "fmla v27.4s, v11.4s, v6.s[2]\n" + "fmla v31.4s, v11.4s, v7.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v4.s[3]\n" + "add %[b_ptr0], %[b_ptr0], #-0x40\n" + "fmla v20.4s, v8.4s, v5.s[3]\n" + "fmla v24.4s, v8.4s, v6.s[3]\n" + "fmla v28.4s, v8.4s, v7.s[3]\n" + "fmla v17.4s, v9.4s, v4.s[3]\n" + "fmla v21.4s, v9.4s, v5.s[3]\n" + "fmla v25.4s, v9.4s, v6.s[3]\n" + "fmla v29.4s, v9.4s, v7.s[3]\n" + "fmla v18.4s, v10.4s, v4.s[3]\n" + "fmla v22.4s, v10.4s, v5.s[3]\n" + "fmla v26.4s, v10.4s, v6.s[3]\n" + "fmla v30.4s, v10.4s, v7.s[3]\n" + "fmla v19.4s, v11.4s, v4.s[3]\n" + "fmla v23.4s, v11.4s, v5.s[3]\n" + "fmla v27.4s, v11.4s, v6.s[3]\n" + "fmla v31.4s, v11.4s, v7.s[3]\n" + "b 5f\n" + "4:\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "fmla v20.4s, v8.4s, v1.s[0]\n" + "fmla v24.4s, v8.4s, v2.s[0]\n" + "fmla v28.4s, v8.4s, v3.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "fmla v21.4s, v9.4s, v1.s[0]\n" + "fmla v25.4s, v9.4s, v2.s[0]\n" + "fmla v29.4s, v9.4s, v3.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v0.s[0]\n" + "fmla v22.4s, v10.4s, v1.s[0]\n" + "fmla v26.4s, v10.4s, v2.s[0]\n" + "fmla v30.4s, v10.4s, v3.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v0.s[0]\n" + "fmla v23.4s, v11.4s, v1.s[0]\n" + "fmla v27.4s, v11.4s, v2.s[0]\n" + "fmla v31.4s, v11.4s, v3.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v0.s[1]\n" + "fmla v20.4s, v8.4s, v1.s[1]\n" + "fmla v24.4s, v8.4s, v2.s[1]\n" + "fmla v28.4s, v8.4s, v3.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v0.s[1]\n" + "fmla v21.4s, v9.4s, v1.s[1]\n" + "fmla v25.4s, v9.4s, v2.s[1]\n" + "fmla v29.4s, v9.4s, v3.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v0.s[1]\n" + "fmla v22.4s, v10.4s, v1.s[1]\n" + "fmla v26.4s, v10.4s, v2.s[1]\n" + "fmla v30.4s, v10.4s, v3.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v0.s[1]\n" + "fmla v23.4s, v11.4s, v1.s[1]\n" + "fmla v27.4s, v11.4s, v2.s[1]\n" + "fmla v31.4s, v11.4s, v3.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v0.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v1.s[2]\n" + "fmla v24.4s, v8.4s, v2.s[2]\n" + "fmla v28.4s, v8.4s, v3.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v0.s[2]\n" + "fmla v21.4s, v9.4s, 
v1.s[2]\n" + "fmla v25.4s, v9.4s, v2.s[2]\n" + "fmla v29.4s, v9.4s, v3.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v0.s[2]\n" + "fmla v22.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "fmla v30.4s, v10.4s, v3.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v0.s[2]\n" + "fmla v23.4s, v11.4s, v1.s[2]\n" + "fmla v27.4s, v11.4s, v2.s[2]\n" + "fmla v31.4s, v11.4s, v3.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v0.s[3]\n" + "add %[b_ptr0], %[b_ptr0], #-0x40\n" + "fmla v20.4s, v8.4s, v1.s[3]\n" + "fmla v24.4s, v8.4s, v2.s[3]\n" + "fmla v28.4s, v8.4s, v3.s[3]\n" + "fmla v17.4s, v9.4s, v0.s[3]\n" + "fmla v21.4s, v9.4s, v1.s[3]\n" + "fmla v25.4s, v9.4s, v2.s[3]\n" + "fmla v29.4s, v9.4s, v3.s[3]\n" + "fmla v18.4s, v10.4s, v0.s[3]\n" + "fmla v22.4s, v10.4s, v1.s[3]\n" + "fmla v26.4s, v10.4s, v2.s[3]\n" + "fmla v30.4s, v10.4s, v3.s[3]\n" + "fmla v19.4s, v11.4s, v0.s[3]\n" + "fmla v23.4s, v11.4s, v1.s[3]\n" + "fmla v27.4s, v11.4s, v2.s[3]\n" + "fmla v31.4s, v11.4s, v3.s[3]\n" + "5:\n" + "cbz %[blocks], 6f\n" + "7:\n" + "ldr q8, [%[b_ptr0]]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "ldr s0, [%[a_ptr0]]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], #0x4\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "ldr s1, [a_ptr1]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "add a_ptr1, a_ptr1, #0x4\n" + "fmla v18.4s, v10.4s, v0.s[0]\n" + "ldr s2, [a_ptr2]\n" + "fmla v20.4s, v8.4s, v1.s[0]\n" + "add a_ptr2, a_ptr2, #0x4\n" + "fmla v21.4s, v9.4s, v1.s[0]\n" + "ldr s3, [a_ptr3]\n" + "fmla v24.4s, v8.4s, v2.s[0]\n" + "add a_ptr3, a_ptr3, #0x4\n" + "fmla v25.4s, v9.4s, v2.s[0]\n" + "fmla v28.4s, v8.4s, v3.s[0]\n" + "fmla v29.4s, v9.4s, v3.s[0]\n" + "fmla v22.4s, v10.4s, v1.s[0]\n" + "fmla v26.4s, v10.4s, v2.s[0]\n" + "fmla v30.4s, v10.4s, v3.s[0]\n" + "fmla v19.4s, v11.4s, v0.s[0]\n" + "fmla v23.4s, v11.4s, v1.s[0]\n" + "fmla v27.4s, v11.4s, v2.s[0]\n" + "fmla v31.4s, v11.4s, v3.s[0]\n" + "b.ne 7b\n" + "6:\n" + "ld1r {v14.4s}, [%[minptr]]\n" + "ld1r {v15.4s}, [%[maxptr]]\n" + "fmax v16.4s, v16.4s, v14.4s\n" + "fmax v17.4s, v17.4s, v14.4s\n" + "fmax v18.4s, v18.4s, v14.4s\n" + "fmax v19.4s, v19.4s, v14.4s\n" + "fmin v16.4s, v16.4s, v15.4s\n" + "fmin v17.4s, v17.4s, v15.4s\n" + "fmin v18.4s, v18.4s, v15.4s\n" + "fmin v19.4s, v19.4s, v15.4s\n" + "str q16, [%[c_ptr0]]\n" + "fmax v20.4s, v20.4s, v14.4s\n" + "fmax v21.4s, v21.4s, v14.4s\n" + "fmax v22.4s, v22.4s, v14.4s\n" + "str q17, [%[c_ptr0], #0x10]\n" + "fmax v23.4s, v23.4s, v14.4s\n" + "fmin v20.4s, v20.4s, v15.4s\n" + "fmin v21.4s, v21.4s, v15.4s\n" + "str q18, [%[c_ptr0], #0x20]\n" + "fmin v22.4s, v22.4s, v15.4s\n" + "fmin v23.4s, v23.4s, v15.4s\n" + "fmax v24.4s, v24.4s, v14.4s\n" + "str q19, [%[c_ptr0], #0x30]\n" + "fmax v25.4s, v25.4s, v14.4s\n" + "add %[c_ptr0], %[c_ptr0], #0x40\n" + "fmax v26.4s, v26.4s, v14.4s\n" + "str q20, [c_ptr1]\n" + "fmin v24.4s, v24.4s, v15.4s\n" + "fmin v25.4s, v25.4s, v15.4s\n" + "fmax v27.4s, v27.4s, v14.4s\n" + "str q21, [c_ptr1, #0x10]\n" + "fmin v26.4s, v26.4s, v15.4s\n" + "fmax v28.4s, v28.4s, v14.4s\n" + "fmax v29.4s, v29.4s, v14.4s\n" + "str q22, [c_ptr1, #0x20]\n" + "fmin v27.4s, v27.4s, v15.4s\n" + "fmax v30.4s, v30.4s, v14.4s\n" + "fmin v28.4s, v28.4s, v15.4s\n" + "str q23, [c_ptr1, #0x30]\n" + "fmin v29.4s, v29.4s, v15.4s\n" + "fmax v31.4s, v31.4s, v14.4s\n" + "fmin v30.4s, v30.4s, v15.4s\n" + "str q24, [c_ptr2]\n" + "fmin v31.4s, v31.4s, 
v15.4s\n" + "str q25, [c_ptr2, #0x10]\n" + "str q26, [c_ptr2, #0x20]\n" + "str q27, [c_ptr2, #0x30]\n" + "str q28, [c_ptr3]\n" + "str q29, [c_ptr3, #0x10]\n" + "str q30, [c_ptr3, #0x20]\n" + "str q31, [c_ptr3, #0x30]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) + : [width] "r" (width), [append] "r" (static_cast(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" + ); + break; + } + if (use_result_buffer) { + for(int cy=0; cy 8) { + if (rows_to_compute % 8) { + rows_to_compute = 8 - 1; + } else { + rows_to_compute = 8; + } + } + for (int x0=0; x0 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0get_cpu_model() == CPUModel::X1) { + kernel = a64_interleaved_bf16fp32_dot_12x8_x1; + } + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp index 8ce6a601fd..7ffae524dc 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
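Editor's note: the header change just above defaults the kernel pointer to the generic `a64_interleaved_bf16fp32_dot_12x8` and swaps in the new `_x1` variant when the detected core is a Cortex-X1. A minimal sketch of that constructor-time dispatch idiom follows; the class and constructor names here are illustrative assumptions, only `get_cpu_model()`, `CPUModel::X1` and the two kernel symbols are taken from the patch.

// Sketch of the CPU-model dispatch pattern applied in the hunk above.
// Class/constructor names are hypothetical; the selection logic mirrors the diff.
class cls_interleaved_bf16fp32_dot_12x8_example {
public:
    typedef void (*kern_type)(const bfloat16 *, const bfloat16 *, float *, int, int, int);

    // Default to the generic kernel...
    kern_type kernel = a64_interleaved_bf16fp32_dot_12x8;

    // ...but prefer the Cortex-X1 tuned variant when the CPUInfo reports one.
    cls_interleaved_bf16fp32_dot_12x8_example(const CPUInfo *ci) {
        if (ci->get_cpu_model() == CPUModel::X1) {
            kernel = a64_interleaved_bf16fp32_dot_12x8_x1;
        }
    }
};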
* * SPDX-License-Identifier: MIT * @@ -57,13 +57,11 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B "movi v12.4s, #0\n" "ldr q2, [%[a_ptr], #0x20]\n" "movi v13.4s, #0\n" - "ldr q6, [%[b_ptr], #0x20]\n" + "add %[a_ptr], %[a_ptr], #0x40\n" "movi v14.4s, #0\n" - "ldr q3, [%[a_ptr], #0x30]\n" + "add %[b_ptr], %[b_ptr], #0x30\n" "movi v15.4s, #0\n" - "add %[a_ptr], %[a_ptr], #0x40\n" "movi v16.4s, #0\n" - "add %[b_ptr], %[b_ptr], #0x30\n" "movi v17.4s, #0\n" "movi v18.4s, #0\n" "movi v19.4s, #0\n" @@ -82,9 +80,11 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B "cbz %[loops], 1f\n" "2:\n" ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n" - "subs %[loops], %[loops], #0x1\n" + "ldr q6, [%[b_ptr], #-0x10]\n" ".inst 0x4f60f089 // bfdot v9.4s, v4.8h, v0.h[1]\n" + "ldr q3, [%[a_ptr], #-0x10]\n" ".inst 0x4f40f88a // bfdot v10.4s, v4.8h, v0.h[2]\n" + "subs %[loops], %[loops], #0x1\n" ".inst 0x4f60f88b // bfdot v11.4s, v4.8h, v0.h[3]\n" ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n" ".inst 0x4f61f095 // bfdot v21.4s, v4.8h, v1.h[1]\n" @@ -140,13 +140,13 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B ".inst 0x4f63f0dd // bfdot v29.4s, v6.8h, v3.h[1]\n" ".inst 0x4f43f8de // bfdot v30.4s, v6.8h, v3.h[2]\n" ".inst 0x4f63f8df // bfdot v31.4s, v6.8h, v3.h[3]\n" - "ldr q6, [%[b_ptr], #-0x10]\n" - "ldr q3, [%[a_ptr], #-0x10]\n" "b.ne 2b\n" "1:\n" "cbz %[tails], 3f\n" ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n" + "ldr q6, [%[b_ptr], #-0x10]\n" ".inst 0x4f60f089 // bfdot v9.4s, v4.8h, v0.h[1]\n" + "ldr q3, [%[a_ptr], #-0x10]\n" ".inst 0x4f40f88a // bfdot v10.4s, v4.8h, v0.h[2]\n" ".inst 0x4f60f88b // bfdot v11.4s, v4.8h, v0.h[3]\n" ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n" @@ -178,12 +178,13 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B ".inst 0x4f62f089 // bfdot v9.4s, v4.8h, v2.h[1]\n" "add %[a_ptr], %[a_ptr], #0x20\n" ".inst 0x4f42f88a // bfdot v10.4s, v4.8h, v2.h[2]\n" + "add %[b_ptr], %[b_ptr], #0x60\n" ".inst 0x4f62f88b // bfdot v11.4s, v4.8h, v2.h[3]\n" ".inst 0x4f43f094 // bfdot v20.4s, v4.8h, v3.h[0]\n" ".inst 0x4f63f095 // bfdot v21.4s, v4.8h, v3.h[1]\n" ".inst 0x4f43f896 // bfdot v22.4s, v4.8h, v3.h[2]\n" ".inst 0x4f63f897 // bfdot v23.4s, v4.8h, v3.h[3]\n" - "ldr q4, [%[b_ptr], #0x30]\n" + "ldr q4, [%[b_ptr], #-0x30]\n" ".inst 0x4f42f0ac // bfdot v12.4s, v5.8h, v2.h[0]\n" ".inst 0x4f62f0ad // bfdot v13.4s, v5.8h, v2.h[1]\n" ".inst 0x4f42f8ae // bfdot v14.4s, v5.8h, v2.h[2]\n" @@ -192,7 +193,7 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B ".inst 0x4f63f0b9 // bfdot v25.4s, v5.8h, v3.h[1]\n" ".inst 0x4f43f8ba // bfdot v26.4s, v5.8h, v3.h[2]\n" ".inst 0x4f63f8bb // bfdot v27.4s, v5.8h, v3.h[3]\n" - "ldr q5, [%[b_ptr], #0x40]\n" + "ldr q5, [%[b_ptr], #-0x20]\n" ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" ".inst 0x4f62f0d1 // bfdot v17.4s, v6.8h, v2.h[1]\n" ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" @@ -201,13 +202,12 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B ".inst 0x4f63f0dd // bfdot v29.4s, v6.8h, v3.h[1]\n" ".inst 0x4f43f8de // bfdot v30.4s, v6.8h, v3.h[2]\n" ".inst 0x4f63f8df // bfdot v31.4s, v6.8h, v3.h[3]\n" - "ldr q6, [%[b_ptr], #0x50]\n" + "ldr q6, [%[b_ptr], #-0x10]\n" ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n" - "add %[b_ptr], %[b_ptr], #0x60\n" ".inst 0x4f60f089 // bfdot v9.4s, v4.8h, v0.h[1]\n" ".inst 0x4f40f88a // 
bfdot v10.4s, v4.8h, v0.h[2]\n" - "str q8, [%[c_ptr]]\n" ".inst 0x4f60f88b // bfdot v11.4s, v4.8h, v0.h[3]\n" + "str q8, [%[c_ptr]]\n" ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n" ".inst 0x4f61f095 // bfdot v21.4s, v4.8h, v1.h[1]\n" ".inst 0x4f41f896 // bfdot v22.4s, v4.8h, v1.h[2]\n" @@ -234,14 +234,17 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B "b 4f\n" "3:\n" ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n" + "ldr q6, [%[b_ptr], #-0x10]\n" ".inst 0x4f60f089 // bfdot v9.4s, v4.8h, v0.h[1]\n" + "ldr q3, [%[a_ptr], #-0x10]\n" ".inst 0x4f40f88a // bfdot v10.4s, v4.8h, v0.h[2]\n" + "add %[b_ptr], %[b_ptr], #0x30\n" ".inst 0x4f60f88b // bfdot v11.4s, v4.8h, v0.h[3]\n" ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n" ".inst 0x4f61f095 // bfdot v21.4s, v4.8h, v1.h[1]\n" ".inst 0x4f41f896 // bfdot v22.4s, v4.8h, v1.h[2]\n" ".inst 0x4f61f897 // bfdot v23.4s, v4.8h, v1.h[3]\n" - "ldr q4, [%[b_ptr]]\n" + "ldr q4, [%[b_ptr], #-0x30]\n" ".inst 0x4f40f0ac // bfdot v12.4s, v5.8h, v0.h[0]\n" ".inst 0x4f60f0ad // bfdot v13.4s, v5.8h, v0.h[1]\n" ".inst 0x4f40f8ae // bfdot v14.4s, v5.8h, v0.h[2]\n" @@ -250,7 +253,7 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B ".inst 0x4f61f0b9 // bfdot v25.4s, v5.8h, v1.h[1]\n" ".inst 0x4f41f8ba // bfdot v26.4s, v5.8h, v1.h[2]\n" ".inst 0x4f61f8bb // bfdot v27.4s, v5.8h, v1.h[3]\n" - "ldr q5, [%[b_ptr], #0x10]\n" + "ldr q5, [%[b_ptr], #-0x20]\n" ".inst 0x4f40f0d0 // bfdot v16.4s, v6.8h, v0.h[0]\n" ".inst 0x4f60f0d1 // bfdot v17.4s, v6.8h, v0.h[1]\n" ".inst 0x4f40f8d2 // bfdot v18.4s, v6.8h, v0.h[2]\n" @@ -259,13 +262,12 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B ".inst 0x4f61f0dd // bfdot v29.4s, v6.8h, v1.h[1]\n" ".inst 0x4f41f8de // bfdot v30.4s, v6.8h, v1.h[2]\n" ".inst 0x4f61f8df // bfdot v31.4s, v6.8h, v1.h[3]\n" - "ldr q6, [%[b_ptr], #0x20]\n" + "ldr q6, [%[b_ptr], #-0x10]\n" ".inst 0x4f42f088 // bfdot v8.4s, v4.8h, v2.h[0]\n" - "add %[b_ptr], %[b_ptr], #0x30\n" ".inst 0x4f62f089 // bfdot v9.4s, v4.8h, v2.h[1]\n" ".inst 0x4f42f88a // bfdot v10.4s, v4.8h, v2.h[2]\n" - "str q8, [%[c_ptr]]\n" ".inst 0x4f62f88b // bfdot v11.4s, v4.8h, v2.h[3]\n" + "str q8, [%[c_ptr]]\n" ".inst 0x4f43f094 // bfdot v20.4s, v4.8h, v3.h[0]\n" ".inst 0x4f63f095 // bfdot v21.4s, v4.8h, v3.h[1]\n" ".inst 0x4f43f896 // bfdot v22.4s, v4.8h, v3.h[2]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp new file mode 100644 index 0000000000..58a51432fd --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp @@ -0,0 +1,328 @@ +/* + * Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __aarch64__ + +#include "../../bfloat.hpp" +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void a64_interleaved_bf16fp32_dot_12x8_x1(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { + const bfloat16 *a_ptr = Apanel; + float *c_ptr = Cpanel; + + K /= 2; + const long loops_count = (K / 2) - 1; + const long tails_count = K % 2; + + for (int yb=0; yb + +#include "../../asmlib.hpp" + +// Kernel implementation. +// +// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. +// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order. +// Assume that "Cpanel" points to a chunk of C output blocks (each size +// 12x8), the chunks being arranged in a row major fashion. +// +// Note that the intent of this is that either ablocks or bblocks will be 1 +// - this construction allows the output loop to proceed in either order. + +namespace arm_gemm { + +void a64_sgemm_asimd_12x8_x1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { + const float *a_ptr = Apanel; + float *c_ptr = Cpanel; + + for (int yb=0; yb 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0())) { const long width = std::min((unsigned long)N-x0, (4 * get_vector_length())); long loops = loops_count; @@ -79,7 +90,7 @@ void sve_hybrid_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B const unsigned long ldcb = ldc * sizeof(float); const float *biasptr = bias ? bias+x0 : nullbias; - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "whilelt p6.h, %[temp], %[leftovers]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp index d889f99f8f..551c6f3a8c 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -78,7 +78,10 @@ public: // Default to the generic kernel kern_type kernel=sve_hybrid_bf16fp32_mmla_4VLx4; - hybrid_bf16fp32_mmla_4VLx4(const CPUInfo *ci) { UNUSED(ci); } + hybrid_bf16fp32_mmla_4VLx4(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp index e3debe508d..4b67d747e2 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
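Editor's note: the comment in the new a64_sgemm_12x8/x1.cpp above describes the panel layout (A blocks of 8xK, B blocks of 12xK, C tiles of 12x8 in row-major order, with either ablocks or bblocks equal to 1). A scalar reference sketch of what one such interleaved block kernel computes is given below; the function name and the exact within-panel ordering are assumptions for illustration, not part of the patch.

// Scalar reference for an interleaved 12x8 block kernel, assuming A panels are
// stored K-major with 8 values per k step and B panels K-major with 12 values
// per k step. Illustrative only; not the patch's implementation.
static void interleaved_12x8_reference(const float *Apanel, const float *Bpanel,
                                       float *Cpanel, int ablocks, int bblocks, int K) {
    float *c_ptr = Cpanel;
    const float *a_ptr = Apanel;
    for (int yb = 0; yb < ablocks; yb++) {
        const float *a_base = a_ptr;
        const float *b_ptr = Bpanel;            // B panel restarts for each A block
        for (int xb = 0; xb < bblocks; xb++) {
            float acc[8][12] = {};               // one 8x12 output tile
            const float *a = a_base;
            for (int k = 0; k < K; k++) {
                for (int i = 0; i < 8; i++)
                    for (int j = 0; j < 12; j++)
                        acc[i][j] += a[i] * b_ptr[j];
                a += 8;
                b_ptr += 12;
            }
            for (int i = 0; i < 8; i++)          // write the tile row-major
                for (int j = 0; j < 12; j++)
                    *c_ptr++ = acc[i][j];
        }
        a_ptr = a_base + 8 * K;                  // next A block
    }
}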
* * SPDX-License-Identifier: MIT * @@ -62,12 +62,23 @@ void sve_hybrid_bf16fp32_mmla_4VLx4(const bfloat16 *A, int lda, const bfloat16 * break; } - for (int y=0; y 8) { + if (rows_to_compute % 8) { + rows_to_compute = 8 - 1; + } else { + rows_to_compute = 8; + } + } + for (int x0=0; x0())) { const long width = std::min((unsigned long)N-x0, (2 * get_vector_length())); long loops = loops_count; @@ -79,7 +90,7 @@ void sve_hybrid_bf16fp32_mmla_4VLx4(const bfloat16 *A, int lda, const bfloat16 * const unsigned long ldcb = ldc * sizeof(float); const float *biasptr = bias ? bias+x0 : nullbias; - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "whilelt p6.h, %[temp], %[leftovers]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp index affcafe4aa..6f26fd1404 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -78,7 +78,10 @@ public: // Default to the generic kernel kern_type kernel=sve_hybrid_bf16fp32_mmla_6VLx2; - hybrid_bf16fp32_mmla_6VLx2(const CPUInfo *ci) { UNUSED(ci); } + hybrid_bf16fp32_mmla_6VLx2(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp index 07ecbf35cd..fb943fe6fe 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -62,12 +62,23 @@ void sve_hybrid_bf16fp32_mmla_6VLx2(const bfloat16 *A, int lda, const bfloat16 * break; } - for (int y=0; y 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0())) { const long width = std::min((unsigned long)N-x0, (3 * get_vector_length())); long loops = loops_count; @@ -79,7 +90,7 @@ void sve_hybrid_bf16fp32_mmla_6VLx2(const bfloat16 *A, int lda, const bfloat16 * const unsigned long ldcb = ldc * sizeof(float); const float *biasptr = bias ? bias+x0 : nullbias; - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "whilelt p6.h, %[temp], %[leftovers]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp index af8babd113..0bf4492fdc 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
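Editor's note: the repeated hunks above replace the plain per-row loop with a `rows_to_compute` row-blocking scheme and switch the assembly variant on `rows_to_compute` instead of `M-y`. The comparison operators in these hunks are not fully legible in this copy of the patch, so the sketch below reconstructs the presumed helper logic under stated assumptions; the constant 8 is the kernel's out_height(), and the 4VLx4 kernels use 4 in its place.

// Sketch only: the loop bounds and '<'/'>' comparisons are assumptions inferred
// from the surviving fragments of the diff, not verbatim patch content.
static int next_row_block(int M, int y) {
    int rows_to_compute = M - y;         // rows still to be processed
    if (rows_to_compute > 8) {
        if (rows_to_compute % 8) {
            rows_to_compute = 8 - 1;     // take a 7-row block this pass
        } else {
            rows_to_compute = 8;         // take a full 8-row block
        }
    }
    return rows_to_compute;              // switch(rows_to_compute) then picks the asm variant
}

The outer loop presumably advances `y += rows_to_compute` until all M rows are covered.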
* * SPDX-License-Identifier: MIT * @@ -78,7 +78,10 @@ public: // Default to the generic kernel kern_type kernel=sve_hybrid_bf16fp32_mmla_8VLx2; - hybrid_bf16fp32_mmla_8VLx2(const CPUInfo *ci) { UNUSED(ci); } + hybrid_bf16fp32_mmla_8VLx2(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp index 73196164a7..3f201f0656 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -62,12 +62,23 @@ void sve_hybrid_bf16fp32_mmla_8VLx2(const bfloat16 *A, int lda, const bfloat16 * break; } - for (int y=0; y 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0())) { const long width = std::min((unsigned long)N-x0, (4 * get_vector_length())); long loops = loops_count; @@ -79,7 +90,7 @@ void sve_hybrid_bf16fp32_mmla_8VLx2(const bfloat16 *A, int lda, const bfloat16 * const unsigned long ldcb = ldc * sizeof(float); const float *biasptr = bias ? bias+x0 : nullbias; - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "whilelt p6.h, %[temp], %[leftovers]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp index 28ef8071c2..fb27b7e103 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -78,7 +78,10 @@ public: // Default to the generic kernel kern_type kernel=sve_hybrid_fp16_mla_4VLx4; - hybrid_fp16_mla_4VLx4(const CPUInfo *ci) { UNUSED(ci); } + hybrid_fp16_mla_4VLx4(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp index 2998f33d87..3aef916ad2 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -61,12 +61,23 @@ void sve_hybrid_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, __fp16 break; } - for (int y=0; y 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0())) { const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<__fp16>())); long loops = loops_count; @@ -78,7 +89,7 @@ void sve_hybrid_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, __fp16 const unsigned long ldcb = ldc * sizeof(__fp16); const __fp16 *biasptr = bias ? 
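Editor's note: several hunks above apply the same small cleanup to the kernel-descriptor constructors, replacing a named but unused `CPUInfo` parameter wrapped in `UNUSED()` with an unnamed parameter. A tiny example of the idiom, on a hypothetical kernel class, is shown below.

// Hypothetical class illustrating the constructor cleanup from the diff:
struct hybrid_example_kernel {
    // before: hybrid_example_kernel(const CPUInfo *ci) { UNUSED(ci); }
    // after: an unnamed parameter documents that the CPUInfo is deliberately ignored.
    hybrid_example_kernel(const CPUInfo *) {}
};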
bias+x0 : nullbias; - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "whilelt p6.h, %[temp], %[leftovers]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp index 8e3c17917b..28e00305f7 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -78,7 +78,10 @@ public: // Default to the generic kernel kern_type kernel=sve_hybrid_fp32_mla_4VLx4; - hybrid_fp32_mla_4VLx4(const CPUInfo *ci) { UNUSED(ci); } + hybrid_fp32_mla_4VLx4(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp index 855d27a151..6b55959e2a 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -61,12 +61,23 @@ void sve_hybrid_fp32_mla_4VLx4(const float *A, int lda, const float *B, float *C break; } - for (int y=0; y 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0())) { const long width = std::min((unsigned long)N-x0, (4 * get_vector_length())); long loops = loops_count; @@ -78,7 +89,7 @@ void sve_hybrid_fp32_mla_4VLx4(const float *A, int lda, const float *B, float *C const unsigned long ldcb = ldc * sizeof(float); const float *biasptr = bias ? bias+x0 : nullbias; - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "whilelt p6.s, %[temp], %[leftovers]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp new file mode 100644 index 0000000000..4bdf4e1d80 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2018-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#pragma once + +#ifdef __ARM_FEATURE_SVE + + +#include "../std_transforms_sve.hpp" + +namespace arm_gemm +{ + +// Actual kernel implementations +void sve_hybrid_fp32_mmla_4VLx4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); + +class hybrid_fp32_mmla_4VLx4 +{ +public: + typedef float operand_type; + typedef float result_type; + + typedef void (*kern_type)(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 8; + } + + static unsigned int out_width() + { + return get_vector_length() * 2; + } + + static constexpr unsigned int k_unroll() + { + return 2; + } + + static constexpr bool supports_append() + { + return true; + } + + static constexpr bool supports_bias() + { + return true; + } + + static constexpr bool supports_activation() + { + return true; + } + + StdTransformsSVE transforms = {}; + + // Default to the generic kernel + kern_type kernel=sve_hybrid_fp32_mmla_4VLx4; + + hybrid_fp32_mmla_4VLx4(const CPUInfo *) + { + + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp new file mode 100644 index 0000000000..d8ed307c4b --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp @@ -0,0 +1,3459 @@ +/* + * Copyright (c) 2018-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
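// Illustrative sketch, not part of the patch: how the blocking parameters of the new
// hybrid_fp32_mmla_4VLx4 class above translate into output-tile sizes.  The template
// argument of get_vector_length() is elided in the flattened text and assumed to be
// float here; it returns the number of 32-bit lanes, so out_width() is two fp32 vectors
// per row while out_height() fixes 8 rows.
constexpr unsigned int fp32_lanes(unsigned int vl_bits) { return vl_bits / 32; }
constexpr unsigned int tile_cols(unsigned int vl_bits)  { return 2 * fp32_lanes(vl_bits); }

static_assert(tile_cols(256) == 16, "256-bit SVE -> 8 x 16 fp32 output tile");
static_assert(tile_cols(512) == 32, "512-bit SVE -> 8 x 32 fp32 output tile");
// k_unroll() == 2 lines up with K_stride = ((K + 1) / 2) * 2 in generic.cpp: K is padded
// to pairs, consistent with the FP32 FMMLA operating on 2x2 blocks of the inner dimension.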
+ */ +#ifdef __ARM_FEATURE_SVE + +#include + +#include "arm_gemm.hpp" + +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void sve_hybrid_fp32_mmla_4VLx4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool append) { + const int K_stride = ((K + 1) / 2) * 2; + const long loops_count = ((K + 4) / 8) - 1; + K -= loops_count * 8; + const long regs_count = (K / 4) - 1; + K -= (regs_count + 1) * 4; + const long leftovers = K; + const long blocks_count = (K + 1) / 2; + float nullbias[128]; + if (!append && !bias) { + memset(nullbias, 0, (2 * get_vector_length() * sizeof(float))); + } + float minval = - static_cast(std::numeric_limits::infinity()); + float maxval = static_cast(std::numeric_limits::infinity()); + const float * const minptr = &minval; + const float * const maxptr = &maxval; + + switch(act.type) + { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + maxval = static_cast(act.param1); + /* fall through */ + case Activation::Type::ReLU: + minval = 0.0f; + break; + } + + int rows_to_compute; + + for (int y=0; y 8) { + if (rows_to_compute % 8) { + rows_to_compute = 8 - 1; + } else { + rows_to_compute = 8; + } + } + + for (int x0=0; x0())) { + const long width = std::min((unsigned long)N-x0, (2 * get_vector_length())); + long loops = loops_count; + long regs = regs_count; + long temp = 0; + long blocks = blocks_count; + const float *a_ptr0 = a_ptr0_base; + const float *b_ptr0 = B + (K_stride * x0); + const unsigned long ldcb = ldc * sizeof(float); + const float *biasptr = bias ? bias+x0 : nullbias; + + switch(rows_to_compute) { + case 1: + __asm __volatile ( + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "cbnz %[append], 1f\n" + "mov z1.s, #0\n" + "ld1w z15.s, p0/z, [%[biasptr]]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "zip1 z16.s, z15.s, z15.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "zip2 z17.s, z15.s, z15.s\n" + "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "zip1 z18.s, z15.s, z15.s\n" + "zip2 z19.s, z15.s, z15.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "mov z14.s, #0\n" + "ld1w z13.s, p0/z, [%[c_ptr0]]\n" + "mov z1.s, #0\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "zip1 z16.s, z13.s, z14.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "zip2 z17.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "mov z14.s, #0\n" + "zip1 z18.s, z13.s, z14.s\n" + "zip2 z19.s, z13.s, z14.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "cbz %[loops], 2f\n" + "3:\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1w z15.s, p7/z, 
[%[b_ptr0], #3, MUL VL]\n" + "mov z5.s, #0\n" + "subs %[loops], %[loops], #0x1\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "trn2 z8.d, z4.d, z5.d\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn1 z0.d, z4.d, z5.d\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "mov z1.s, #0\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "b.ne 3b\n" + "2:\n" + "cbz %[regs], 4f\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z5.s, #0\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "trn2 z8.d, z4.d, z5.d\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn1 z0.d, z4.d, z5.d\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "mov z1.s, #0\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "addvl %[b_ptr0], %[b_ptr0], #-4\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl %[a_ptr0], %[a_ptr0], #2\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "trn1 z8.d, z0.d, z1.d\n" + "cbz %[blocks], 5f\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + 
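// Illustrative sketch, not part of the patch: the K decomposition computed at the top of
// the new generic.cpp, evaluated for one concrete depth so the counts that steer the
// assembly above (labels 3:, 2:, 4:, 5:) are easier to follow.  Each main-loop pass
// advances a_ptr0 by 0x20 bytes, i.e. 8 fp32 K values.
#include <cassert>
static void k_decomposition_example() {
    int K = 21;                                   // example depth
    const int  K_stride     = ((K + 1) / 2) * 2;  // 22: K padded to pairs (k_unroll == 2)
    const long loops_count  = ((K + 4) / 8) - 1;  // 2 : unrolled main-loop passes of 8
    K -= loops_count * 8;                         // 5 remaining
    const long regs_count   = (K / 4) - 1;        // 0 : drives the "cbz %[regs]" tail selection
    K -= (regs_count + 1) * 4;                    // 1 remaining
    const long leftovers    = K;                  // 1 : governs the p6 predicate (whilelt)
    const long blocks_count = (K + 1) / 2;        // 1 : remaining K pairs after the tail branch
    assert(K_stride == 22 && loops_count == 2 && regs_count == 0 &&
           leftovers == 1 && blocks_count == 1);
}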
"ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "b 5f\n" + "4:\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z5.s, #0\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + "cbz %[blocks], 5f\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "5:\n" + "ld1rw z14.s, p7/z, [%[minptr]]\n" + "ld1rw z15.s, p7/z, [%[maxptr]]\n" + "fmax z16.s, p7/m, z16.s, z14.s\n" + "fmax z17.s, p7/m, z17.s, z14.s\n" + "fmax z18.s, p7/m, z18.s, z14.s\n" + "fmax z19.s, p7/m, z19.s, z14.s\n" + "fmin z16.s, p7/m, z16.s, z15.s\n" + "fmin z17.s, p7/m, z17.s, z15.s\n" + "fmin z18.s, p7/m, z18.s, z15.s\n" + "fmin z19.s, p7/m, z19.s, z15.s\n" + "uzp1 z0.s, z16.s, z17.s\n" + "uzp1 z1.s, z18.s, z19.s\n" + "st1w z0.s, p0, [%[c_ptr0]]\n" + "st1w z1.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #2\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [width] "r" (width), [append] "r" (static_cast(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 2: + __asm __volatile ( + "a_ptr1 .req X0\n" + "c_ptr1 .req X1\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" 
+ "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "cbnz %[append], 1f\n" + "ld1w z15.s, p0/z, [%[biasptr]]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip1 z16.s, z15.s, z15.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "zip2 z17.s, z15.s, z15.s\n" + "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "zip1 z18.s, z15.s, z15.s\n" + "zip2 z19.s, z15.s, z15.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ld1w z13.s, p0/z, [%[c_ptr0]]\n" + "ld1w z14.s, p0/z, [c_ptr1]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip1 z16.s, z13.s, z14.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "zip2 z17.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "zip1 z18.s, z13.s, z14.s\n" + "zip2 z19.s, z13.s, z14.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "cbz %[loops], 2f\n" + "3:\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "trn1 z0.d, z4.d, z5.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn2 z8.d, z4.d, z5.d\n" + "subs %[loops], %[loops], #0x1\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "add a_ptr1, a_ptr1, #0x20\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "trn1 z8.d, 
z0.d, z1.d\n" + "b.ne 3b\n" + "2:\n" + "cbz %[regs], 4f\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "trn1 z0.d, z4.d, z5.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "addvl a_ptr1, a_ptr1, #2\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #-4\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "addvl %[a_ptr0], %[a_ptr0], #2\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "trn1 z8.d, z0.d, z1.d\n" + "cbz %[blocks], 5f\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "b 5f\n" + "4:\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "addvl a_ptr1, a_ptr1, #1\n" + 
"trn1 z0.d, z4.d, z5.d\n" + "cbz %[blocks], 5f\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "5:\n" + "ld1rw z14.s, p7/z, [%[minptr]]\n" + "ld1rw z15.s, p7/z, [%[maxptr]]\n" + "fmax z16.s, p7/m, z16.s, z14.s\n" + "fmax z17.s, p7/m, z17.s, z14.s\n" + "fmax z18.s, p7/m, z18.s, z14.s\n" + "fmax z19.s, p7/m, z19.s, z14.s\n" + "fmin z16.s, p7/m, z16.s, z15.s\n" + "fmin z17.s, p7/m, z17.s, z15.s\n" + "fmin z18.s, p7/m, z18.s, z15.s\n" + "fmin z19.s, p7/m, z19.s, z15.s\n" + "uzp1 z0.s, z16.s, z17.s\n" + "uzp2 z1.s, z16.s, z17.s\n" + "uzp1 z2.s, z18.s, z19.s\n" + "uzp2 z3.s, z18.s, z19.s\n" + "st1w z0.s, p0, [%[c_ptr0]]\n" + "st1w z1.s, p0, [c_ptr1]\n" + "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #2\n" + "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq c_ptr1\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [width] "r" (width), [append] "r" (static_cast(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" + ); + break; + case 3: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "c_ptr1 .req X2\n" + "c_ptr2 .req X3\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "cbnz %[append], 1f\n" + "mov z3.s, #0\n" + "ld1w z15.s, p0/z, [%[biasptr]]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip1 z16.s, z15.s, z15.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "zip2 z17.s, z15.s, z15.s\n" + "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "zip1 z18.s, z15.s, z15.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "zip2 z19.s, z15.s, z15.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "trn1 z9.d, z2.d, z3.d\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "mov z20.d, z16.d\n" + "mov z21.d, z17.d\n" + "mov z22.d, z18.d\n" + "mov z23.d, z19.d\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "mov z3.s, 
#0\n" + "ld1w z13.s, p0/z, [%[c_ptr0]]\n" + "ld1w z14.s, p0/z, [c_ptr1]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip1 z16.s, z13.s, z14.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "zip2 z17.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "trn1 z9.d, z2.d, z3.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "zip1 z18.s, z13.s, z14.s\n" + "zip2 z19.s, z13.s, z14.s\n" + "ld1w z13.s, p0/z, [c_ptr2]\n" + "mov z14.s, #0\n" + "zip1 z20.s, z13.s, z14.s\n" + "zip2 z21.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" + "mov z14.s, #0\n" + "zip1 z22.s, z13.s, z14.s\n" + "zip2 z23.s, z13.s, z14.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "cbz %[loops], 2f\n" + "3:\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "subs %[loops], %[loops], #0x1\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "add a_ptr1, a_ptr1, #0x20\n" + "trn2 z8.d, z4.d, z5.d\n" + "add a_ptr2, a_ptr2, #0x20\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z7.s, #0\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "trn2 z9.d, z6.d, z7.d\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn1 z1.d, z6.d, z7.d\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "mov z3.s, #0\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, 
z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "trn1 z8.d, z0.d, z1.d\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "trn1 z9.d, z2.d, z3.d\n" + "b.ne 3b\n" + "2:\n" + "cbz %[regs], 4f\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl a_ptr2, a_ptr2, #2\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "trn2 z8.d, z4.d, z5.d\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z7.s, #0\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "trn2 z9.d, z6.d, z7.d\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn1 z1.d, z6.d, z7.d\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "mov z3.s, #0\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "addvl %[b_ptr0], %[b_ptr0], #-4\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "addvl %[a_ptr0], %[a_ptr0], #2\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl a_ptr1, a_ptr1, #2\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "trn1 z8.d, z0.d, z1.d\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "trn1 z9.d, z2.d, z3.d\n" + "cbz %[blocks], 5f\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1w z12.s, p7/z, 
[%[b_ptr0]]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "b 5f\n" + "4:\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z6.s, p6/z, [a_ptr2]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl a_ptr1, a_ptr1, #1\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "addvl a_ptr2, a_ptr2, #1\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z7.s, #0\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "trn1 z1.d, z6.d, z7.d\n" + "cbz %[blocks], 5f\n" + "trn2 z9.d, z6.d, z7.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + 
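// Illustrative sketch, not part of the patch: the ld1rw/fmax/fmin sequence at label 5: in
// every case clamps each accumulator element with the minval/maxval chosen at the top of
// the function (Activation::None -> +/-infinity, ReLU -> [0, +inf), BoundedReLU ->
// [0, act.param1], BoundedReLU falling through to set minval = 0).  Per-element equivalent:
#include <algorithm>
static inline float apply_activation(float acc, float minval, float maxval) {
    return std::min(std::max(acc, minval), maxval);   // fmax with minval, then fmin with maxval
}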
".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "5:\n" + "ld1rw z14.s, p7/z, [%[minptr]]\n" + "ld1rw z15.s, p7/z, [%[maxptr]]\n" + "fmax z16.s, p7/m, z16.s, z14.s\n" + "fmax z17.s, p7/m, z17.s, z14.s\n" + "fmax z18.s, p7/m, z18.s, z14.s\n" + "fmax z19.s, p7/m, z19.s, z14.s\n" + "fmin z16.s, p7/m, z16.s, z15.s\n" + "fmin z17.s, p7/m, z17.s, z15.s\n" + "fmin z18.s, p7/m, z18.s, z15.s\n" + "fmin z19.s, p7/m, z19.s, z15.s\n" + "fmax z20.s, p7/m, z20.s, z14.s\n" + "uzp1 z0.s, z16.s, z17.s\n" + "uzp2 z1.s, z16.s, z17.s\n" + "uzp1 z2.s, z18.s, z19.s\n" + "uzp2 z3.s, z18.s, z19.s\n" + "st1w z0.s, p0, [%[c_ptr0]]\n" + "fmin z20.s, p7/m, z20.s, z15.s\n" + "fmax z21.s, p7/m, z21.s, z14.s\n" + "fmax z22.s, p7/m, z22.s, z14.s\n" + "st1w z1.s, p0, [c_ptr1]\n" + "fmax z23.s, p7/m, z23.s, z14.s\n" + "fmin z21.s, p7/m, z21.s, z15.s\n" + "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "fmin z22.s, p7/m, z22.s, z15.s\n" + "addvl %[c_ptr0], %[c_ptr0], #2\n" + "fmin z23.s, p7/m, z23.s, z15.s\n" + "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" + "uzp1 z4.s, z20.s, z21.s\n" + "uzp1 z5.s, z22.s, z23.s\n" + "st1w z4.s, p0, [c_ptr2]\n" + "st1w z5.s, p1, [c_ptr2, #1, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [width] "r" (width), [append] "r" (static_cast(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory" + ); + break; + case 4: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "cbnz %[append], 1f\n" + "ld1w z15.s, p0/z, [%[biasptr]]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip1 z16.s, z15.s, z15.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "zip2 z17.s, z15.s, z15.s\n" + "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "zip1 z18.s, z15.s, z15.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "zip2 z19.s, z15.s, z15.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "trn1 z9.d, z2.d, z3.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z20.d, z16.d\n" + "add a_ptr3, a_ptr3, #0x10\n" + "mov z21.d, z17.d\n" + "addvl 
%[b_ptr0], %[b_ptr0], #4\n" + "mov z22.d, z18.d\n" + "mov z23.d, z19.d\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ld1w z13.s, p0/z, [%[c_ptr0]]\n" + "ld1w z14.s, p0/z, [c_ptr1]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip1 z16.s, z13.s, z14.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "zip2 z17.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr3, #0x10\n" + "zip1 z18.s, z13.s, z14.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "zip2 z19.s, z13.s, z14.s\n" + "ld1w z13.s, p0/z, [c_ptr2]\n" + "trn1 z9.d, z2.d, z3.d\n" + "ld1w z14.s, p0/z, [c_ptr3]\n" + "zip1 z20.s, z13.s, z14.s\n" + "zip2 z21.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" + "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" + "zip1 z22.s, z13.s, z14.s\n" + "zip2 z23.s, z13.s, z14.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "cbz %[loops], 2f\n" + "3:\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "subs %[loops], %[loops], #0x1\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "trn2 z8.d, z4.d, z5.d\n" + "add a_ptr1, a_ptr1, #0x20\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "trn2 z9.d, z6.d, z7.d\n" + "add a_ptr2, a_ptr2, #0x20\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "add a_ptr3, a_ptr3, #0x20\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn1 z1.d, z6.d, z7.d\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + 
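// Illustrative sketch, not part of the patch: the wider row cases above alias scratch X
// registers ("a_ptr1 .req X0" and friends) and chain-add the byte strides passed in as
// %[lda]/%[ldc].  In C++ element terms, for a 4-row block starting at row y (helper name
// invented for illustration):
#include <cstddef>
static void setup_row_pointers(const float *A, float *C, int lda, int ldc, int y,
                               const float *a_row[4], float *c_row[4]) {
    a_row[0] = A + std::size_t(y) * lda;   // a_ptr0_base in the driver loop
    c_row[0] = C + std::size_t(y) * ldc;   // c_ptr0
    for (int r = 1; r < 4; ++r) {
        a_row[r] = a_row[r - 1] + lda;     // "add a_ptrN, a_ptr(N-1), %[lda]"
        c_row[r] = c_row[r - 1] + ldc;     // "add c_ptrN, c_ptr(N-1), %[ldc]"
    }
}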
"ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "trn1 z8.d, z0.d, z1.d\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "trn1 z9.d, z2.d, z3.d\n" + "b.ne 3b\n" + "2:\n" + "cbz %[regs], 4f\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "trn2 z8.d, z4.d, z5.d\n" + "addvl a_ptr2, a_ptr2, #2\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "trn2 z9.d, z6.d, z7.d\n" + "addvl a_ptr3, a_ptr3, #2\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn1 z1.d, z6.d, z7.d\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "addvl %[b_ptr0], %[b_ptr0], #-4\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl %[a_ptr0], %[a_ptr0], #2\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "addvl 
a_ptr1, a_ptr1, #2\n" + "trn1 z8.d, z0.d, z1.d\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "trn1 z9.d, z2.d, z3.d\n" + "cbz %[blocks], 5f\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "b 5f\n" + "4:\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z6.s, p6/z, [a_ptr2]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1rqw z7.s, p6/z, [a_ptr3]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "addvl a_ptr1, a_ptr1, #1\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "addvl a_ptr2, a_ptr2, #1\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "addvl a_ptr3, a_ptr3, #1\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "trn1 z1.d, z6.d, z7.d\n" + "cbz %[blocks], 5f\n" + "trn2 z9.d, z6.d, z7.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 
0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "5:\n" + "ld1rw z14.s, p7/z, [%[minptr]]\n" + "ld1rw z15.s, p7/z, [%[maxptr]]\n" + "fmax z16.s, p7/m, z16.s, z14.s\n" + "fmax z17.s, p7/m, z17.s, z14.s\n" + "fmax z18.s, p7/m, z18.s, z14.s\n" + "fmax z19.s, p7/m, z19.s, z14.s\n" + "fmin z16.s, p7/m, z16.s, z15.s\n" + "fmin z17.s, p7/m, z17.s, z15.s\n" + "fmin z18.s, p7/m, z18.s, z15.s\n" + "fmin z19.s, p7/m, z19.s, z15.s\n" + "fmax z20.s, p7/m, z20.s, z14.s\n" + "uzp1 z0.s, z16.s, z17.s\n" + "uzp2 z1.s, z16.s, z17.s\n" + "uzp1 z2.s, z18.s, z19.s\n" + "uzp2 z3.s, z18.s, z19.s\n" + "st1w z0.s, p0, [%[c_ptr0]]\n" + "fmin z20.s, p7/m, z20.s, z15.s\n" + "fmax z21.s, p7/m, z21.s, z14.s\n" + "fmax z22.s, p7/m, z22.s, z14.s\n" + "st1w z1.s, p0, [c_ptr1]\n" + "fmax z23.s, p7/m, z23.s, z14.s\n" + "fmin z21.s, p7/m, z21.s, z15.s\n" + "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "fmin z22.s, p7/m, z22.s, z15.s\n" + "addvl %[c_ptr0], %[c_ptr0], #2\n" + "fmin z23.s, p7/m, z23.s, z15.s\n" + "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" + "uzp1 z4.s, z20.s, z21.s\n" + "uzp2 z5.s, z20.s, z21.s\n" + "uzp1 z6.s, z22.s, z23.s\n" + "st1w z4.s, p0, [c_ptr2]\n" + "uzp2 z7.s, z22.s, z23.s\n" + "st1w z5.s, p0, [c_ptr3]\n" + "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" + "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [width] "r" (width), [append] "r" (static_cast(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" + ); + break; + case 5: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "c_ptr1 .req X4\n" + "c_ptr2 .req X5\n" + "c_ptr3 .req X6\n" + "c_ptr4 .req X7\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "cbnz %[append], 1f\n" + "mov z5.s, #0\n" + "ld1w z15.s, p0/z, [%[biasptr]]\n" + "ld1rqw z0.s, p7/z, 
[%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip1 z16.s, z15.s, z15.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "zip2 z17.s, z15.s, z15.s\n" + "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "zip1 z18.s, z15.s, z15.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "zip2 z19.s, z15.s, z15.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "trn1 z9.d, z2.d, z3.d\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "trn1 z10.d, z4.d, z5.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z20.d, z16.d\n" + "add a_ptr3, a_ptr3, #0x10\n" + "mov z21.d, z17.d\n" + "add a_ptr4, a_ptr4, #0x10\n" + "mov z22.d, z18.d\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "mov z23.d, z19.d\n" + "mov z24.d, z16.d\n" + "mov z25.d, z17.d\n" + "mov z26.d, z18.d\n" + "mov z27.d, z19.d\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "mov z5.s, #0\n" + "ld1w z13.s, p0/z, [%[c_ptr0]]\n" + "ld1w z14.s, p0/z, [c_ptr1]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip1 z16.s, z13.s, z14.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "zip2 z17.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "add a_ptr3, a_ptr3, #0x10\n" + "zip1 z18.s, z13.s, z14.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "zip2 z19.s, z13.s, z14.s\n" + "ld1w z13.s, p0/z, [c_ptr2]\n" + "trn1 z9.d, z2.d, z3.d\n" + "ld1w z14.s, p0/z, [c_ptr3]\n" + "trn1 z10.d, z4.d, z5.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "add a_ptr4, a_ptr4, #0x10\n" + "zip1 z20.s, z13.s, z14.s\n" + "zip2 z21.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" + "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" + "zip1 z22.s, z13.s, z14.s\n" + "zip2 z23.s, z13.s, z14.s\n" + "ld1w z13.s, p0/z, [c_ptr4]\n" + "mov z14.s, #0\n" + "zip1 z24.s, z13.s, z14.s\n" + "zip2 z25.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n" + "mov z14.s, #0\n" + "zip1 z26.s, z13.s, z14.s\n" + "zip2 z27.s, z13.s, z14.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "cbz %[loops], 2f\n" + "3:\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "subs %[loops], %[loops], #0x1\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1rqw z8.s, p7/z, [a_ptr4]\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "add a_ptr1, a_ptr1, #0x20\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "add a_ptr2, a_ptr2, #0x20\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "add a_ptr3, a_ptr3, #0x20\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + "ld1w z13.s, 
p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z9.s, #0\n" + "add a_ptr4, a_ptr4, #0x20\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "trn2 z10.d, z8.d, z9.d\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "trn1 z1.d, z6.d, z7.d\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn1 z2.d, z8.d, z9.d\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "trn2 z9.d, z6.d, z7.d\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "mov z5.s, #0\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "trn1 z8.d, z0.d, z1.d\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "trn1 z9.d, z2.d, z3.d\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "trn1 z10.d, z4.d, z5.d\n" + "b.ne 3b\n" + "2:\n" + "cbz %[regs], 4f\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, 
z8.s\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl a_ptr3, a_ptr3, #2\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1rqw z8.s, p7/z, [a_ptr4]\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z9.s, #0\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "trn2 z10.d, z8.d, z9.d\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "trn1 z1.d, z6.d, z7.d\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn1 z2.d, z8.d, z9.d\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "trn2 z9.d, z6.d, z7.d\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "addvl a_ptr4, a_ptr4, #2\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + "addvl %[a_ptr0], %[a_ptr0], #2\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "mov z5.s, #0\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "addvl %[b_ptr0], %[b_ptr0], #-4\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "addvl a_ptr1, a_ptr1, #2\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl a_ptr2, a_ptr2, #2\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "trn1 z8.d, z0.d, z1.d\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "trn1 z9.d, z2.d, z3.d\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, 
z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "trn1 z10.d, z4.d, z5.d\n" + "cbz %[blocks], 5f\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "b 5f\n" + "4:\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z6.s, p6/z, [a_ptr2]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1rqw z7.s, p6/z, [a_ptr3]\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl a_ptr1, a_ptr1, #1\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1rqw z8.s, p6/z, [a_ptr4]\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "addvl a_ptr2, a_ptr2, #1\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "addvl a_ptr3, a_ptr3, #1\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "addvl a_ptr4, a_ptr4, #1\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z9.s, #0\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 
// fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "trn1 z1.d, z6.d, z7.d\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "trn1 z2.d, z8.d, z9.d\n" + "cbz %[blocks], 5f\n" + "trn2 z10.d, z8.d, z9.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "trn2 z9.d, z6.d, z7.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "5:\n" + "ld1rw z14.s, p7/z, [%[minptr]]\n" + "ld1rw z15.s, p7/z, [%[maxptr]]\n" + "fmax z16.s, p7/m, z16.s, z14.s\n" + "fmax z17.s, p7/m, z17.s, z14.s\n" + "fmax z18.s, p7/m, z18.s, z14.s\n" + "fmax z19.s, p7/m, z19.s, z14.s\n" + "fmin z16.s, p7/m, z16.s, z15.s\n" + "fmin z17.s, p7/m, z17.s, z15.s\n" + "fmin z18.s, p7/m, z18.s, z15.s\n" + "fmin z19.s, p7/m, z19.s, z15.s\n" + "fmax z20.s, p7/m, z20.s, z14.s\n" + "uzp1 z0.s, z16.s, z17.s\n" + "uzp2 z1.s, z16.s, z17.s\n" + "uzp1 z2.s, z18.s, z19.s\n" + "uzp2 z3.s, z18.s, z19.s\n" + "st1w z0.s, p0, [%[c_ptr0]]\n" + "fmin z20.s, p7/m, z20.s, z15.s\n" + "fmax z21.s, p7/m, z21.s, z14.s\n" + "fmax z22.s, p7/m, z22.s, z14.s\n" + "st1w z1.s, p0, [c_ptr1]\n" + "fmax z23.s, p7/m, z23.s, z14.s\n" + "fmax z24.s, p7/m, z24.s, z14.s\n" + "fmin z21.s, p7/m, z21.s, z15.s\n" + "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "fmin z22.s, p7/m, z22.s, z15.s\n" + "addvl %[c_ptr0], %[c_ptr0], #2\n" + "fmin z23.s, p7/m, z23.s, z15.s\n" + "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" + "uzp1 z4.s, z20.s, z21.s\n" + "uzp2 z5.s, z20.s, z21.s\n" + "fmin z24.s, p7/m, z24.s, z15.s\n" + "uzp1 z6.s, z22.s, z23.s\n" + "st1w z4.s, p0, [c_ptr2]\n" + "uzp2 z7.s, z22.s, z23.s\n" + "fmax z25.s, p7/m, z25.s, z14.s\n" + "fmax z26.s, p7/m, z26.s, z14.s\n" + "st1w z5.s, p0, [c_ptr3]\n" + "fmax z27.s, p7/m, z27.s, z14.s\n" + "fmin z25.s, p7/m, z25.s, z15.s\n" + "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" + "fmin z26.s, p7/m, 
z26.s, z15.s\n" + "fmin z27.s, p7/m, z27.s, z15.s\n" + "uzp1 z8.s, z24.s, z25.s\n" + "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" + "uzp1 z9.s, z26.s, z27.s\n" + "st1w z8.s, p0, [c_ptr4]\n" + "st1w z9.s, p1, [c_ptr4, #1, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [width] "r" (width), [append] "r" (static_cast(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory" + ); + break; + case 6: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "c_ptr1 .req X5\n" + "c_ptr2 .req X6\n" + "c_ptr3 .req X7\n" + "c_ptr4 .req X8\n" + "c_ptr5 .req X9\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "cbnz %[append], 1f\n" + "ld1w z15.s, p0/z, [%[biasptr]]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip1 z16.s, z15.s, z15.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "zip2 z17.s, z15.s, z15.s\n" + "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "zip1 z18.s, z15.s, z15.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "zip2 z19.s, z15.s, z15.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "trn1 z9.d, z2.d, z3.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "mov z20.d, z16.d\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "trn1 z10.d, z4.d, z5.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z21.d, z17.d\n" + "add a_ptr3, a_ptr3, #0x10\n" + "mov z22.d, z18.d\n" + "add a_ptr4, a_ptr4, #0x10\n" + "mov z23.d, z19.d\n" + "add a_ptr5, a_ptr5, #0x10\n" + "mov z24.d, z16.d\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "mov z25.d, z17.d\n" + "mov z26.d, z18.d\n" + "mov z27.d, z19.d\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ld1w z13.s, p0/z, [%[c_ptr0]]\n" + "ld1w z14.s, p0/z, [c_ptr1]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip1 z16.s, z13.s, z14.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "zip2 z17.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "add a_ptr3, a_ptr3, #0x10\n" + "zip1 z18.s, z13.s, z14.s\n" + "ld1rqw 
z5.s, p7/z, [a_ptr5]\n" + "zip2 z19.s, z13.s, z14.s\n" + "ld1w z13.s, p0/z, [c_ptr2]\n" + "trn1 z9.d, z2.d, z3.d\n" + "ld1w z14.s, p0/z, [c_ptr3]\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr4, a_ptr4, #0x10\n" + "trn1 z10.d, z4.d, z5.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "zip1 z20.s, z13.s, z14.s\n" + "add a_ptr5, a_ptr5, #0x10\n" + "zip2 z21.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" + "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" + "zip1 z22.s, z13.s, z14.s\n" + "zip2 z23.s, z13.s, z14.s\n" + "ld1w z13.s, p0/z, [c_ptr4]\n" + "ld1w z14.s, p0/z, [c_ptr5]\n" + "zip1 z24.s, z13.s, z14.s\n" + "zip2 z25.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n" + "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n" + "zip1 z26.s, z13.s, z14.s\n" + "zip2 z27.s, z13.s, z14.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "cbz %[loops], 2f\n" + "3:\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "subs %[loops], %[loops], #0x1\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1rqw z8.s, p7/z, [a_ptr4]\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "add a_ptr1, a_ptr1, #0x20\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "add a_ptr2, a_ptr2, #0x20\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "add a_ptr3, a_ptr3, #0x20\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1rqw z9.s, p7/z, [a_ptr5]\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "trn2 z10.d, z8.d, z9.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "add a_ptr4, a_ptr4, #0x20\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "add a_ptr5, a_ptr5, #0x20\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "trn1 z1.d, z6.d, z7.d\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn1 z2.d, z8.d, z9.d\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "trn2 z9.d, z6.d, z7.d\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #-0x10]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 
0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "trn1 z8.d, z0.d, z1.d\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "trn1 z9.d, z2.d, z3.d\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "trn1 z10.d, z4.d, z5.d\n" + "b.ne 3b\n" + "2:\n" + "cbz %[regs], 4f\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl a_ptr3, a_ptr3, #2\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1rqw z8.s, p7/z, [a_ptr4]\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1rqw z9.s, p7/z, [a_ptr5]\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "trn2 z10.d, z8.d, z9.d\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "trn1 z1.d, z6.d, z7.d\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a2e5b9 // fmmla z25.s, 
z13.s, z2.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn1 z2.d, z8.d, z9.d\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "trn2 z9.d, z6.d, z7.d\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "addvl a_ptr4, a_ptr4, #2\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "addvl a_ptr5, a_ptr5, #2\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + "addvl %[a_ptr0], %[a_ptr0], #2\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "addvl %[b_ptr0], %[b_ptr0], #-4\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl a_ptr1, a_ptr1, #2\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "addvl a_ptr2, a_ptr2, #2\n" + "trn1 z8.d, z0.d, z1.d\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "trn1 z9.d, z2.d, z3.d\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "trn1 z10.d, z4.d, z5.d\n" + "cbz %[blocks], 5f\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + 
".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "b 5f\n" + "4:\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z6.s, p6/z, [a_ptr2]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1rqw z7.s, p6/z, [a_ptr3]\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl a_ptr1, a_ptr1, #1\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1rqw z8.s, p6/z, [a_ptr4]\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "addvl a_ptr2, a_ptr2, #1\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "addvl a_ptr3, a_ptr3, #1\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "addvl a_ptr4, a_ptr4, #1\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1rqw z9.s, p6/z, [a_ptr5]\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "addvl a_ptr5, a_ptr5, #1\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "trn1 z1.d, z6.d, z7.d\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "trn1 z2.d, z8.d, z9.d\n" + "cbz %[blocks], 5f\n" + "trn2 z10.d, z8.d, z9.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "trn2 z9.d, z6.d, z7.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w 
z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "5:\n" + "ld1rw z14.s, p7/z, [%[minptr]]\n" + "ld1rw z15.s, p7/z, [%[maxptr]]\n" + "fmax z16.s, p7/m, z16.s, z14.s\n" + "fmax z17.s, p7/m, z17.s, z14.s\n" + "fmax z18.s, p7/m, z18.s, z14.s\n" + "fmax z19.s, p7/m, z19.s, z14.s\n" + "fmin z16.s, p7/m, z16.s, z15.s\n" + "fmin z17.s, p7/m, z17.s, z15.s\n" + "fmin z18.s, p7/m, z18.s, z15.s\n" + "fmin z19.s, p7/m, z19.s, z15.s\n" + "fmax z20.s, p7/m, z20.s, z14.s\n" + "uzp1 z0.s, z16.s, z17.s\n" + "uzp2 z1.s, z16.s, z17.s\n" + "uzp1 z2.s, z18.s, z19.s\n" + "uzp2 z3.s, z18.s, z19.s\n" + "st1w z0.s, p0, [%[c_ptr0]]\n" + "fmin z20.s, p7/m, z20.s, z15.s\n" + "fmax z21.s, p7/m, z21.s, z14.s\n" + "fmax z22.s, p7/m, z22.s, z14.s\n" + "st1w z1.s, p0, [c_ptr1]\n" + "fmax z23.s, p7/m, z23.s, z14.s\n" + "fmax z24.s, p7/m, z24.s, z14.s\n" + "fmin z21.s, p7/m, z21.s, z15.s\n" + "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "fmin z22.s, p7/m, z22.s, z15.s\n" + "addvl %[c_ptr0], %[c_ptr0], #2\n" + "fmin z23.s, p7/m, z23.s, z15.s\n" + "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" + "uzp1 z4.s, z20.s, z21.s\n" + "uzp2 z5.s, z20.s, z21.s\n" + "fmin z24.s, p7/m, z24.s, z15.s\n" + "uzp1 z6.s, z22.s, z23.s\n" + "st1w z4.s, p0, [c_ptr2]\n" + "uzp2 z7.s, z22.s, z23.s\n" + "fmax z25.s, p7/m, z25.s, z14.s\n" + "fmax z26.s, p7/m, z26.s, z14.s\n" + "st1w z5.s, p0, [c_ptr3]\n" + "fmax z27.s, p7/m, z27.s, z14.s\n" + "fmin z25.s, p7/m, z25.s, z15.s\n" + "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" + "fmin z26.s, p7/m, z26.s, z15.s\n" + "fmin z27.s, p7/m, z27.s, z15.s\n" + "uzp1 z8.s, z24.s, z25.s\n" + "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" + "uzp2 z9.s, z24.s, z25.s\n" + "uzp1 z10.s, z26.s, z27.s\n" + "uzp2 z11.s, z26.s, z27.s\n" + "st1w z8.s, p0, [c_ptr4]\n" + "st1w z9.s, p0, [c_ptr5]\n" + "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n" + "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [width] "r" (width), [append] "r" (static_cast(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory" + ); + break; + case 7: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 
.req X4\n" + "a_ptr6 .req X5\n" + "c_ptr1 .req X6\n" + "c_ptr2 .req X7\n" + "c_ptr3 .req X8\n" + "c_ptr4 .req X9\n" + "c_ptr5 .req X10\n" + "c_ptr6 .req X11\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "cbnz %[append], 1f\n" + "mov z7.s, #0\n" + "ld1w z15.s, p0/z, [%[biasptr]]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip1 z16.s, z15.s, z15.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "zip2 z17.s, z15.s, z15.s\n" + "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "zip1 z18.s, z15.s, z15.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "zip2 z19.s, z15.s, z15.s\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "trn1 z9.d, z2.d, z3.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "mov z20.d, z16.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "trn1 z10.d, z4.d, z5.d\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "trn1 z11.d, z6.d, z7.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z21.d, z17.d\n" + "add a_ptr3, a_ptr3, #0x10\n" + "mov z22.d, z18.d\n" + "add a_ptr4, a_ptr4, #0x10\n" + "mov z23.d, z19.d\n" + "add a_ptr5, a_ptr5, #0x10\n" + "mov z24.d, z16.d\n" + "add a_ptr6, a_ptr6, #0x10\n" + "mov z25.d, z17.d\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "mov z26.d, z18.d\n" + "mov z27.d, z19.d\n" + "mov z28.d, z16.d\n" + "mov z29.d, z17.d\n" + "mov z30.d, z18.d\n" + "mov z31.d, z19.d\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "mov z7.s, #0\n" + "ld1w z13.s, p0/z, [%[c_ptr0]]\n" + "ld1w z14.s, p0/z, [c_ptr1]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip1 z16.s, z13.s, z14.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "zip2 z17.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "add a_ptr3, a_ptr3, #0x10\n" + "zip1 z18.s, z13.s, z14.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "zip2 z19.s, z13.s, z14.s\n" + "ld1w z13.s, p0/z, [c_ptr2]\n" + "trn1 z9.d, z2.d, z3.d\n" + "ld1w z14.s, p0/z, [c_ptr3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "add a_ptr4, a_ptr4, #0x10\n" + "trn1 z10.d, z4.d, z5.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "zip1 z20.s, z13.s, z14.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "zip2 z21.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" + "trn1 z11.d, z6.d, z7.d\n" + "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" + "add a_ptr5, a_ptr5, #0x10\n" + "add a_ptr6, a_ptr6, #0x10\n" + "zip1 z22.s, z13.s, z14.s\n" + "zip2 z23.s, z13.s, z14.s\n" + "ld1w z13.s, p0/z, [c_ptr4]\n" + "ld1w z14.s, p0/z, [c_ptr5]\n" + "zip1 z24.s, z13.s, z14.s\n" + "zip2 z25.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n" + "ld1w z14.s, p1/z, 
[c_ptr5, #1, MUL VL]\n" + "zip1 z26.s, z13.s, z14.s\n" + "zip2 z27.s, z13.s, z14.s\n" + "ld1w z13.s, p0/z, [c_ptr6]\n" + "mov z14.s, #0\n" + "zip1 z28.s, z13.s, z14.s\n" + "zip2 z29.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [c_ptr6, #1, MUL VL]\n" + "mov z14.s, #0\n" + "zip1 z30.s, z13.s, z14.s\n" + "zip2 z31.s, z13.s, z14.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "cbz %[loops], 2f\n" + "3:\n" + "trn2 z0.d, z0.d, z1.d\n" + "subs %[loops], %[loops], #0x1\n" + "trn2 z1.d, z2.d, z3.d\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "trn2 z3.d, z6.d, z7.d\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1rqw z8.s, p7/z, [a_ptr4]\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "add a_ptr1, a_ptr1, #0x20\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "add a_ptr2, a_ptr2, #0x20\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "add a_ptr3, a_ptr3, #0x20\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1rqw z9.s, p7/z, [a_ptr5]\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + "add a_ptr4, a_ptr4, #0x20\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + "add a_ptr5, a_ptr5, #0x20\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "ld1rqw z10.s, p7/z, [a_ptr6]\n" + ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z11.s, #0\n" + "add a_ptr6, a_ptr6, #0x20\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "trn1 z1.d, z6.d, z7.d\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "trn1 z2.d, z8.d, z9.d\n" + ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn1 z3.d, z10.d, z11.d\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "trn2 z11.d, z10.d, z11.d\n" + "trn2 z10.d, z8.d, z9.d\n" + "trn2 z9.d, z6.d, z7.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #-0x10]\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + 
"ld1rqw z5.s, p7/z, [a_ptr5, #-0x10]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" + ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "mov z7.s, #0\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "trn1 z8.d, z0.d, z1.d\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "trn1 z9.d, z2.d, z3.d\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "trn1 z10.d, z4.d, z5.d\n" + ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "trn1 z11.d, z6.d, z7.d\n" + "b.ne 3b\n" + "2:\n" + "cbz %[regs], 4f\n" + "trn2 z0.d, z0.d, z1.d\n" + "trn2 z1.d, z2.d, z3.d\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "trn2 z3.d, z6.d, z7.d\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1rqw z8.s, p7/z, [a_ptr4]\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1rqw z9.s, p7/z, [a_ptr5]\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "ld1rqw z10.s, p7/z, [a_ptr6]\n" + ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64abe5de // 
fmmla z30.s, z14.s, z11.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z11.s, #0\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "trn1 z1.d, z6.d, z7.d\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "trn1 z2.d, z8.d, z9.d\n" + ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn1 z3.d, z10.d, z11.d\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "trn2 z11.d, z10.d, z11.d\n" + "trn2 z10.d, z8.d, z9.d\n" + "trn2 z9.d, z6.d, z7.d\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "addvl a_ptr4, a_ptr4, #2\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "addvl a_ptr5, a_ptr5, #2\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + "addvl %[a_ptr0], %[a_ptr0], #2\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + "addvl a_ptr6, a_ptr6, #2\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + "addvl a_ptr1, a_ptr1, #2\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "mov z7.s, #0\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "addvl %[b_ptr0], %[b_ptr0], #-4\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "addvl a_ptr2, a_ptr2, #2\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl a_ptr3, a_ptr3, #2\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "trn1 z8.d, z0.d, z1.d\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "trn1 z9.d, z2.d, z3.d\n" + ".inst 0x64aae598 // fmmla z24.s, 
z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "trn1 z10.d, z4.d, z5.d\n" + ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" + ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" + ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" + ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" + "trn1 z11.d, z6.d, z7.d\n" + "cbz %[blocks], 5f\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "trn2 z3.d, z6.d, z7.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "subs %[blocks], %[blocks], #0x1\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" + ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" + ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" + ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" + ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" + ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" + ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" + "b 5f\n" + "4:\n" + "trn2 z0.d, z0.d, z1.d\n" + "trn2 z1.d, z2.d, z3.d\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "trn2 z3.d, z6.d, z7.d\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z6.s, p6/z, [a_ptr2]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1rqw z7.s, p6/z, [a_ptr3]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1rqw z8.s, p6/z, [a_ptr4]\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "addvl a_ptr1, a_ptr1, #1\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "addvl a_ptr2, a_ptr2, #1\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "addvl a_ptr3, a_ptr3, #1\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1rqw z9.s, p6/z, [a_ptr5]\n" + ".inst 0x64aae598 // fmmla z24.s, 
z12.s, z10.s\n" + "addvl a_ptr4, a_ptr4, #1\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + "addvl a_ptr5, a_ptr5, #1\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "ld1rqw z10.s, p6/z, [a_ptr6]\n" + ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z11.s, #0\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "addvl a_ptr6, a_ptr6, #1\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "trn1 z1.d, z6.d, z7.d\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "trn1 z2.d, z8.d, z9.d\n" + ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" + ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" + ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" + ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" + "trn1 z3.d, z10.d, z11.d\n" + "cbz %[blocks], 5f\n" + "trn2 z11.d, z10.d, z11.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "trn2 z10.d, z8.d, z9.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "trn2 z9.d, z6.d, z7.d\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "subs %[blocks], %[blocks], #0x1\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" + ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" + ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" + ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, 
z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" + ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" + ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" + ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" + "5:\n" + "ld1rw z14.s, p7/z, [%[minptr]]\n" + "ld1rw z15.s, p7/z, [%[maxptr]]\n" + "fmax z16.s, p7/m, z16.s, z14.s\n" + "fmax z17.s, p7/m, z17.s, z14.s\n" + "fmax z18.s, p7/m, z18.s, z14.s\n" + "fmax z19.s, p7/m, z19.s, z14.s\n" + "fmin z16.s, p7/m, z16.s, z15.s\n" + "fmin z17.s, p7/m, z17.s, z15.s\n" + "fmin z18.s, p7/m, z18.s, z15.s\n" + "fmin z19.s, p7/m, z19.s, z15.s\n" + "fmax z20.s, p7/m, z20.s, z14.s\n" + "uzp1 z0.s, z16.s, z17.s\n" + "uzp2 z1.s, z16.s, z17.s\n" + "uzp1 z2.s, z18.s, z19.s\n" + "uzp2 z3.s, z18.s, z19.s\n" + "st1w z0.s, p0, [%[c_ptr0]]\n" + "fmin z20.s, p7/m, z20.s, z15.s\n" + "fmax z21.s, p7/m, z21.s, z14.s\n" + "fmax z22.s, p7/m, z22.s, z14.s\n" + "st1w z1.s, p0, [c_ptr1]\n" + "fmax z23.s, p7/m, z23.s, z14.s\n" + "fmax z24.s, p7/m, z24.s, z14.s\n" + "fmin z21.s, p7/m, z21.s, z15.s\n" + "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "fmin z22.s, p7/m, z22.s, z15.s\n" + "addvl %[c_ptr0], %[c_ptr0], #2\n" + "fmin z23.s, p7/m, z23.s, z15.s\n" + "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" + "uzp1 z4.s, z20.s, z21.s\n" + "uzp2 z5.s, z20.s, z21.s\n" + "fmin z24.s, p7/m, z24.s, z15.s\n" + "uzp1 z6.s, z22.s, z23.s\n" + "st1w z4.s, p0, [c_ptr2]\n" + "uzp2 z7.s, z22.s, z23.s\n" + "fmax z25.s, p7/m, z25.s, z14.s\n" + "fmax z26.s, p7/m, z26.s, z14.s\n" + "st1w z5.s, p0, [c_ptr3]\n" + "fmax z27.s, p7/m, z27.s, z14.s\n" + "fmax z28.s, p7/m, z28.s, z14.s\n" + "fmin z25.s, p7/m, z25.s, z15.s\n" + "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" + "fmin z26.s, p7/m, z26.s, z15.s\n" + "fmin z27.s, p7/m, z27.s, z15.s\n" + "fmin z28.s, p7/m, z28.s, z15.s\n" + "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" + "uzp1 z8.s, z24.s, z25.s\n" + "uzp2 z9.s, z24.s, z25.s\n" + "uzp1 z10.s, z26.s, z27.s\n" + "uzp2 z11.s, z26.s, z27.s\n" + "st1w z8.s, p0, [c_ptr4]\n" + "fmax z29.s, p7/m, z29.s, z14.s\n" + "fmax z30.s, p7/m, z30.s, z14.s\n" + "fmax z31.s, p7/m, z31.s, z14.s\n" + "st1w z9.s, p0, [c_ptr5]\n" + "fmin z29.s, p7/m, z29.s, z15.s\n" + "fmin z30.s, p7/m, z30.s, z15.s\n" + "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n" + "fmin z31.s, p7/m, z31.s, z15.s\n" + "uzp1 z12.s, z28.s, z29.s\n" + "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n" + "uzp1 z13.s, z30.s, z31.s\n" + "st1w z12.s, p0, [c_ptr6]\n" + "st1w z13.s, p1, [c_ptr6, #1, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [width] "r" (width), [append] "r" (static_cast(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "cc", "memory" + ); + break; + default: + case 8: + __asm __volatile ( + "a_ptr1 .req X0\n" + 
"a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "cbnz %[append], 1f\n" + "ld1w z15.s, p0/z, [%[biasptr]]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip1 z16.s, z15.s, z15.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "zip2 z17.s, z15.s, z15.s\n" + "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "zip1 z18.s, z15.s, z15.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "zip2 z19.s, z15.s, z15.s\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "trn1 z9.d, z2.d, z3.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "mov z20.d, z16.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "trn1 z10.d, z4.d, z5.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "mov z21.d, z17.d\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "trn1 z11.d, z6.d, z7.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z22.d, z18.d\n" + "add a_ptr3, a_ptr3, #0x10\n" + "mov z23.d, z19.d\n" + "add a_ptr4, a_ptr4, #0x10\n" + "mov z24.d, z16.d\n" + "add a_ptr5, a_ptr5, #0x10\n" + "mov z25.d, z17.d\n" + "add a_ptr6, a_ptr6, #0x10\n" + "mov z26.d, z18.d\n" + "add a_ptr7, a_ptr7, #0x10\n" + "mov z27.d, z19.d\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "mov z28.d, z16.d\n" + "mov z29.d, z17.d\n" + "mov z30.d, z18.d\n" + "mov z31.d, z19.d\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ld1w z13.s, p0/z, [%[c_ptr0]]\n" + "ld1w z14.s, p0/z, [c_ptr1]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip1 z16.s, z13.s, z14.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "zip2 z17.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "add a_ptr3, a_ptr3, #0x10\n" + "zip1 z18.s, z13.s, z14.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "zip2 z19.s, z13.s, z14.s\n" + "ld1w z13.s, p0/z, [c_ptr2]\n" + "trn1 z9.d, z2.d, z3.d\n" + "ld1w z14.s, p0/z, [c_ptr3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "add a_ptr4, a_ptr4, #0x10\n" + "trn1 z10.d, z4.d, z5.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "zip1 z20.s, z13.s, z14.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "zip2 z21.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" + "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" + "add a_ptr5, a_ptr5, #0x10\n" + "trn1 z11.d, z6.d, z7.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "add a_ptr6, a_ptr6, #0x10\n" + "zip1 z22.s, 
z13.s, z14.s\n" + "add a_ptr7, a_ptr7, #0x10\n" + "zip2 z23.s, z13.s, z14.s\n" + "ld1w z13.s, p0/z, [c_ptr4]\n" + "ld1w z14.s, p0/z, [c_ptr5]\n" + "zip1 z24.s, z13.s, z14.s\n" + "zip2 z25.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n" + "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n" + "zip1 z26.s, z13.s, z14.s\n" + "zip2 z27.s, z13.s, z14.s\n" + "ld1w z13.s, p0/z, [c_ptr6]\n" + "ld1w z14.s, p0/z, [c_ptr7]\n" + "zip1 z28.s, z13.s, z14.s\n" + "zip2 z29.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [c_ptr6, #1, MUL VL]\n" + "ld1w z14.s, p1/z, [c_ptr7, #1, MUL VL]\n" + "zip1 z30.s, z13.s, z14.s\n" + "zip2 z31.s, z13.s, z14.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "cbz %[loops], 2f\n" + "3:\n" + "trn2 z0.d, z0.d, z1.d\n" + "subs %[loops], %[loops], #0x1\n" + "trn2 z1.d, z2.d, z3.d\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "trn2 z3.d, z6.d, z7.d\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1rqw z8.s, p7/z, [a_ptr4]\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "add a_ptr1, a_ptr1, #0x20\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "add a_ptr2, a_ptr2, #0x20\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "add a_ptr3, a_ptr3, #0x20\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1rqw z9.s, p7/z, [a_ptr5]\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + "add a_ptr4, a_ptr4, #0x20\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + "add a_ptr5, a_ptr5, #0x20\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "ld1rqw z10.s, p7/z, [a_ptr6]\n" + ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1rqw z11.s, p7/z, [a_ptr7]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "add a_ptr6, a_ptr6, #0x20\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "add a_ptr7, a_ptr7, #0x20\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "trn1 z1.d, z6.d, z7.d\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "trn1 z2.d, z8.d, z9.d\n" + ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" + "ld1w z15.s, 
p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn1 z3.d, z10.d, z11.d\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "trn2 z11.d, z10.d, z11.d\n" + "trn2 z10.d, z8.d, z9.d\n" + "trn2 z9.d, z6.d, z7.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #-0x10]\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #-0x10]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #-0x10]\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" + ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "trn1 z8.d, z0.d, z1.d\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "trn1 z9.d, z2.d, z3.d\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "trn1 z10.d, z4.d, z5.d\n" + ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "trn1 z11.d, z6.d, z7.d\n" + "b.ne 3b\n" + "2:\n" + "cbz %[regs], 4f\n" + "trn2 z0.d, z0.d, z1.d\n" + "trn2 z1.d, z2.d, z3.d\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "trn2 z3.d, z6.d, z7.d\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1rqw z8.s, p7/z, [a_ptr4]\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1rqw z9.s, p7/z, [a_ptr5]\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla 
z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "ld1rqw z10.s, p7/z, [a_ptr6]\n" + ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1rqw z11.s, p7/z, [a_ptr7]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "trn1 z1.d, z6.d, z7.d\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "trn1 z2.d, z8.d, z9.d\n" + ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn1 z3.d, z10.d, z11.d\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "trn2 z11.d, z10.d, z11.d\n" + "trn2 z10.d, z8.d, z9.d\n" + "trn2 z9.d, z6.d, z7.d\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "addvl a_ptr4, a_ptr4, #2\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + "addvl %[a_ptr0], %[a_ptr0], #2\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + "addvl a_ptr5, a_ptr5, #2\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + "addvl a_ptr6, a_ptr6, #2\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + "addvl a_ptr1, a_ptr1, #2\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + "addvl a_ptr7, a_ptr7, #2\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "addvl %[b_ptr0], %[b_ptr0], #-4\n" + ".inst 
0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl a_ptr2, a_ptr2, #2\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "addvl a_ptr3, a_ptr3, #2\n" + "trn1 z8.d, z0.d, z1.d\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "trn1 z9.d, z2.d, z3.d\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "trn1 z10.d, z4.d, z5.d\n" + ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" + ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" + ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" + ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" + "trn1 z11.d, z6.d, z7.d\n" + "cbz %[blocks], 5f\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "trn2 z3.d, z6.d, z7.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "subs %[blocks], %[blocks], #0x1\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" + ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" + ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" + ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" + ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" + ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" + ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" + "b 5f\n" + "4:\n" + "trn2 z0.d, z0.d, z1.d\n" + "trn2 z1.d, z2.d, z3.d\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "trn2 z3.d, z6.d, z7.d\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z6.s, p6/z, [a_ptr2]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1rqw z7.s, p6/z, [a_ptr3]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" + 
".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1rqw z8.s, p6/z, [a_ptr4]\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "addvl a_ptr1, a_ptr1, #1\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "addvl a_ptr2, a_ptr2, #1\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "addvl a_ptr3, a_ptr3, #1\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1rqw z9.s, p6/z, [a_ptr5]\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + "addvl a_ptr4, a_ptr4, #1\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + "addvl a_ptr5, a_ptr5, #1\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "ld1rqw z10.s, p6/z, [a_ptr6]\n" + ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1rqw z11.s, p6/z, [a_ptr7]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "addvl a_ptr6, a_ptr6, #1\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "addvl a_ptr7, a_ptr7, #1\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "trn1 z1.d, z6.d, z7.d\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "trn1 z2.d, z8.d, z9.d\n" + ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" + ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" + ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" + ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" + "trn1 z3.d, z10.d, z11.d\n" + "cbz %[blocks], 5f\n" + "trn2 z11.d, z10.d, z11.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "trn2 z10.d, z8.d, z9.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "trn2 z9.d, z6.d, z7.d\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "subs %[blocks], %[blocks], #0x1\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" + ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" + ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" + ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + 
".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" + ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" + ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" + ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" + "5:\n" + "ld1rw z14.s, p7/z, [%[minptr]]\n" + "ld1rw z15.s, p7/z, [%[maxptr]]\n" + "fmax z16.s, p7/m, z16.s, z14.s\n" + "fmax z17.s, p7/m, z17.s, z14.s\n" + "fmax z18.s, p7/m, z18.s, z14.s\n" + "fmax z19.s, p7/m, z19.s, z14.s\n" + "fmin z16.s, p7/m, z16.s, z15.s\n" + "fmin z17.s, p7/m, z17.s, z15.s\n" + "fmin z18.s, p7/m, z18.s, z15.s\n" + "fmin z19.s, p7/m, z19.s, z15.s\n" + "fmax z20.s, p7/m, z20.s, z14.s\n" + "uzp1 z0.s, z16.s, z17.s\n" + "uzp2 z1.s, z16.s, z17.s\n" + "uzp1 z2.s, z18.s, z19.s\n" + "uzp2 z3.s, z18.s, z19.s\n" + "st1w z0.s, p0, [%[c_ptr0]]\n" + "fmin z20.s, p7/m, z20.s, z15.s\n" + "fmax z21.s, p7/m, z21.s, z14.s\n" + "fmax z22.s, p7/m, z22.s, z14.s\n" + "st1w z1.s, p0, [c_ptr1]\n" + "fmax z23.s, p7/m, z23.s, z14.s\n" + "fmax z24.s, p7/m, z24.s, z14.s\n" + "fmin z21.s, p7/m, z21.s, z15.s\n" + "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "fmin z22.s, p7/m, z22.s, z15.s\n" + "addvl %[c_ptr0], %[c_ptr0], #2\n" + "fmin z23.s, p7/m, z23.s, z15.s\n" + "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" + "uzp1 z4.s, z20.s, z21.s\n" + "uzp2 z5.s, z20.s, z21.s\n" + "fmin z24.s, p7/m, z24.s, z15.s\n" + "uzp1 z6.s, z22.s, z23.s\n" + "st1w z4.s, p0, [c_ptr2]\n" + "uzp2 z7.s, z22.s, z23.s\n" + "fmax z25.s, p7/m, z25.s, z14.s\n" + "fmax z26.s, p7/m, z26.s, z14.s\n" + "st1w z5.s, p0, [c_ptr3]\n" + "fmax z27.s, p7/m, z27.s, z14.s\n" + "fmax z28.s, p7/m, z28.s, z14.s\n" + "fmin z25.s, p7/m, z25.s, z15.s\n" + "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" + "fmin z26.s, p7/m, z26.s, z15.s\n" + "fmin z27.s, p7/m, z27.s, z15.s\n" + "fmin z28.s, p7/m, z28.s, z15.s\n" + "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" + "uzp1 z8.s, z24.s, z25.s\n" + "uzp2 z9.s, z24.s, z25.s\n" + "uzp1 z10.s, z26.s, z27.s\n" + "uzp2 z11.s, z26.s, z27.s\n" + "st1w z8.s, p0, [c_ptr4]\n" + "fmax z29.s, p7/m, z29.s, z14.s\n" + "fmax z30.s, p7/m, z30.s, z14.s\n" + "fmax z31.s, p7/m, z31.s, z14.s\n" + "st1w z9.s, p0, [c_ptr5]\n" + "fmin z29.s, p7/m, z29.s, z15.s\n" + "fmin z30.s, p7/m, z30.s, z15.s\n" + "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n" + "fmin z31.s, p7/m, z31.s, z15.s\n" + "uzp1 z12.s, z28.s, z29.s\n" + "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n" + "uzp2 z13.s, z28.s, z29.s\n" + "uzp1 z14.s, z30.s, z31.s\n" + "uzp2 z15.s, z30.s, z31.s\n" + "st1w z12.s, p0, [c_ptr6]\n" + "st1w z13.s, p0, [c_ptr7]\n" + "st1w z14.s, p1, [c_ptr6, #1, MUL VL]\n" + "st1w z15.s, p1, [c_ptr7, #1, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" 
(c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc", "memory" + ); + break; + } + + } + } +} + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp index d8422105cc..230a2cf19f 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -78,7 +78,10 @@ public: // Default to the generic kernel kern_type kernel=sve_hybrid_s8s32_dot_4VLx4; - hybrid_s8s32_dot_4VLx4(const CPUInfo *ci) { UNUSED(ci); } + hybrid_s8s32_dot_4VLx4(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp index aa3a764dec..46fc500476 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp @@ -32,7 +32,7 @@ namespace arm_gemm { -void sve_hybrid_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool append) { +void sve_hybrid_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation , bool append) { const int K_stride = ((K + 3) / 4) * 4; const long loops_count = ((K + 16) / 32) - 1; K -= loops_count * 32; @@ -41,12 +41,23 @@ void sve_hybrid_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int32 const long leftovers = K; const long blocks_count = (K + 3) / 4; - for (int y=0; y<M; y+=4) { + int rows_to_compute; + + for (int y=0; y<M; y+=rows_to_compute) { const int8_t * const a_ptr0_base = A + (y * lda); const unsigned long ldab = lda * sizeof(int8_t); int32_t *c_ptr0 = C + (y * ldc); + rows_to_compute = M-y; + if (rows_to_compute > 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0<N; x0+=(4 * get_vector_length<int32_t>())) { const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<int32_t>())); long loops = loops_count; @@ -57,7 +68,7 @@ void sve_hybrid_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int32 const int8_t *b_ptr0 = B + (K_stride * x0); const unsigned long ldcb = ldc * sizeof(int32_t); - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "whilelt p6.b, %[temp], %[leftovers]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp index 5dab1da135..f829fb0205 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited.
* * SPDX-License-Identifier: MIT * @@ -78,7 +78,10 @@ public: // Default to the generic kernel kern_type kernel=sve_hybrid_u8u32_dot_4VLx4; - hybrid_u8u32_dot_4VLx4(const CPUInfo *ci) { UNUSED(ci); } + hybrid_u8u32_dot_4VLx4(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp index 4fb7e825b5..13614700e3 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp @@ -32,7 +32,7 @@ namespace arm_gemm { -void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool append) { +void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool append) { const int K_stride = ((K + 3) / 4) * 4; const long loops_count = ((K + 16) / 32) - 1; K -= loops_count * 32; @@ -41,12 +41,23 @@ void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, uin const long leftovers = K; const long blocks_count = (K + 3) / 4; - for (int y=0; y<M; y+=4) { + int rows_to_compute; + + for (int y=0; y<M; y+=rows_to_compute) { const uint8_t * const a_ptr0_base = A + (y * lda); const unsigned long ldab = lda * sizeof(uint8_t); uint32_t *c_ptr0 = C + (y * ldc); + rows_to_compute = M-y; + if (rows_to_compute > 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0<N; x0+=(4 * get_vector_length<uint32_t>())) { const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<uint32_t>())); long loops = loops_count; @@ -57,7 +68,7 @@ void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, uin const uint8_t *b_ptr0 = B + (K_stride * x0); const unsigned long ldcb = ldc * sizeof(uint32_t); - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "whilelt p6.b, %[temp], %[leftovers]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp index a3434c1504..43107e45fa 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -61,7 +61,10 @@ public: kern_type kernel=sve_interleaved_bf16fp32_dot_3VLx8; - interleaved_bf16fp32_dot_3VLx8(const CPUInfo *ci) { UNUSED(ci); } + interleaved_bf16fp32_dot_3VLx8(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp index 65841581aa..7e20ed0971 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited.
* * SPDX-License-Identifier: MIT * @@ -61,13 +61,11 @@ void sve_interleaved_bf16fp32_dot_3VLx8(const bfloat16 *Apanel, const bfloat16 * "mov z15.s, #0\n" "ld1rqh z2.h, p0/z, [%[a_ptr], #0x20]\n" "mov z16.s, #0\n" - "ld1h z6.h, p0/z, [%[b_ptr], #2, MUL VL]\n" + "add %[a_ptr], %[a_ptr], #0x40\n" "mov z17.s, #0\n" - "ld1rqh z3.h, p0/z, [%[a_ptr], #0x30]\n" + "addvl %[b_ptr], %[b_ptr], #3\n" "mov z18.s, #0\n" - "add %[a_ptr], %[a_ptr], #0x40\n" "mov z19.s, #0\n" - "addvl %[b_ptr], %[b_ptr], #3\n" "mov z20.s, #0\n" "mov z21.s, #0\n" "mov z22.s, #0\n" @@ -83,9 +81,11 @@ void sve_interleaved_bf16fp32_dot_3VLx8(const bfloat16 *Apanel, const bfloat16 * "cbz %[loops], 1f\n" "2:\n" ".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n" - "subs %[loops], %[loops], #0x1\n" + "ld1h z6.h, p0/z, [%[b_ptr], #-1, MUL VL]\n" ".inst 0x64684089 // bfdot z9.s, z4.h, z0.h[1]\n" + "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n" ".inst 0x6470408a // bfdot z10.s, z4.h, z0.h[2]\n" + "subs %[loops], %[loops], #0x1\n" ".inst 0x6478408b // bfdot z11.s, z4.h, z0.h[3]\n" ".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n" ".inst 0x64694095 // bfdot z21.s, z4.h, z1.h[1]\n" @@ -141,13 +141,13 @@ void sve_interleaved_bf16fp32_dot_3VLx8(const bfloat16 *Apanel, const bfloat16 * ".inst 0x646b40dd // bfdot z29.s, z6.h, z3.h[1]\n" ".inst 0x647340de // bfdot z30.s, z6.h, z3.h[2]\n" ".inst 0x647b40df // bfdot z31.s, z6.h, z3.h[3]\n" - "ld1h z6.h, p0/z, [%[b_ptr], #-1, MUL VL]\n" - "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n" "b.ne 2b\n" "1:\n" "cbz %[tails], 3f\n" ".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n" + "ld1h z6.h, p0/z, [%[b_ptr], #-1, MUL VL]\n" ".inst 0x64684089 // bfdot z9.s, z4.h, z0.h[1]\n" + "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n" ".inst 0x6470408a // bfdot z10.s, z4.h, z0.h[2]\n" ".inst 0x6478408b // bfdot z11.s, z4.h, z0.h[3]\n" ".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n" @@ -235,9 +235,11 @@ void sve_interleaved_bf16fp32_dot_3VLx8(const bfloat16 *Apanel, const bfloat16 * "b 4f\n" "3:\n" ".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n" - "addvl %[b_ptr], %[b_ptr], #3\n" + "ld1h z6.h, p0/z, [%[b_ptr], #-1, MUL VL]\n" ".inst 0x64684089 // bfdot z9.s, z4.h, z0.h[1]\n" + "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n" ".inst 0x6470408a // bfdot z10.s, z4.h, z0.h[2]\n" + "addvl %[b_ptr], %[b_ptr], #3\n" ".inst 0x6478408b // bfdot z11.s, z4.h, z0.h[3]\n" ".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n" ".inst 0x64694095 // bfdot z21.s, z4.h, z1.h[1]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp index c6ffc047fd..f1353e2086 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -61,7 +61,10 @@ public: kern_type kernel=sve_interleaved_bf16fp32_mmla_3VLx8; - interleaved_bf16fp32_mmla_3VLx8(const CPUInfo *ci) { UNUSED(ci); } + interleaved_bf16fp32_mmla_3VLx8(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp index 528fc72005..16cc69b2a6 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -63,13 +63,11 @@ void sve_interleaved_bf16fp32_mmla_3VLx8(const bfloat16 *Apanel, const bfloat16 "mov z16.s, #0\n" "ld1h z6.h, p0/z, [%[b_ptr], #2, MUL VL]\n" "mov z17.s, #0\n" - "ld1rqh z3.h, p0/z, [%[a_ptr], #0x30]\n" + "add %[a_ptr], %[a_ptr], #0x40\n" "mov z18.s, #0\n" - "ld1h z7.h, p0/z, [%[b_ptr], #3, MUL VL]\n" + "addvl %[b_ptr], %[b_ptr], #4\n" "mov z19.s, #0\n" - "add %[a_ptr], %[a_ptr], #0x40\n" "mov z20.s, #0\n" - "addvl %[b_ptr], %[b_ptr], #4\n" "mov z21.s, #0\n" "mov z22.s, #0\n" "mov z23.s, #0\n" @@ -84,12 +82,14 @@ void sve_interleaved_bf16fp32_mmla_3VLx8(const bfloat16 *Apanel, const bfloat16 "cbz %[loops], 1f\n" "2:\n" ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" - "subs %[loops], %[loops], #0x1\n" + "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n" ".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n" + "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n" ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n" + "subs %[loops], %[loops], #0x1\n" + ".inst 0x6465e409 // bfmmla z9.s, z0.h, z5.h\n" ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n" "ld1h z4.h, p0/z, [%[b_ptr]]\n" - ".inst 0x6465e409 // bfmmla z9.s, z0.h, z5.h\n" ".inst 0x6465e42f // bfmmla z15.s, z1.h, z5.h\n" ".inst 0x6465e455 // bfmmla z21.s, z2.h, z5.h\n" ".inst 0x6465e47b // bfmmla z27.s, z3.h, z5.h\n" @@ -152,18 +152,18 @@ void sve_interleaved_bf16fp32_mmla_3VLx8(const bfloat16 *Apanel, const bfloat16 ".inst 0x6467e459 // bfmmla z25.s, z2.h, z7.h\n" "ld1rqh z2.h, p0/z, [%[a_ptr], #-0x20]\n" ".inst 0x6467e47f // bfmmla z31.s, z3.h, z7.h\n" - "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n" - "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n" "b.ne 2b\n" "1:\n" "cbz %[tails], 3f\n" ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" + "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n" ".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n" + "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n" ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n" - ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n" - "ld1h z4.h, p0/z, [%[b_ptr]]\n" ".inst 0x6465e409 // bfmmla z9.s, z0.h, z5.h\n" ".inst 0x6465e42f // bfmmla z15.s, z1.h, z5.h\n" + ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n" + "ld1h z4.h, p0/z, [%[b_ptr]]\n" ".inst 0x6465e455 // bfmmla z21.s, z2.h, z5.h\n" ".inst 0x6465e47b // bfmmla z27.s, z3.h, z5.h\n" "ld1h z5.h, p0/z, [%[b_ptr], #1, MUL VL]\n" @@ -269,15 +269,17 @@ void sve_interleaved_bf16fp32_mmla_3VLx8(const bfloat16 *Apanel, const bfloat16 "b 4f\n" "3:\n" ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" - "add %[a_ptr], %[a_ptr], #0x40\n" + "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n" ".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n" - "addvl %[b_ptr], %[b_ptr], #8\n" + "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n" ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n" - ".inst 
0x6464e47a // bfmmla z26.s, z3.h, z4.h\n" + "add %[a_ptr], %[a_ptr], #0x40\n" ".inst 0x6465e409 // bfmmla z9.s, z0.h, z5.h\n" - "ld1h z4.h, p0/z, [%[b_ptr], #-8, MUL VL]\n" + "addvl %[b_ptr], %[b_ptr], #8\n" + ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n" ".inst 0x6465e42f // bfmmla z15.s, z1.h, z5.h\n" ".inst 0x6465e455 // bfmmla z21.s, z2.h, z5.h\n" + "ld1h z4.h, p0/z, [%[b_ptr], #-8, MUL VL]\n" ".inst 0x6465e47b // bfmmla z27.s, z3.h, z5.h\n" "ld1h z5.h, p0/z, [%[b_ptr], #-7, MUL VL]\n" ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp index 10dbdd8847..816c0cd095 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -61,7 +61,10 @@ public: kern_type kernel=sve_interleaved_fp16_mla_3VLx8; - interleaved_fp16_mla_3VLx8(const CPUInfo *ci) { UNUSED(ci); } + interleaved_fp16_mla_3VLx8(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp index b2d3a6f52e..f2050cbd56 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -50,22 +50,22 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel, "mov z9.h, #0\n" "mov z10.h, #0\n" "mov z11.h, #0\n" - "mov z12.h, #0\n" "ld1rqh z0.h, p0/z, [%[a_ptr]]\n" - "mov z13.h, #0\n" + "mov z12.h, #0\n" "ld1h z2.h, p0/z, [%[b_ptr]]\n" - "mov z14.h, #0\n" + "mov z13.h, #0\n" "ld1h z3.h, p0/z, [%[b_ptr], #1, MUL VL]\n" - "mov z15.h, #0\n" + "mov z14.h, #0\n" "ld1h z4.h, p0/z, [%[b_ptr], #2, MUL VL]\n" - "mov z16.h, #0\n" + "mov z15.h, #0\n" "ld1h z5.h, p0/z, [%[b_ptr], #3, MUL VL]\n" - "mov z17.h, #0\n" + "mov z16.h, #0\n" "ld1h z6.h, p0/z, [%[b_ptr], #4, MUL VL]\n" - "mov z18.h, #0\n" + "mov z17.h, #0\n" "add %[a_ptr], %[a_ptr], #0x20\n" - "mov z19.h, #0\n" + "mov z18.h, #0\n" "addvl %[b_ptr], %[b_ptr], #6\n" + "mov z19.h, #0\n" "mov z20.h, #0\n" "mov z21.h, #0\n" "mov z22.h, #0\n" @@ -202,8 +202,8 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel, "fmla z9.h, z2.h, z0.h[1]\n" "fmla z10.h, z2.h, z0.h[2]\n" "fmla z11.h, z2.h, z0.h[3]\n" - "fmla z12.h, z2.h, z0.h[4]\n" "st1h z8.h, p0, [%[c_ptr]]\n" + "fmla z12.h, z2.h, z0.h[4]\n" "fmla z13.h, z2.h, z0.h[5]\n" "fmla z14.h, z2.h, z0.h[6]\n" "fmla z15.h, z2.h, z0.h[7]\n" @@ -211,8 +211,8 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel, "fmla z17.h, z3.h, z0.h[1]\n" "fmla z18.h, z3.h, z0.h[2]\n" "fmla z19.h, z3.h, z0.h[3]\n" - "fmla z20.h, z3.h, z0.h[4]\n" "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n" + "fmla z20.h, z3.h, z0.h[4]\n" "fmla z21.h, z3.h, z0.h[5]\n" "fmla z22.h, z3.h, z0.h[6]\n" "fmla z23.h, z3.h, z0.h[7]\n" @@ -220,10 +220,11 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel, "fmla z25.h, z4.h, z0.h[1]\n" "fmla z26.h, z4.h, z0.h[2]\n" "fmla z27.h, 
z4.h, z0.h[3]\n" - "fmla z28.h, z4.h, z0.h[4]\n" "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n" + "fmla z28.h, z4.h, z0.h[4]\n" "fmla z29.h, z4.h, z0.h[5]\n" "fmla z30.h, z4.h, z0.h[6]\n" + "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n" "fmla z31.h, z4.h, z0.h[7]\n" "b 4f\n" "3:\n" @@ -257,8 +258,8 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel, "fmla z9.h, z5.h, z1.h[1]\n" "fmla z10.h, z5.h, z1.h[2]\n" "fmla z11.h, z5.h, z1.h[3]\n" - "fmla z12.h, z5.h, z1.h[4]\n" "st1h z8.h, p0, [%[c_ptr]]\n" + "fmla z12.h, z5.h, z1.h[4]\n" "fmla z13.h, z5.h, z1.h[5]\n" "fmla z14.h, z5.h, z1.h[6]\n" "fmla z15.h, z5.h, z1.h[7]\n" @@ -266,8 +267,8 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel, "fmla z17.h, z6.h, z1.h[1]\n" "fmla z18.h, z6.h, z1.h[2]\n" "fmla z19.h, z6.h, z1.h[3]\n" - "fmla z20.h, z6.h, z1.h[4]\n" "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n" + "fmla z20.h, z6.h, z1.h[4]\n" "fmla z21.h, z6.h, z1.h[5]\n" "fmla z22.h, z6.h, z1.h[6]\n" "fmla z23.h, z6.h, z1.h[7]\n" @@ -275,13 +276,13 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel, "fmla z25.h, z7.h, z1.h[1]\n" "fmla z26.h, z7.h, z1.h[2]\n" "fmla z27.h, z7.h, z1.h[3]\n" - "fmla z28.h, z7.h, z1.h[4]\n" "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n" + "fmla z28.h, z7.h, z1.h[4]\n" "fmla z29.h, z7.h, z1.h[5]\n" "fmla z30.h, z7.h, z1.h[6]\n" + "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n" "fmla z31.h, z7.h, z1.h[7]\n" "4:\n" - "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n" "st1h z17.h, p0, [%[c_ptr], #4, MUL VL]\n" "st1h z25.h, p0, [%[c_ptr], #5, MUL VL]\n" "st1h z10.h, p0, [%[c_ptr], #6, MUL VL]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp index cdc9447701..cce90fb135 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -61,7 +61,10 @@ public: kern_type kernel=sve_interleaved_fp32_mla_3VLx8; - interleaved_fp32_mla_3VLx8(const CPUInfo *ci) { UNUSED(ci); } + interleaved_fp32_mla_3VLx8(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp index d26948a0d4..cd178c478a 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -50,20 +50,20 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl "mov z9.s, #0\n" "mov z10.s, #0\n" "mov z11.s, #0\n" - "mov z12.s, #0\n" "ld1rqw z0.s, p0/z, [%[a_ptr]]\n" - "mov z13.s, #0\n" + "mov z12.s, #0\n" "ld1w z4.s, p0/z, [%[b_ptr]]\n" - "mov z14.s, #0\n" + "mov z13.s, #0\n" "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n" - "mov z15.s, #0\n" + "mov z14.s, #0\n" "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n" - "mov z16.s, #0\n" + "mov z15.s, #0\n" "ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n" - "mov z17.s, #0\n" + "mov z16.s, #0\n" "add %[a_ptr], %[a_ptr], #0x40\n" - "mov z18.s, #0\n" + "mov z17.s, #0\n" "addvl %[b_ptr], %[b_ptr], #3\n" + "mov z18.s, #0\n" "mov z19.s, #0\n" "mov z20.s, #0\n" "mov z21.s, #0\n" @@ -207,8 +207,8 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl "fmla z9.s, z4.s, z0.s[1]\n" "fmla z10.s, z4.s, z0.s[2]\n" "fmla z11.s, z4.s, z0.s[3]\n" - "fmla z20.s, z4.s, z1.s[0]\n" "st1w z8.s, p0, [%[c_ptr]]\n" + "fmla z20.s, z4.s, z1.s[0]\n" "fmla z21.s, z4.s, z1.s[1]\n" "fmla z22.s, z4.s, z1.s[2]\n" "fmla z23.s, z4.s, z1.s[3]\n" @@ -216,8 +216,8 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl "fmla z13.s, z5.s, z0.s[1]\n" "fmla z14.s, z5.s, z0.s[2]\n" "fmla z15.s, z5.s, z0.s[3]\n" - "fmla z24.s, z5.s, z1.s[0]\n" "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n" + "fmla z24.s, z5.s, z1.s[0]\n" "fmla z25.s, z5.s, z1.s[1]\n" "fmla z26.s, z5.s, z1.s[2]\n" "fmla z27.s, z5.s, z1.s[3]\n" @@ -225,10 +225,11 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl "fmla z17.s, z6.s, z0.s[1]\n" "fmla z18.s, z6.s, z0.s[2]\n" "fmla z19.s, z6.s, z0.s[3]\n" - "fmla z28.s, z6.s, z1.s[0]\n" "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n" + "fmla z28.s, z6.s, z1.s[0]\n" "fmla z29.s, z6.s, z1.s[1]\n" "fmla z30.s, z6.s, z1.s[2]\n" + "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n" "fmla z31.s, z6.s, z1.s[3]\n" "b 4f\n" "3:\n" @@ -266,8 +267,8 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl "fmla z9.s, z4.s, z2.s[1]\n" "fmla z10.s, z4.s, z2.s[2]\n" "fmla z11.s, z4.s, z2.s[3]\n" - "fmla z20.s, z4.s, z3.s[0]\n" "st1w z8.s, p0, [%[c_ptr]]\n" + "fmla z20.s, z4.s, z3.s[0]\n" "fmla z21.s, z4.s, z3.s[1]\n" "fmla z22.s, z4.s, z3.s[2]\n" "fmla z23.s, z4.s, z3.s[3]\n" @@ -275,8 +276,8 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl "fmla z13.s, z5.s, z2.s[1]\n" "fmla z14.s, z5.s, z2.s[2]\n" "fmla z15.s, z5.s, z2.s[3]\n" - "fmla z24.s, z5.s, z3.s[0]\n" "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n" + "fmla z24.s, z5.s, z3.s[0]\n" "fmla z25.s, z5.s, z3.s[1]\n" "fmla z26.s, z5.s, z3.s[2]\n" "fmla z27.s, z5.s, z3.s[3]\n" @@ -284,13 +285,13 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl "fmla z17.s, z6.s, z2.s[1]\n" "fmla z18.s, z6.s, z2.s[2]\n" "fmla z19.s, z6.s, z2.s[3]\n" - "fmla z28.s, z6.s, z3.s[0]\n" "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n" + "fmla z28.s, z6.s, z3.s[0]\n" "fmla z29.s, z6.s, z3.s[1]\n" "fmla z30.s, z6.s, z3.s[2]\n" + "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n" "fmla z31.s, z6.s, z3.s[3]\n" "4:\n" - "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n" "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n" "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n" "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8.hpp new file mode 
100644 index 0000000000..4ca43cd5c9 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8.hpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __ARM_FEATURE_SVE + + +#include "../std_transforms_sve.hpp" + +namespace arm_gemm { + +// Actual kernel implementations +void sve_interleaved_fp32_mmla_3VLx8(const float *, const float *, float *, int, int, int); + +class interleaved_fp32_mmla_3VLx8 { +public: + typedef float operand_type; + typedef float result_type; + + typedef void (*kern_type)(const float *, const float *, float *, int, int, int); + + /* Kernel blocking parameters */ + static unsigned int out_width() + { + return get_vector_length() * 3; + } + + static unsigned int out_height() + { + return 8; + } + + static unsigned int k_unroll() + { + return 2; + } + + // Use the standard fixed size transforms. + StdTransformsSVE transforms = {}; + + kern_type kernel=sve_interleaved_fp32_mmla_3VLx8; + + interleaved_fp32_mmla_3VLx8(const CPUInfo *) + { + + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp new file mode 100644 index 0000000000..a404ae9c82 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __ARM_FEATURE_SVE + + +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void sve_interleaved_fp32_mmla_3VLx8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { + const float *a_ptr = Apanel; + float *c_ptr = Cpanel; + + K /= 2; + const long loops_count = (K / 2) - 1; + const long tails_count = K % 2; + + for (int yb=0; yb 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0())) { const long width = std::min((unsigned long)N-x0, (4 * get_vector_length())); long loops = loops_count; @@ -82,7 +93,7 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B const unsigned long ldcb = ldc * sizeof(float); const float *biasptr = bias ? bias+x0 : nullbias; - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "whilelt p6.h, %[temp], %[leftovers]\n" @@ -235,46 +246,46 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B "b.ne 2b\n" "1:\n" "zip1 z12.h, z13.h, z14.h\n" - "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" "zip2 z13.h, z13.h, z14.h\n" "cbz %[regs], 3f\n" ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" + "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" + "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" "ld1h z9.h, p4/z, [%[b_ptr0]]\n" "zip1 z14.h, z15.h, z8.h\n" "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" "zip2 z15.h, z15.h, z8.h\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - "ld1h z10.h, p4/z, [%[b_ptr1]]\n" ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" + "ld1h z10.h, p4/z, [%[b_ptr1]]\n" ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" + "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" + ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip1 z8.h, z9.h, z10.h\n" "ld1h z13.h, p4/z, [%[b_ptr0]]\n" "zip2 z9.h, z9.h, z10.h\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" "zip1 z10.h, z11.h, z12.h\n" - "ld1h z14.h, p4/z, [%[b_ptr1]]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z11.h, z11.h, z12.h\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z14.h, p4/z, [%[b_ptr1]]\n" ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" - "zip1 z12.h, z13.h, z14.h\n" + ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip1 z12.h, z13.h, z14.h\n" + "ld1h z9.h, p4/z, [%[b_ptr0]]\n" "zip2 z13.h, z13.h, z14.h\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip1 z14.h, z15.h, z8.h\n" "zip2 z15.h, z15.h, z8.h\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - "ld1h z9.h, p4/z, [%[b_ptr0]]\n" ".inst 0x64704152 // 
bfdot z18.s, z10.h, z0.h[2]\n" "ld1h z10.h, p4/z, [%[b_ptr1]]\n" ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" @@ -452,42 +463,43 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B "b 7f\n" "3:\n" ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" + "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" + "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" + "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" + ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip1 z14.h, z15.h, z8.h\n" "ld1h z9.h, p4/z, [%[b_ptr0]]\n" "zip2 z15.h, z15.h, z8.h\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - "ld1h z10.h, p4/z, [%[b_ptr1]]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" + "ld1h z10.h, p4/z, [%[b_ptr1]]\n" ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" + "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" + ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip1 z8.h, z9.h, z10.h\n" "ld1h z13.h, p4/z, [%[b_ptr0]]\n" "zip2 z9.h, z9.h, z10.h\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" "zip1 z10.h, z11.h, z12.h\n" - "ld1h z14.h, p4/z, [%[b_ptr1]]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z11.h, z11.h, z12.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" - "zip1 z12.h, z13.h, z14.h\n" - "zip2 z13.h, z13.h, z14.h\n" + "ld1h z14.h, p4/z, [%[b_ptr1]]\n" ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" + "zip1 z12.h, z13.h, z14.h\n" + "zip2 z13.h, z13.h, z14.h\n" "zip1 z14.h, z15.h, z8.h\n" "zip2 z15.h, z15.h, z8.h\n" + ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" + ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" @@ -666,37 +678,37 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B "zip2 z11.h, z11.h, z12.h\n" "ld1h z13.h, p4/z, [%[b_ptr0]]\n" "ld1h z14.h, p4/z, [%[b_ptr1]]\n" - "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" "mov z23.d, z19.d\n" "cbz %[loops], 1f\n" "2:\n" "zip1 z12.h, z13.h, z14.h\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" + "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" "zip2 z13.h, z13.h, z14.h\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" + "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "subs %[loops], %[loops], #0x1\n" + "ld1rqh z5.h, p7/z, [a_ptr1]\n" ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[loops], %[loops], #0x1\n" ".inst 0x64614135 // bfdot z21.s, z9.h, 
z1.h[0]\n" - "ld1h z9.h, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip1 z14.h, z15.h, z8.h\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1h z9.h, p4/z, [%[b_ptr0]]\n" "zip2 z15.h, z15.h, z8.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" "ld1h z10.h, p4/z, [%[b_ptr1]]\n" ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" + "add a_ptr1, a_ptr1, #0x20\n" ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip1 z8.h, z9.h, z10.h\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip2 z9.h, z9.h, z10.h\n" + ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" @@ -820,26 +832,26 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B "ld1h z14.h, p4/z, [%[b_ptr1]]\n" ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n" ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n" - "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" "b.ne 2b\n" "1:\n" "zip1 z12.h, z13.h, z14.h\n" "zip2 z13.h, z13.h, z14.h\n" "cbz %[regs], 3f\n" ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" + "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" + "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" + "ld1rqh z5.h, p7/z, [a_ptr1]\n" ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" "ld1h z9.h, p4/z, [%[b_ptr0]]\n" "zip1 z14.h, z15.h, z8.h\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z15.h, z15.h, z8.h\n" - ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" "ld1h z10.h, p4/z, [%[b_ptr1]]\n" ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" @@ -1103,28 +1115,29 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B "b 7f\n" "3:\n" ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" + "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" + "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" + "ld1rqh z5.h, p6/z, [a_ptr1]\n" + ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip1 z14.h, z15.h, z8.h\n" "ld1h z9.h, p4/z, [%[b_ptr0]]\n" "zip2 z15.h, z15.h, z8.h\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" "ld1h z10.h, p4/z, [%[b_ptr1]]\n" ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" ".inst 0x64614177 // bfdot 
z23.s, z11.h, z1.h[0]\n" "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" "zip1 z8.h, z9.h, z10.h\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip2 z9.h, z9.h, z10.h\n" + "addvl a_ptr1, a_ptr1, #1\n" ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" @@ -1386,34 +1399,34 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "mov z27.d, z19.d\n" "ld1h z13.h, p4/z, [%[b_ptr0]]\n" - "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "ld1h z14.h, p4/z, [%[b_ptr1]]\n" "cbz %[loops], 1f\n" "2:\n" ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" + "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" "zip1 z12.h, z13.h, z14.h\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" + "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" "zip2 z13.h, z13.h, z14.h\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" + "ld1rqh z5.h, p7/z, [a_ptr1]\n" ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "subs %[loops], %[loops], #0x1\n" + "ld1rqh z6.h, p7/z, [a_ptr2]\n" ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n" "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[loops], %[loops], #0x1\n" ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip1 z14.h, z15.h, z8.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z15.h, z15.h, z8.h\n" - "add a_ptr1, a_ptr1, #0x20\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" "ld1h z9.h, p4/z, [%[b_ptr0]]\n" ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" + "add a_ptr1, a_ptr1, #0x20\n" ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" + "add a_ptr2, a_ptr2, #0x20\n" ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n" "ld1h z10.h, p4/z, [%[b_ptr1]]\n" ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" @@ -1576,28 +1589,28 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n" ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n" ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n" - "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" "b.ne 2b\n" "1:\n" "zip1 z12.h, z13.h, z14.h\n" "zip2 z13.h, z13.h, z14.h\n" "cbz %[regs], 3f\n" ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" + "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" + "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" + "ld1rqh z5.h, p7/z, [a_ptr1]\n" ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" + "ld1rqh z6.h, p7/z, [a_ptr2]\n" ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" "ld1h z9.h, p4/z, [%[b_ptr0]]\n" "zip1 z14.h, z15.h, z8.h\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z15.h, z15.h, z8.h\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" ".inst 
0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n" "ld1h z10.h, p4/z, [%[b_ptr1]]\n" @@ -1922,35 +1935,36 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B "b 7f\n" "3:\n" ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" + "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" + "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n" "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z6.h, p6/z, [a_ptr2]\n" + "ld1rqh z5.h, p6/z, [a_ptr1]\n" ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" + "ld1rqh z6.h, p6/z, [a_ptr2]\n" + ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip1 z14.h, z15.h, z8.h\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" - "zip2 z15.h, z15.h, z8.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" "ld1h z9.h, p4/z, [%[b_ptr0]]\n" + "zip2 z15.h, z15.h, z8.h\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" + "addvl a_ptr1, a_ptr1, #1\n" ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n" "ld1h z10.h, p4/z, [%[b_ptr1]]\n" ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" + "addvl a_ptr2, a_ptr2, #1\n" ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" - ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n" - "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" "zip1 z8.h, z9.h, z10.h\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip2 z9.h, z9.h, z10.h\n" + ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n" + "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n" "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" @@ -2276,7 +2290,6 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "mov z31.d, z19.d\n" "ld1h z13.h, p4/z, [%[b_ptr0]]\n" - "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "ld1h z14.h, p4/z, [%[b_ptr1]]\n" "zip1 z12.h, z13.h, z14.h\n" @@ -2284,38 +2297,39 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B "cbz %[loops], 1f\n" "2:\n" ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" + "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" + "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" + "ld1rqh z5.h, p7/z, [a_ptr1]\n" ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" + "ld1rqh z6.h, p7/z, [a_ptr2]\n" ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" + "ld1rqh z7.h, p7/z, [a_ptr3]\n" ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "subs %[loops], %[loops], #0x1\n" + "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" + "subs %[loops], %[loops], #0x1\n" + ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip1 
z14.h, z15.h, z8.h\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" - "zip2 z15.h, z15.h, z8.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n" "ld1h z9.h, p4/z, [%[b_ptr0]]\n" + "zip2 z15.h, z15.h, z8.h\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" + "add a_ptr1, a_ptr1, #0x20\n" ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n" - "add a_ptr3, a_ptr3, #0x20\n" + "add a_ptr2, a_ptr2, #0x20\n" ".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n" "ld1h z10.h, p4/z, [%[b_ptr1]]\n" ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" + "add a_ptr3, a_ptr3, #0x20\n" ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" - ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n" "zip1 z8.h, z9.h, z10.h\n" "zip2 z9.h, z9.h, z10.h\n" + ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n" ".inst 0x6463417f // bfdot z31.s, z11.h, z3.h[0]\n" "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" @@ -2503,28 +2517,28 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B "zip1 z12.h, z13.h, z14.h\n" "zip2 z13.h, z13.h, z14.h\n" ".inst 0x647f41ff // bfdot z31.s, z15.h, z7.h[3]\n" - "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" "b.ne 2b\n" "1:\n" "cbz %[regs], 3f\n" ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" + "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" + "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" + "ld1rqh z5.h, p7/z, [a_ptr1]\n" ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" + "ld1rqh z6.h, p7/z, [a_ptr2]\n" ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" + "ld1rqh z7.h, p7/z, [a_ptr3]\n" ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" - "zip1 z14.h, z15.h, z8.h\n" - "zip2 z15.h, z15.h, z8.h\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n" "ld1h z9.h, p4/z, [%[b_ptr0]]\n" + "zip1 z14.h, z15.h, z8.h\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z15.h, z15.h, z8.h\n" ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n" @@ -2910,30 +2924,31 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B "b 7f\n" "3:\n" ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" + "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" + "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n" - "ld1rqh z6.h, p6/z, [a_ptr2]\n" + "ld1rqh z5.h, p6/z, [a_ptr1]\n" ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n" "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z7.h, p6/z, [a_ptr3]\n" + "ld1rqh z6.h, p6/z, [a_ptr2]\n" ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" + "ld1rqh z7.h, p6/z, 
[a_ptr3]\n" + ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip1 z14.h, z15.h, z8.h\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z15.h, z15.h, z8.h\n" "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n" "ld1h z9.h, p4/z, [%[b_ptr0]]\n" ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" + "addvl a_ptr1, a_ptr1, #1\n" ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - "addvl a_ptr3, a_ptr3, #1\n" + "addvl a_ptr2, a_ptr2, #1\n" ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n" + "addvl a_ptr3, a_ptr3, #1\n" ".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n" "ld1h z10.h, p4/z, [%[b_ptr1]]\n" ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp index 741f200d25..665e8656d2 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,6 +25,9 @@ #ifdef __ARM_FEATURE_SVE + + + namespace arm_gemm { @@ -75,7 +78,10 @@ public: // Default to the generic kernel kern_type kernel=sve_native_fp16_mla_4VLx4; - native_fp16_mla_4VLx4(const CPUInfo *ci) { UNUSED(ci); } + native_fp16_mla_4VLx4(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp index 14dd38bd25..dd33c785cf 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -60,12 +60,23 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld break; } - for (int y=0; y 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0())) { const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<__fp16>())); long loops = loops_count; @@ -78,7 +89,7 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld const unsigned long ldcb = ldc * sizeof(__fp16); const __fp16 *biasptr = bias ? 
bias+x0 : nullbias; - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "whilelt p6.h, %[temp], %[leftovers]\n" @@ -256,88 +267,87 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld "ld1h z12.h, p0/z, [%[b_ptr0]]\n" "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "cbz %[regs], 3f\n" "fmla z16.h, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" + "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z17.h, z9.h, z0.h[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" "fmla z18.h, z10.h, z0.h[0]\n" - "ld1h z8.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z19.h, z11.h, z0.h[0]\n" - "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z8.h, p0/z, [%[b_ptr0]]\n" "fmla z16.h, z12.h, z0.h[1]\n" - "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z17.h, z13.h, z0.h[1]\n" - "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z18.h, z14.h, z0.h[1]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z19.h, z15.h, z0.h[1]\n" - "ld1h z12.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z16.h, z8.h, z0.h[2]\n" - "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z12.h, p0/z, [%[b_ptr0]]\n" "fmla z17.h, z9.h, z0.h[2]\n" - "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z18.h, z10.h, z0.h[2]\n" - "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z19.h, z11.h, z0.h[2]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z16.h, z12.h, z0.h[3]\n" - "ld1h z8.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z17.h, z13.h, z0.h[3]\n" - "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z8.h, p0/z, [%[b_ptr0]]\n" "fmla z18.h, z14.h, z0.h[3]\n" - "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z19.h, z15.h, z0.h[3]\n" + "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "fmla z17.h, z9.h, z0.h[4]\n" + "fmla z16.h, z8.h, z0.h[4]\n" "ld1h z12.h, p0/z, [%[b_ptr0]]\n" - "fmla z18.h, z10.h, z0.h[4]\n" + "fmla z17.h, z9.h, z0.h[4]\n" "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[4]\n" + "fmla z18.h, z10.h, z0.h[4]\n" "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.h, z11.h, z0.h[4]\n" "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z16.h, z12.h, z0.h[5]\n" - "ld1h z8.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z17.h, z13.h, z0.h[5]\n" - "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z8.h, p0/z, [%[b_ptr0]]\n" "fmla z18.h, z14.h, z0.h[5]\n" - "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z19.h, z15.h, z0.h[5]\n" + "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "fmla z17.h, z9.h, z0.h[6]\n" + "fmla z16.h, z8.h, z0.h[6]\n" "ld1h z12.h, p0/z, [%[b_ptr0]]\n" - "fmla z18.h, z10.h, z0.h[6]\n" + "fmla z17.h, z9.h, z0.h[6]\n" "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" - 
"fmla z19.h, z11.h, z0.h[6]\n" + "fmla z18.h, z10.h, z0.h[6]\n" "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.h, z11.h, z0.h[6]\n" "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z16.h, z12.h, z0.h[7]\n" - "ld1h z8.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z17.h, z13.h, z0.h[7]\n" - "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z8.h, p0/z, [%[b_ptr0]]\n" "fmla z18.h, z14.h, z0.h[7]\n" - "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z19.h, z15.h, z0.h[7]\n" + "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[0]\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z17.h, z9.h, z4.h[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "fmla z18.h, z10.h, z4.h[0]\n" + "fmla z16.h, z8.h, z4.h[0]\n" "ld1h z12.h, p0/z, [%[b_ptr0]]\n" - "fmla z19.h, z11.h, z4.h[0]\n" + "fmla z17.h, z9.h, z4.h[0]\n" "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.h, z10.h, z4.h[0]\n" "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "fmla z16.h, z12.h, z4.h[1]\n" + "fmla z19.h, z11.h, z4.h[0]\n" "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.h, z12.h, z4.h[1]\n" + "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" "fmla z17.h, z13.h, z4.h[1]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z18.h, z14.h, z4.h[1]\n" @@ -345,51 +355,52 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld "fmla z19.h, z15.h, z4.h[1]\n" "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "addvl %[a_ptr0], %[a_ptr0], #2\n" + "fmla z16.h, z8.h, z4.h[2]\n" "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z17.h, z9.h, z4.h[2]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "fmla z16.h, z8.h, z4.h[2]\n" + "fmla z18.h, z10.h, z4.h[2]\n" "ld1h z12.h, p0/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z4.h[2]\n" + "fmla z19.h, z11.h, z4.h[2]\n" "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[2]\n" "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[2]\n" "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[3]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "fmla z17.h, z13.h, z4.h[3]\n" + "fmla z16.h, z12.h, z4.h[3]\n" "ld1h z8.h, p0/z, [%[b_ptr0]]\n" - "fmla z18.h, z14.h, z4.h[3]\n" + "fmla z17.h, z13.h, z4.h[3]\n" "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[3]\n" + "fmla z18.h, z14.h, z4.h[3]\n" "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.h, z15.h, z4.h[3]\n" "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z16.h, z8.h, z4.h[4]\n" - "ld1h z12.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z17.h, z9.h, z4.h[4]\n" - "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z12.h, p0/z, [%[b_ptr0]]\n" "fmla z18.h, z10.h, z4.h[4]\n" - "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z19.h, z11.h, z4.h[4]\n" + "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[5]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "fmla z17.h, z13.h, z4.h[5]\n" + "fmla z16.h, z12.h, z4.h[5]\n" "ld1h z8.h, p0/z, [%[b_ptr0]]\n" - "fmla z18.h, z14.h, z4.h[5]\n" + "fmla z17.h, z13.h, z4.h[5]\n" "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[5]\n" + "fmla z18.h, z14.h, 
z4.h[5]\n" "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.h, z15.h, z4.h[5]\n" "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z16.h, z8.h, z4.h[6]\n" - "ld1h z12.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z17.h, z9.h, z4.h[6]\n" - "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z12.h, p0/z, [%[b_ptr0]]\n" "fmla z18.h, z10.h, z4.h[6]\n" - "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z19.h, z11.h, z4.h[6]\n" + "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z16.h, z12.h, z4.h[7]\n" "fmla z17.h, z13.h, z4.h[7]\n" @@ -474,66 +485,67 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld "b 4f\n" "3:\n" "fmla z16.h, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" + "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z17.h, z9.h, z0.h[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" "fmla z18.h, z10.h, z0.h[0]\n" - "ld1h z8.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z19.h, z11.h, z0.h[0]\n" - "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z8.h, p0/z, [%[b_ptr0]]\n" "fmla z16.h, z12.h, z0.h[1]\n" - "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z17.h, z13.h, z0.h[1]\n" - "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z18.h, z14.h, z0.h[1]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z19.h, z15.h, z0.h[1]\n" - "ld1h z12.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z16.h, z8.h, z0.h[2]\n" - "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z12.h, p0/z, [%[b_ptr0]]\n" "fmla z17.h, z9.h, z0.h[2]\n" - "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z18.h, z10.h, z0.h[2]\n" - "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z19.h, z11.h, z0.h[2]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z16.h, z12.h, z0.h[3]\n" - "ld1h z8.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z17.h, z13.h, z0.h[3]\n" - "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z8.h, p0/z, [%[b_ptr0]]\n" "fmla z18.h, z14.h, z0.h[3]\n" - "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z19.h, z15.h, z0.h[3]\n" + "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "fmla z17.h, z9.h, z0.h[4]\n" + "fmla z16.h, z8.h, z0.h[4]\n" "ld1h z12.h, p0/z, [%[b_ptr0]]\n" - "fmla z18.h, z10.h, z0.h[4]\n" + "fmla z17.h, z9.h, z0.h[4]\n" "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[4]\n" + "fmla z18.h, z10.h, z0.h[4]\n" "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.h, z11.h, z0.h[4]\n" "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z16.h, z12.h, z0.h[5]\n" - "ld1h z8.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z17.h, z13.h, z0.h[5]\n" - "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z8.h, p0/z, [%[b_ptr0]]\n" "fmla z18.h, z14.h, z0.h[5]\n" - "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla 
z19.h, z15.h, z0.h[5]\n" + "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "fmla z17.h, z9.h, z0.h[6]\n" + "fmla z16.h, z8.h, z0.h[6]\n" "ld1h z12.h, p0/z, [%[b_ptr0]]\n" - "fmla z18.h, z10.h, z0.h[6]\n" + "fmla z17.h, z9.h, z0.h[6]\n" "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[6]\n" + "fmla z18.h, z10.h, z0.h[6]\n" "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.h, z11.h, z0.h[6]\n" "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" "fmla z16.h, z12.h, z0.h[7]\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" "fmla z17.h, z13.h, z0.h[7]\n" "fmla z18.h, z14.h, z0.h[7]\n" "fmla z19.h, z15.h, z0.h[7]\n" @@ -888,21 +900,21 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld "fmla z23.h, z15.h, z5.h[7]\n" "b.ne 2b\n" "1:\n" - "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "cbz %[regs], 3f\n" "fmla z16.h, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" + "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z20.h, z8.h, z1.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" + "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" "fmla z17.h, z9.h, z0.h[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqh z5.h, p7/z, [a_ptr1]\n" "fmla z21.h, z9.h, z1.h[0]\n" - "ld1h z8.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z18.h, z10.h, z0.h[0]\n" - "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z8.h, p0/z, [%[b_ptr0]]\n" "fmla z22.h, z10.h, z1.h[0]\n" - "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z19.h, z11.h, z0.h[0]\n" + "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z23.h, z11.h, z1.h[0]\n" "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z16.h, z12.h, z0.h[1]\n" @@ -1201,19 +1213,19 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld "b 4f\n" "3:\n" "fmla z16.h, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" + "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z20.h, z8.h, z1.h[0]\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" + "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" "fmla z17.h, z9.h, z0.h[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqh z5.h, p6/z, [a_ptr1]\n" "fmla z21.h, z9.h, z1.h[0]\n" - "ld1h z8.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z18.h, z10.h, z0.h[0]\n" - "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z8.h, p0/z, [%[b_ptr0]]\n" "fmla z22.h, z10.h, z1.h[0]\n" - "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z19.h, z11.h, z0.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" + "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z23.h, z11.h, z1.h[0]\n" "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z16.h, z12.h, z0.h[1]\n" @@ -1221,10 +1233,11 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld "fmla z20.h, z12.h, z1.h[1]\n" "ld1h z12.h, p0/z, [%[b_ptr0]]\n" "fmla z17.h, z13.h, z0.h[1]\n" - "addvl a_ptr1, a_ptr1, #1\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" "fmla z21.h, z13.h, z1.h[1]\n" "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z18.h, z14.h, z0.h[1]\n" + "addvl a_ptr1, a_ptr1, #1\n" "fmla z22.h, z14.h, z1.h[1]\n" "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z19.h, z15.h, z0.h[1]\n" @@ -1509,9 +1522,9 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld "fmla z27.h, z11.h, z2.h[0]\n" "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL 
VL]\n" "fmla z16.h, z12.h, z0.h[1]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla z20.h, z12.h, z1.h[1]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.h, z12.h, z1.h[1]\n" + "add a_ptr2, a_ptr2, #0x20\n" "fmla z24.h, z12.h, z2.h[1]\n" "ld1h z12.h, p0/z, [%[b_ptr0]]\n" "fmla z17.h, z13.h, z0.h[1]\n" @@ -1768,21 +1781,21 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld "fmla z27.h, z15.h, z6.h[7]\n" "b.ne 2b\n" "1:\n" - "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "cbz %[regs], 3f\n" "fmla z16.h, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" + "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z20.h, z8.h, z1.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" + "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" "fmla z24.h, z8.h, z2.h[0]\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" + "ld1rqh z5.h, p7/z, [a_ptr1]\n" "fmla z17.h, z9.h, z0.h[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqh z6.h, p7/z, [a_ptr2]\n" "fmla z21.h, z9.h, z1.h[0]\n" - "ld1h z8.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z25.h, z9.h, z2.h[0]\n" - "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z8.h, p0/z, [%[b_ptr0]]\n" "fmla z18.h, z10.h, z0.h[0]\n" + "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z22.h, z10.h, z1.h[0]\n" "fmla z26.h, z10.h, z2.h[0]\n" "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" @@ -2176,26 +2189,27 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld "b 4f\n" "3:\n" "fmla z16.h, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" + "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z20.h, z8.h, z1.h[0]\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" + "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" "fmla z24.h, z8.h, z2.h[0]\n" - "ld1rqh z6.h, p6/z, [a_ptr2]\n" + "ld1rqh z5.h, p6/z, [a_ptr1]\n" "fmla z17.h, z9.h, z0.h[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqh z6.h, p6/z, [a_ptr2]\n" "fmla z21.h, z9.h, z1.h[0]\n" - "ld1h z8.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z25.h, z9.h, z2.h[0]\n" - "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z8.h, p0/z, [%[b_ptr0]]\n" "fmla z18.h, z10.h, z0.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" + "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z22.h, z10.h, z1.h[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" "fmla z26.h, z10.h, z2.h[0]\n" "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z19.h, z11.h, z0.h[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" + "addvl a_ptr1, a_ptr1, #1\n" "fmla z23.h, z11.h, z1.h[0]\n" + "addvl a_ptr2, a_ptr2, #1\n" "fmla z27.h, z11.h, z2.h[0]\n" "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z16.h, z12.h, z0.h[1]\n" @@ -2897,21 +2911,21 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld "fmla z31.h, z15.h, z7.h[7]\n" "b.ne 2b\n" "1:\n" - "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "cbz %[regs], 3f\n" "fmla z16.h, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" + "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z20.h, z8.h, z1.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" + "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" "fmla z24.h, z8.h, z2.h[0]\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" + "ld1rqh z5.h, p7/z, [a_ptr1]\n" "fmla z28.h, z8.h, z3.h[0]\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" + "ld1rqh z6.h, p7/z, [a_ptr2]\n" "fmla z17.h, z9.h, z0.h[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqh z7.h, p7/z, [a_ptr3]\n" "fmla z21.h, z9.h, z1.h[0]\n" - "ld1h z8.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z25.h, z9.h, z2.h[0]\n" + "ld1h z8.h, p0/z, 
[%[b_ptr0]]\n" "fmla z29.h, z9.h, z3.h[0]\n" "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z18.h, z10.h, z0.h[0]\n" @@ -3400,30 +3414,31 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld "b 4f\n" "3:\n" "fmla z16.h, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" + "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z20.h, z8.h, z1.h[0]\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" + "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" "fmla z24.h, z8.h, z2.h[0]\n" - "ld1rqh z6.h, p6/z, [a_ptr2]\n" + "ld1rqh z5.h, p6/z, [a_ptr1]\n" "fmla z28.h, z8.h, z3.h[0]\n" - "ld1rqh z7.h, p6/z, [a_ptr3]\n" + "ld1rqh z6.h, p6/z, [a_ptr2]\n" "fmla z17.h, z9.h, z0.h[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqh z7.h, p6/z, [a_ptr3]\n" "fmla z21.h, z9.h, z1.h[0]\n" - "ld1h z8.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z25.h, z9.h, z2.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" + "ld1h z8.h, p0/z, [%[b_ptr0]]\n" "fmla z29.h, z9.h, z3.h[0]\n" "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z18.h, z10.h, z0.h[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" "fmla z22.h, z10.h, z1.h[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" + "addvl a_ptr1, a_ptr1, #1\n" "fmla z26.h, z10.h, z2.h[0]\n" - "addvl a_ptr3, a_ptr3, #1\n" + "addvl a_ptr2, a_ptr2, #1\n" "fmla z30.h, z10.h, z3.h[0]\n" "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z19.h, z11.h, z0.h[0]\n" + "addvl a_ptr3, a_ptr3, #1\n" "fmla z23.h, z11.h, z1.h[0]\n" "fmla z27.h, z11.h, z2.h[0]\n" "fmla z31.h, z11.h, z3.h[0]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp index 19e5fbd974..0abde56af1 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,6 +25,9 @@ #ifdef __ARM_FEATURE_SVE + + + namespace arm_gemm { @@ -75,7 +78,10 @@ public: // Default to the generic kernel kern_type kernel=sve_native_fp32_mla_4VLx4; - native_fp32_mla_4VLx4(const CPUInfo *ci) { UNUSED(ci); } + native_fp32_mla_4VLx4(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp index 3fc0e5fa36..b05906e199 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -60,12 +60,23 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, break; } - for (int y=0; y 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0())) { const long width = std::min((unsigned long)N-x0, (4 * get_vector_length())); long loops = loops_count; @@ -78,7 +89,7 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, const unsigned long ldcb = ldc * sizeof(float); const float *biasptr = bias ? 
bias+x0 : nullbias; - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "whilelt p6.s, %[temp], %[leftovers]\n" @@ -184,52 +195,51 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, "ld1w z12.s, p0/z, [%[b_ptr0]]\n" "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "cbz %[regs], 3f\n" "fmla z16.s, z8.s, z0.s[0]\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z17.s, z9.s, z0.s[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" "fmla z18.s, z10.s, z0.s[0]\n" - "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z19.s, z11.s, z0.s[0]\n" - "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" "fmla z16.s, z12.s, z0.s[1]\n" - "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z17.s, z13.s, z0.s[1]\n" - "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z18.s, z14.s, z0.s[1]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z19.s, z15.s, z0.s[1]\n" - "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z16.s, z8.s, z0.s[2]\n" - "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" "fmla z17.s, z9.s, z0.s[2]\n" - "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z18.s, z10.s, z0.s[2]\n" - "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z19.s, z11.s, z0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z16.s, z12.s, z0.s[3]\n" - "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z17.s, z13.s, z0.s[3]\n" - "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" "fmla z18.s, z14.s, z0.s[3]\n" - "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z19.s, z15.s, z0.s[3]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[0]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z17.s, z9.s, z4.s[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z16.s, z8.s, z4.s[0]\n" "ld1w z12.s, p0/z, [%[b_ptr0]]\n" - "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z17.s, z9.s, z4.s[0]\n" "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z4.s[0]\n" "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z19.s, z11.s, z4.s[0]\n" "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" "fmla z17.s, z13.s, z4.s[1]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z18.s, z14.s, z4.s[1]\n" @@ -237,15 +247,16 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, "fmla z19.s, z15.s, z4.s[1]\n" "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "addvl %[a_ptr0], %[a_ptr0], #2\n" + "fmla z16.s, z8.s, z4.s[2]\n" "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z17.s, z9.s, z4.s[2]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" "ld1w z12.s, p0/z, 
[%[b_ptr0]]\n" - "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z10.s, z4.s[2]\n" "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z4.s[2]\n" "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z16.s, z12.s, z4.s[3]\n" "fmla z17.s, z13.s, z4.s[3]\n" @@ -286,30 +297,31 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, "b 4f\n" "3:\n" "fmla z16.s, z8.s, z0.s[0]\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z17.s, z9.s, z0.s[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" "fmla z18.s, z10.s, z0.s[0]\n" - "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z19.s, z11.s, z0.s[0]\n" - "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" "fmla z16.s, z12.s, z0.s[1]\n" - "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z17.s, z13.s, z0.s[1]\n" - "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z18.s, z14.s, z0.s[1]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z19.s, z15.s, z0.s[1]\n" - "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z16.s, z8.s, z0.s[2]\n" - "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" "fmla z17.s, z9.s, z0.s[2]\n" - "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z18.s, z10.s, z0.s[2]\n" - "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z19.s, z11.s, z0.s[2]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z16.s, z12.s, z0.s[3]\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" "fmla z17.s, z13.s, z0.s[3]\n" "fmla z18.s, z14.s, z0.s[3]\n" "fmla z19.s, z15.s, z0.s[3]\n" @@ -516,21 +528,21 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, "fmla z23.s, z15.s, z5.s[3]\n" "b.ne 2b\n" "1:\n" - "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "cbz %[regs], 3f\n" "fmla z16.s, z8.s, z0.s[0]\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z20.s, z8.s, z1.s[0]\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" "fmla z17.s, z9.s, z0.s[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" "fmla z21.s, z9.s, z1.s[0]\n" - "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z18.s, z10.s, z0.s[0]\n" - "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" "fmla z22.s, z10.s, z1.s[0]\n" - "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z19.s, z11.s, z0.s[0]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z23.s, z11.s, z1.s[0]\n" "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z16.s, z12.s, z0.s[1]\n" @@ -665,19 +677,19 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, "b 4f\n" "3:\n" "fmla z16.s, z8.s, z0.s[0]\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z20.s, z8.s, z1.s[0]\n" - "ld1rqw z5.s, p6/z, [a_ptr1]\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" "fmla z17.s, z9.s, z0.s[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" "fmla z21.s, z9.s, 
z1.s[0]\n" - "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z18.s, z10.s, z0.s[0]\n" - "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" "fmla z22.s, z10.s, z1.s[0]\n" - "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z19.s, z11.s, z0.s[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z23.s, z11.s, z1.s[0]\n" "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z16.s, z12.s, z0.s[1]\n" @@ -685,10 +697,11 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, "fmla z20.s, z12.s, z1.s[1]\n" "ld1w z12.s, p0/z, [%[b_ptr0]]\n" "fmla z17.s, z13.s, z0.s[1]\n" - "addvl a_ptr1, a_ptr1, #1\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" "fmla z21.s, z13.s, z1.s[1]\n" "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z18.s, z14.s, z0.s[1]\n" + "addvl a_ptr1, a_ptr1, #1\n" "fmla z22.s, z14.s, z1.s[1]\n" "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z19.s, z15.s, z0.s[1]\n" @@ -861,9 +874,9 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, "fmla z27.s, z11.s, z2.s[0]\n" "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z16.s, z12.s, z0.s[1]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla z20.s, z12.s, z1.s[1]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "add a_ptr2, a_ptr2, #0x20\n" "fmla z24.s, z12.s, z2.s[1]\n" "ld1w z12.s, p0/z, [%[b_ptr0]]\n" "fmla z17.s, z13.s, z0.s[1]\n" @@ -984,21 +997,21 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, "fmla z27.s, z15.s, z6.s[3]\n" "b.ne 2b\n" "1:\n" - "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "cbz %[regs], 3f\n" "fmla z16.s, z8.s, z0.s[0]\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z20.s, z8.s, z1.s[0]\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" "fmla z24.s, z8.s, z2.s[0]\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" "fmla z17.s, z9.s, z0.s[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" "fmla z21.s, z9.s, z1.s[0]\n" - "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z25.s, z9.s, z2.s[0]\n" - "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" "fmla z18.s, z10.s, z0.s[0]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z22.s, z10.s, z1.s[0]\n" "fmla z26.s, z10.s, z2.s[0]\n" "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" @@ -1180,26 +1193,27 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, "b 4f\n" "3:\n" "fmla z16.s, z8.s, z0.s[0]\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z20.s, z8.s, z1.s[0]\n" - "ld1rqw z5.s, p6/z, [a_ptr1]\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" "fmla z24.s, z8.s, z2.s[0]\n" - "ld1rqw z6.s, p6/z, [a_ptr2]\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" "fmla z17.s, z9.s, z0.s[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqw z6.s, p6/z, [a_ptr2]\n" "fmla z21.s, z9.s, z1.s[0]\n" - "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z25.s, z9.s, z2.s[0]\n" - "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" "fmla z18.s, z10.s, z0.s[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z22.s, z10.s, z1.s[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" "fmla z26.s, 
z10.s, z2.s[0]\n" "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z19.s, z11.s, z0.s[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" + "addvl a_ptr1, a_ptr1, #1\n" "fmla z23.s, z11.s, z1.s[0]\n" + "addvl a_ptr2, a_ptr2, #1\n" "fmla z27.s, z11.s, z2.s[0]\n" "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z16.s, z12.s, z0.s[1]\n" @@ -1589,21 +1603,21 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, "fmla z31.s, z15.s, z7.s[3]\n" "b.ne 2b\n" "1:\n" - "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "cbz %[regs], 3f\n" "fmla z16.s, z8.s, z0.s[0]\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z20.s, z8.s, z1.s[0]\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" "fmla z24.s, z8.s, z2.s[0]\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" "fmla z28.s, z8.s, z3.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr3]\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" "fmla z17.s, z9.s, z0.s[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" "fmla z21.s, z9.s, z1.s[0]\n" - "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z25.s, z9.s, z2.s[0]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" "fmla z29.s, z9.s, z3.s[0]\n" "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z18.s, z10.s, z0.s[0]\n" @@ -1832,30 +1846,31 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, "b 4f\n" "3:\n" "fmla z16.s, z8.s, z0.s[0]\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z20.s, z8.s, z1.s[0]\n" - "ld1rqw z5.s, p6/z, [a_ptr1]\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" "fmla z24.s, z8.s, z2.s[0]\n" - "ld1rqw z6.s, p6/z, [a_ptr2]\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" "fmla z28.s, z8.s, z3.s[0]\n" - "ld1rqw z7.s, p6/z, [a_ptr3]\n" + "ld1rqw z6.s, p6/z, [a_ptr2]\n" "fmla z17.s, z9.s, z0.s[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqw z7.s, p6/z, [a_ptr3]\n" "fmla z21.s, z9.s, z1.s[0]\n" - "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z25.s, z9.s, z2.s[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" "fmla z29.s, z9.s, z3.s[0]\n" "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z18.s, z10.s, z0.s[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" "fmla z22.s, z10.s, z1.s[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" + "addvl a_ptr1, a_ptr1, #1\n" "fmla z26.s, z10.s, z2.s[0]\n" - "addvl a_ptr3, a_ptr3, #1\n" + "addvl a_ptr2, a_ptr2, #1\n" "fmla z30.s, z10.s, z3.s[0]\n" "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z19.s, z11.s, z0.s[0]\n" + "addvl a_ptr3, a_ptr3, #1\n" "fmla z23.s, z11.s, z1.s[0]\n" "fmla z27.s, z11.s, z2.s[0]\n" "fmla z31.s, z11.s, z3.s[0]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp index 1b9d1312b5..40a69b54ff 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -78,7 +78,10 @@ public: // Default to the generic kernel kern_type kernel=sve_native_s8s32_dot_4VLx4; - native_s8s32_dot_4VLx4(const CPUInfo *ci) { UNUSED(ci); } + native_s8s32_dot_4VLx4(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp index 26736f597a..7c5d4dc280 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp @@ -32,7 +32,7 @@ namespace arm_gemm { -void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int ldb, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool append) { +void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int ldb, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation , bool append) { const long loops_count = ((K + 16) / 32) - 1; K -= loops_count * 32; const long regs_count = (K / 16) - 1; @@ -41,12 +41,23 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l const long blocks_count = K / 4; const long odds_count = K - (blocks_count * 4); - for (int y=0; y 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0())) { const long width = std::min((unsigned long)N-x0, (4 * get_vector_length())); long loops = loops_count; @@ -62,7 +73,7 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l long ldbb = ldb * sizeof(int8_t) * 4; const unsigned long ldcb = ldc * sizeof(int32_t); - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "mov z16.s, #0\n" @@ -270,22 +281,22 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l "ld1b z12.b, p4/z, [%[b_ptr0]]\n" "zip2 z15.b, z12.b, z13.b\n" "zip1 z13.b, z12.b, z13.b\n" - "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "cbz %[regs], 3f\n" "sdot z16.s, z8.b, z0.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "sdot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "sdot z18.s, z10.b, z0.b[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip2 z8.b, z14.b, z12.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "zip1 z14.b, z14.b, z12.b\n" "ld1b z9.b, p4/z, [%[b_ptr2]]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "sdot z19.s, z11.b, z0.b[0]\n" - "ld1b z10.b, p4/z, [%[b_ptr1]]\n" - "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "zip2 z13.b, z13.b, z14.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "zip1 z14.b, z15.b, z8.b\n" @@ -635,33 +646,34 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l "b 7f\n" "3:\n" "sdot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z17.s, z9.b, z0.b[0]\n" "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" - "zip2 z8.b, z14.b, z12.b\n" + "sdot z18.s, z10.b, z0.b[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "zip1 z14.b, z14.b, z12.b\n" + "zip2 z8.b, z14.b, z12.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" - "sdot z17.s, z9.b, z0.b[0]\n" + "zip1 z14.b, z14.b, z12.b\n" "ld1b z9.b, p4/z, [%[b_ptr2]]\n" - "sdot z18.s, z10.b, z0.b[0]\n" + "sdot z19.s, z11.b, z0.b[0]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], 
%[ldb]\n" "zip1 z12.b, z13.b, z14.b\n" "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "zip2 z13.b, z13.b, z14.b\n" - "add %[b_ptr3], %[b_ptr3], %[ldb]\n" - "zip1 z14.b, z15.b, z8.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z15.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z15.b, z15.b, z8.b\n" "ld1b z8.b, p4/z, [%[b_ptr0]]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "sdot z16.s, z12.b, z0.b[1]\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" "zip2 z11.b, z8.b, z9.b\n" "addvl %[a_ptr0], %[a_ptr0], #1\n" "zip1 z9.b, z8.b, z9.b\n" "ld1b z8.b, p4/z, [%[b_ptr3]]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "ld1b z13.b, p4/z, [%[b_ptr2]]\n" "sdot z18.s, z14.b, z0.b[1]\n" "ld1b z14.b, p4/z, [%[b_ptr1]]\n" "sdot z19.s, z15.b, z0.b[1]\n" @@ -998,11 +1010,11 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l "sdot z21.s, z9.b, z1.b[0]\n" "ld1b z9.b, p4/z, [%[b_ptr2]]\n" "sdot z18.s, z10.b, z0.b[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "sdot z22.s, z10.b, z1.b[0]\n" "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "sdot z19.s, z11.b, z0.b[0]\n" - "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "sdot z23.s, z11.b, z1.b[0]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z11.b, z8.b, z9.b\n" @@ -1176,34 +1188,34 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l "1:\n" "zip2 z15.b, z12.b, z13.b\n" "zip1 z13.b, z12.b, z13.b\n" - "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "cbz %[regs], 3f\n" "sdot z16.s, z8.b, z0.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "sdot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "sdot z17.s, z9.b, z0.b[0]\n" "ld1rqb z5.b, p7/z, [a_ptr1]\n" - "zip2 z8.b, z14.b, z12.b\n" + "sdot z21.s, z9.b, z1.b[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "zip1 z14.b, z14.b, z12.b\n" + "zip2 z8.b, z14.b, z12.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" - "sdot z17.s, z9.b, z0.b[0]\n" + "zip1 z14.b, z14.b, z12.b\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z0.b[0]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "zip1 z12.b, z13.b, z14.b\n" "add %[b_ptr3], %[b_ptr3], %[ldb]\n" "zip2 z13.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "zip1 z14.b, z15.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z15.b, z15.b, z8.b\n" "ld1b z8.b, p4/z, [%[b_ptr0]]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "ld1b z9.b, p4/z, [%[b_ptr2]]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "sdot z22.s, z10.b, z1.b[0]\n" - "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "sdot z19.s, z11.b, z0.b[0]\n" - "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "sdot z23.s, z11.b, z1.b[0]\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z11.b, z8.b, z9.b\n" "zip1 z9.b, z8.b, z9.b\n" "ld1b z8.b, p4/z, [%[b_ptr3]]\n" @@ -1604,34 +1616,35 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l "b 7f\n" "3:\n" "sdot z16.s, z8.b, z0.b[0]\n" - "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "sdot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "sdot z17.s, z9.b, z0.b[0]\n" "ld1rqb z5.b, p6/z, [a_ptr1]\n" - "zip2 z8.b, z14.b, z12.b\n" + "sdot z21.s, z9.b, z1.b[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "zip1 z14.b, z14.b, z12.b\n" + "zip2 z8.b, z14.b, 
z12.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" - "sdot z17.s, z9.b, z0.b[0]\n" + "zip1 z14.b, z14.b, z12.b\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z0.b[0]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "zip1 z12.b, z13.b, z14.b\n" "add %[b_ptr3], %[b_ptr3], %[ldb]\n" "zip2 z13.b, z13.b, z14.b\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "zip1 z14.b, z15.b, z8.b\n" - "addvl a_ptr1, a_ptr1, #1\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z15.b, z15.b, z8.b\n" "ld1b z8.b, p4/z, [%[b_ptr0]]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "ld1b z9.b, p4/z, [%[b_ptr2]]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "sdot z22.s, z10.b, z1.b[0]\n" - "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "sdot z19.s, z11.b, z0.b[0]\n" - "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "sdot z23.s, z11.b, z1.b[0]\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" "zip2 z11.b, z8.b, z9.b\n" + "addvl a_ptr1, a_ptr1, #1\n" "zip1 z9.b, z8.b, z9.b\n" "ld1b z8.b, p4/z, [%[b_ptr3]]\n" "sdot z16.s, z12.b, z0.b[1]\n" @@ -2242,19 +2255,20 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l "1:\n" "zip2 z15.b, z12.b, z13.b\n" "zip1 z13.b, z12.b, z13.b\n" - "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "cbz %[regs], 3f\n" "sdot z16.s, z8.b, z0.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "sdot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" "sdot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "sdot z17.s, z9.b, z0.b[0]\n" "ld1rqb z6.b, p7/z, [a_ptr2]\n" "zip2 z8.b, z14.b, z12.b\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip1 z14.b, z14.b, z12.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" - "sdot z17.s, z9.b, z0.b[0]\n" + "sdot z21.s, z9.b, z1.b[0]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip1 z12.b, z13.b, z14.b\n" "add %[b_ptr3], %[b_ptr3], %[ldb]\n" @@ -2262,13 +2276,12 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l "zip1 z14.b, z15.b, z8.b\n" "zip2 z15.b, z15.b, z8.b\n" "ld1b z8.b, p4/z, [%[b_ptr0]]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "sdot z25.s, z9.b, z2.b[0]\n" "ld1b z9.b, p4/z, [%[b_ptr2]]\n" "sdot z18.s, z10.b, z0.b[0]\n" - "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "sdot z22.s, z10.b, z1.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "sdot z26.s, z10.b, z2.b[0]\n" "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "sdot z19.s, z11.b, z0.b[0]\n" @@ -2733,16 +2746,18 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l "b 7f\n" "3:\n" "sdot z16.s, z8.b, z0.b[0]\n" - "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "sdot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z5.b, p6/z, [a_ptr1]\n" + "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" "sdot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p6/z, [a_ptr1]\n" + "sdot z17.s, z9.b, z0.b[0]\n" "ld1rqb z6.b, p6/z, [a_ptr2]\n" "zip2 z8.b, z14.b, z12.b\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip1 z14.b, z14.b, z12.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" - "sdot z17.s, z9.b, z0.b[0]\n" + "sdot z21.s, z9.b, z1.b[0]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip1 z12.b, z13.b, z14.b\n" "add %[b_ptr3], %[b_ptr3], %[ldb]\n" @@ -2752,19 +2767,18 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l "addvl a_ptr1, a_ptr1, #1\n" "zip2 z15.b, z15.b, 
z8.b\n" "ld1b z8.b, p4/z, [%[b_ptr0]]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "sdot z25.s, z9.b, z2.b[0]\n" "ld1b z9.b, p4/z, [%[b_ptr2]]\n" "sdot z18.s, z10.b, z0.b[0]\n" - "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "sdot z22.s, z10.b, z1.b[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "sdot z26.s, z10.b, z2.b[0]\n" "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "sdot z19.s, z11.b, z0.b[0]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "sdot z23.s, z11.b, z1.b[0]\n" + "addvl a_ptr2, a_ptr2, #1\n" "sdot z27.s, z11.b, z2.b[0]\n" "zip2 z11.b, z8.b, z9.b\n" "zip1 z9.b, z8.b, z9.b\n" @@ -3469,25 +3483,25 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l "1:\n" "zip2 z15.b, z12.b, z13.b\n" "zip1 z13.b, z12.b, z13.b\n" - "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "cbz %[regs], 3f\n" "sdot z16.s, z8.b, z0.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "sdot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" "sdot z24.s, z8.b, z2.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr2]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" "sdot z28.s, z8.b, z3.b[0]\n" - "ld1rqb z7.b, p7/z, [a_ptr3]\n" + "ld1rqb z6.b, p7/z, [a_ptr2]\n" "zip2 z8.b, z14.b, z12.b\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqb z7.b, p7/z, [a_ptr3]\n" "zip1 z14.b, z14.b, z12.b\n" - "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "sdot z17.s, z9.b, z0.b[0]\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "zip1 z12.b, z13.b, z14.b\n" - "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z13.b, z13.b, z14.b\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" "zip1 z14.b, z15.b, z8.b\n" "zip2 z15.b, z15.b, z8.b\n" "ld1b z8.b, p4/z, [%[b_ptr0]]\n" @@ -4023,38 +4037,39 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l "b 7f\n" "3:\n" "sdot z16.s, z8.b, z0.b[0]\n" - "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "sdot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z5.b, p6/z, [a_ptr1]\n" + "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" "sdot z24.s, z8.b, z2.b[0]\n" - "ld1rqb z6.b, p6/z, [a_ptr2]\n" + "ld1rqb z5.b, p6/z, [a_ptr1]\n" "sdot z28.s, z8.b, z3.b[0]\n" - "ld1rqb z7.b, p6/z, [a_ptr3]\n" + "ld1rqb z6.b, p6/z, [a_ptr2]\n" "zip2 z8.b, z14.b, z12.b\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqb z7.b, p6/z, [a_ptr3]\n" "zip1 z14.b, z14.b, z12.b\n" - "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "sdot z17.s, z9.b, z0.b[0]\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "zip1 z12.b, z13.b, z14.b\n" - "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z13.b, z13.b, z14.b\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" "zip1 z14.b, z15.b, z8.b\n" - "addvl a_ptr1, a_ptr1, #1\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" "zip2 z15.b, z15.b, z8.b\n" "ld1b z8.b, p4/z, [%[b_ptr0]]\n" "sdot z21.s, z9.b, z1.b[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "sdot z25.s, z9.b, z2.b[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" + "addvl a_ptr1, a_ptr1, #1\n" "sdot z29.s, z9.b, z3.b[0]\n" "ld1b z9.b, p4/z, [%[b_ptr2]]\n" "sdot z18.s, z10.b, z0.b[0]\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "sdot z22.s, z10.b, z1.b[0]\n" - "addvl a_ptr3, a_ptr3, #1\n" + "addvl a_ptr2, a_ptr2, #1\n" "sdot z26.s, z10.b, z2.b[0]\n" + "addvl a_ptr3, a_ptr3, #1\n" "sdot z30.s, z10.b, z3.b[0]\n" 
"ld1b z10.b, p4/z, [%[b_ptr1]]\n" "sdot z19.s, z11.b, z0.b[0]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp index 33e3ac6c23..043fa7484a 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -78,7 +78,10 @@ public: // Default to the generic kernel kern_type kernel=sve_native_u8u32_dot_4VLx4; - native_u8u32_dot_4VLx4(const CPUInfo *ci) { UNUSED(ci); } + native_u8u32_dot_4VLx4(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp index 639ca5765c..bbc1092e4e 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp @@ -32,7 +32,7 @@ namespace arm_gemm { -void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int ldb, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool append) { +void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int ldb, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool append) { const long loops_count = ((K + 16) / 32) - 1; K -= loops_count * 32; const long regs_count = (K / 16) - 1; @@ -41,12 +41,23 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int const long blocks_count = K / 4; const long odds_count = K - (blocks_count * 4); - for (int y=0; y 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0())) { const long width = std::min((unsigned long)N-x0, (4 * get_vector_length())); long loops = loops_count; @@ -62,7 +73,7 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int long ldbb = ldb * sizeof(uint8_t) * 4; const unsigned long ldcb = ldc * sizeof(uint32_t); - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "mov z16.s, #0\n" @@ -270,22 +281,22 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int "ld1b z12.b, p4/z, [%[b_ptr0]]\n" "zip2 z15.b, z12.b, z13.b\n" "zip1 z13.b, z12.b, z13.b\n" - "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "cbz %[regs], 3f\n" "udot z16.s, z8.b, z0.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "udot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "udot z18.s, z10.b, z0.b[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip2 z8.b, z14.b, z12.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "zip1 z14.b, z14.b, z12.b\n" "ld1b z9.b, p4/z, [%[b_ptr2]]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "udot z19.s, z11.b, z0.b[0]\n" - "ld1b z10.b, p4/z, [%[b_ptr1]]\n" - "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "zip2 z13.b, z13.b, z14.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "zip1 z14.b, z15.b, z8.b\n" @@ -635,33 +646,34 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int "b 7f\n" "3:\n" "udot z16.s, z8.b, z0.b[0]\n" + "ld1b 
z12.b, p4/z, [%[b_ptr3]]\n" + "udot z17.s, z9.b, z0.b[0]\n" "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" - "zip2 z8.b, z14.b, z12.b\n" + "udot z18.s, z10.b, z0.b[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "zip1 z14.b, z14.b, z12.b\n" + "zip2 z8.b, z14.b, z12.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" - "udot z17.s, z9.b, z0.b[0]\n" + "zip1 z14.b, z14.b, z12.b\n" "ld1b z9.b, p4/z, [%[b_ptr2]]\n" - "udot z18.s, z10.b, z0.b[0]\n" + "udot z19.s, z11.b, z0.b[0]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" "zip1 z12.b, z13.b, z14.b\n" "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "zip2 z13.b, z13.b, z14.b\n" - "add %[b_ptr3], %[b_ptr3], %[ldb]\n" - "zip1 z14.b, z15.b, z8.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z15.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z15.b, z15.b, z8.b\n" "ld1b z8.b, p4/z, [%[b_ptr0]]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "udot z16.s, z12.b, z0.b[1]\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z17.s, z13.b, z0.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" "zip2 z11.b, z8.b, z9.b\n" "addvl %[a_ptr0], %[a_ptr0], #1\n" "zip1 z9.b, z8.b, z9.b\n" "ld1b z8.b, p4/z, [%[b_ptr3]]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "ld1b z13.b, p4/z, [%[b_ptr2]]\n" "udot z18.s, z14.b, z0.b[1]\n" "ld1b z14.b, p4/z, [%[b_ptr1]]\n" "udot z19.s, z15.b, z0.b[1]\n" @@ -998,11 +1010,11 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int "udot z21.s, z9.b, z1.b[0]\n" "ld1b z9.b, p4/z, [%[b_ptr2]]\n" "udot z18.s, z10.b, z0.b[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "udot z22.s, z10.b, z1.b[0]\n" "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "udot z19.s, z11.b, z0.b[0]\n" - "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "udot z23.s, z11.b, z1.b[0]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z11.b, z8.b, z9.b\n" @@ -1176,34 +1188,34 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int "1:\n" "zip2 z15.b, z12.b, z13.b\n" "zip1 z13.b, z12.b, z13.b\n" - "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "cbz %[regs], 3f\n" "udot z16.s, z8.b, z0.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "udot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "udot z17.s, z9.b, z0.b[0]\n" "ld1rqb z5.b, p7/z, [a_ptr1]\n" - "zip2 z8.b, z14.b, z12.b\n" + "udot z21.s, z9.b, z1.b[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "zip1 z14.b, z14.b, z12.b\n" + "zip2 z8.b, z14.b, z12.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" - "udot z17.s, z9.b, z0.b[0]\n" + "zip1 z14.b, z14.b, z12.b\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z0.b[0]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "zip1 z12.b, z13.b, z14.b\n" "add %[b_ptr3], %[b_ptr3], %[ldb]\n" "zip2 z13.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "zip1 z14.b, z15.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z15.b, z15.b, z8.b\n" "ld1b z8.b, p4/z, [%[b_ptr0]]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "ld1b z9.b, p4/z, [%[b_ptr2]]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "udot z22.s, z10.b, z1.b[0]\n" - "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "udot z19.s, z11.b, z0.b[0]\n" - "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "udot z23.s, z11.b, z1.b[0]\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z11.b, z8.b, z9.b\n" "zip1 z9.b, z8.b, z9.b\n" "ld1b z8.b, p4/z, 
[%[b_ptr3]]\n" @@ -1604,34 +1616,35 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int "b 7f\n" "3:\n" "udot z16.s, z8.b, z0.b[0]\n" - "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "udot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "udot z17.s, z9.b, z0.b[0]\n" "ld1rqb z5.b, p6/z, [a_ptr1]\n" - "zip2 z8.b, z14.b, z12.b\n" + "udot z21.s, z9.b, z1.b[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "zip1 z14.b, z14.b, z12.b\n" + "zip2 z8.b, z14.b, z12.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" - "udot z17.s, z9.b, z0.b[0]\n" + "zip1 z14.b, z14.b, z12.b\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z0.b[0]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "zip1 z12.b, z13.b, z14.b\n" "add %[b_ptr3], %[b_ptr3], %[ldb]\n" "zip2 z13.b, z13.b, z14.b\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "zip1 z14.b, z15.b, z8.b\n" - "addvl a_ptr1, a_ptr1, #1\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z15.b, z15.b, z8.b\n" "ld1b z8.b, p4/z, [%[b_ptr0]]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "ld1b z9.b, p4/z, [%[b_ptr2]]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "udot z22.s, z10.b, z1.b[0]\n" - "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "udot z19.s, z11.b, z0.b[0]\n" - "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "udot z23.s, z11.b, z1.b[0]\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" "zip2 z11.b, z8.b, z9.b\n" + "addvl a_ptr1, a_ptr1, #1\n" "zip1 z9.b, z8.b, z9.b\n" "ld1b z8.b, p4/z, [%[b_ptr3]]\n" "udot z16.s, z12.b, z0.b[1]\n" @@ -2242,19 +2255,20 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int "1:\n" "zip2 z15.b, z12.b, z13.b\n" "zip1 z13.b, z12.b, z13.b\n" - "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "cbz %[regs], 3f\n" "udot z16.s, z8.b, z0.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "udot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" "udot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "udot z17.s, z9.b, z0.b[0]\n" "ld1rqb z6.b, p7/z, [a_ptr2]\n" "zip2 z8.b, z14.b, z12.b\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip1 z14.b, z14.b, z12.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" - "udot z17.s, z9.b, z0.b[0]\n" + "udot z21.s, z9.b, z1.b[0]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip1 z12.b, z13.b, z14.b\n" "add %[b_ptr3], %[b_ptr3], %[ldb]\n" @@ -2262,13 +2276,12 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int "zip1 z14.b, z15.b, z8.b\n" "zip2 z15.b, z15.b, z8.b\n" "ld1b z8.b, p4/z, [%[b_ptr0]]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "udot z25.s, z9.b, z2.b[0]\n" "ld1b z9.b, p4/z, [%[b_ptr2]]\n" "udot z18.s, z10.b, z0.b[0]\n" - "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "udot z22.s, z10.b, z1.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "udot z26.s, z10.b, z2.b[0]\n" "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "udot z19.s, z11.b, z0.b[0]\n" @@ -2733,16 +2746,18 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int "b 7f\n" "3:\n" "udot z16.s, z8.b, z0.b[0]\n" - "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "udot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z5.b, p6/z, [a_ptr1]\n" + "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" "udot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p6/z, [a_ptr1]\n" 
+ "udot z17.s, z9.b, z0.b[0]\n" "ld1rqb z6.b, p6/z, [a_ptr2]\n" "zip2 z8.b, z14.b, z12.b\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip1 z14.b, z14.b, z12.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" - "udot z17.s, z9.b, z0.b[0]\n" + "udot z21.s, z9.b, z1.b[0]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip1 z12.b, z13.b, z14.b\n" "add %[b_ptr3], %[b_ptr3], %[ldb]\n" @@ -2752,19 +2767,18 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int "addvl a_ptr1, a_ptr1, #1\n" "zip2 z15.b, z15.b, z8.b\n" "ld1b z8.b, p4/z, [%[b_ptr0]]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "udot z25.s, z9.b, z2.b[0]\n" "ld1b z9.b, p4/z, [%[b_ptr2]]\n" "udot z18.s, z10.b, z0.b[0]\n" - "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "udot z22.s, z10.b, z1.b[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "udot z26.s, z10.b, z2.b[0]\n" "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "udot z19.s, z11.b, z0.b[0]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "udot z23.s, z11.b, z1.b[0]\n" + "addvl a_ptr2, a_ptr2, #1\n" "udot z27.s, z11.b, z2.b[0]\n" "zip2 z11.b, z8.b, z9.b\n" "zip1 z9.b, z8.b, z9.b\n" @@ -3469,25 +3483,25 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int "1:\n" "zip2 z15.b, z12.b, z13.b\n" "zip1 z13.b, z12.b, z13.b\n" - "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "cbz %[regs], 3f\n" "udot z16.s, z8.b, z0.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "udot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" "udot z24.s, z8.b, z2.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr2]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" "udot z28.s, z8.b, z3.b[0]\n" - "ld1rqb z7.b, p7/z, [a_ptr3]\n" + "ld1rqb z6.b, p7/z, [a_ptr2]\n" "zip2 z8.b, z14.b, z12.b\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqb z7.b, p7/z, [a_ptr3]\n" "zip1 z14.b, z14.b, z12.b\n" - "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "udot z17.s, z9.b, z0.b[0]\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "zip1 z12.b, z13.b, z14.b\n" - "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z13.b, z13.b, z14.b\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" "zip1 z14.b, z15.b, z8.b\n" "zip2 z15.b, z15.b, z8.b\n" "ld1b z8.b, p4/z, [%[b_ptr0]]\n" @@ -4023,38 +4037,39 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int "b 7f\n" "3:\n" "udot z16.s, z8.b, z0.b[0]\n" - "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "udot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z5.b, p6/z, [a_ptr1]\n" + "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" "udot z24.s, z8.b, z2.b[0]\n" - "ld1rqb z6.b, p6/z, [a_ptr2]\n" + "ld1rqb z5.b, p6/z, [a_ptr1]\n" "udot z28.s, z8.b, z3.b[0]\n" - "ld1rqb z7.b, p6/z, [a_ptr3]\n" + "ld1rqb z6.b, p6/z, [a_ptr2]\n" "zip2 z8.b, z14.b, z12.b\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqb z7.b, p6/z, [a_ptr3]\n" "zip1 z14.b, z14.b, z12.b\n" - "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "udot z17.s, z9.b, z0.b[0]\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "zip1 z12.b, z13.b, z14.b\n" - "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z13.b, z13.b, z14.b\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" "zip1 z14.b, z15.b, z8.b\n" - "addvl a_ptr1, a_ptr1, #1\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" "zip2 z15.b, z15.b, 
z8.b\n" "ld1b z8.b, p4/z, [%[b_ptr0]]\n" "udot z21.s, z9.b, z1.b[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "udot z25.s, z9.b, z2.b[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" + "addvl a_ptr1, a_ptr1, #1\n" "udot z29.s, z9.b, z3.b[0]\n" "ld1b z9.b, p4/z, [%[b_ptr2]]\n" "udot z18.s, z10.b, z0.b[0]\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "udot z22.s, z10.b, z1.b[0]\n" - "addvl a_ptr3, a_ptr3, #1\n" + "addvl a_ptr2, a_ptr2, #1\n" "udot z26.s, z10.b, z2.b[0]\n" + "addvl a_ptr3, a_ptr3, #1\n" "udot z30.s, z10.b, z3.b[0]\n" "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "udot z19.s, z11.b, z0.b[0]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp index 9bee502236..6b070d6d71 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -77,7 +77,10 @@ public: // Default to the generic kernel kern_type kernel=sve_smallK_hybrid_fp32_mla_1VLx8; - smallK_hybrid_fp32_mla_1VLx8(const CPUInfo *ci) { UNUSED(ci); } + smallK_hybrid_fp32_mla_1VLx8(const CPUInfo *ci) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp index fc18cbdbbf..9bc0969bf2 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp @@ -77,7 +77,10 @@ public: // Default to the generic kernel kern_type kernel=sve_smallK_hybrid_s8s32_dot_1VLx8; - smallK_hybrid_s8s32_dot_1VLx8(const CPUInfo *ci) { UNUSED(ci); } + smallK_hybrid_s8s32_dot_1VLx8(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp index 51d3e736ed..cc27c13533 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp @@ -77,7 +77,10 @@ public: // Default to the generic kernel kern_type kernel=sve_smallK_hybrid_u8u32_dot_1VLx8; - smallK_hybrid_u8u32_dot_1VLx8(const CPUInfo *ci) { UNUSED(ci); } + smallK_hybrid_u8u32_dot_1VLx8(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp index 5d8eae4866..a81d4504ae 100644 --- a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -1130,11 +1130,7 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { - const __fp16 *biasptr = nullbias; - if (bias) - { - biasptr = bias + i; - } + const __fp16 *biasptr = bias ? 
bias + i : nullbias; switch(height) { diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_12x8.hpp index 088353e5f3..284f2dc1a0 100644 --- a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_12x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_12x8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,8 +30,8 @@ void MergeResults<12, 8, false>(float *out, const float *in, const int ldout, co { const float *inptr = in; float nullbias[12]; - float minval = - std::numeric_limits::infinity(); - float maxval = std::numeric_limits::infinity(); + float minval = - static_cast(std::numeric_limits::infinity()); + float maxval = static_cast(std::numeric_limits::infinity()); switch(act.type) { @@ -1106,11 +1106,7 @@ void MergeResults<12, 8, false>(float *out, const float *in, const int ldout, co } else { - const float *biasptr = nullbias; - if (bias) - { - biasptr = bias + i; - } + const float *biasptr = bias ? bias + i : nullbias; switch(height) { diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_s32_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_s32_12x8.hpp index 2e45d8b5d1..fcf08e4e15 100644 --- a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_s32_12x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_s32_12x8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,10 +26,8 @@ #ifdef __aarch64__ template<> -void MergeResults<12, 8, false>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t *bias, Activation act, bool append) +void MergeResults<12, 8, false>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t *bias, Activation , bool append) { - UNUSED(act); - const int32_t *inptr = in; int32_t nullbias[12]; @@ -862,11 +860,7 @@ void MergeResults<12, 8, false>(int32_t *out, const int32_t *in, const int ldout } else { - const int32_t *biasptr = nullbias; - if (bias) - { - biasptr = bias + i; - } + const int32_t *biasptr = bias ? bias + i : nullbias; switch(height) { diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_s32_4x4.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_s32_4x4.hpp index 6d869af803..88eaa5f07c 100644 --- a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_s32_4x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_s32_4x4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,10 +26,8 @@ #ifdef __aarch64__ template<> -void MergeResults<4, 4, false>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t *bias, Activation act, bool append) +void MergeResults<4, 4, false>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t *bias, Activation , bool append) { - UNUSED(act); - const int32_t *inptr = in; int32_t nullbias[4]; @@ -240,11 +238,7 @@ void MergeResults<4, 4, false>(int32_t *out, const int32_t *in, const int ldout, } else { - const int32_t *biasptr = nullbias; - if (bias) - { - biasptr = bias + i; - } + const int32_t *biasptr = bias ? 
bias + i : nullbias; switch(height) { diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_u32_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_u32_12x8.hpp index 0a05944102..adc02f19eb 100644 --- a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_u32_12x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_u32_12x8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,10 +26,8 @@ #ifdef __aarch64__ template<> -void MergeResults<12, 8, false>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t *bias, Activation act, bool append) +void MergeResults<12, 8, false>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t *bias, Activation , bool append) { - UNUSED(act); - const uint32_t *inptr = in; uint32_t nullbias[12]; @@ -862,11 +860,7 @@ void MergeResults<12, 8, false>(uint32_t *out, const uint32_t *in, const int ldo } else { - const uint32_t *biasptr = nullbias; - if (bias) - { - biasptr = bias + i; - } + const uint32_t *biasptr = bias ? bias + i : nullbias; switch(height) { diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_u32_4x4.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_u32_4x4.hpp index efb17dc9e9..32e1eebaa4 100644 --- a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_u32_4x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_u32_4x4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,10 +26,8 @@ #ifdef __aarch64__ template<> -void MergeResults<4, 4, false>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t *bias, Activation act, bool append) +void MergeResults<4, 4, false>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t *bias, Activation , bool append) { - UNUSED(act); - const uint32_t *inptr = in; uint32_t nullbias[4]; @@ -240,11 +238,7 @@ void MergeResults<4, 4, false>(uint32_t *out, const uint32_t *in, const int ldou } else { - const uint32_t *biasptr = nullbias; - if (bias) - { - biasptr = bias + i; - } + const uint32_t *biasptr = bias ? bias + i : nullbias; switch(height) { diff --git a/src/core/NEON/kernels/arm_gemm/merges/list.hpp b/src/core/NEON/kernels/arm_gemm/merges/list.hpp index 4edb497967..825c2fd020 100644 --- a/src/core/NEON/kernels/arm_gemm/merges/list.hpp +++ b/src/core/NEON/kernels/arm_gemm/merges/list.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp16_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp16_3VLx8.hpp index a44ef55a86..cf1d10329b 100644 --- a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp16_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp16_3VLx8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -1010,11 +1010,7 @@ void MergeResults<3, 8, true>(__fp16 *out, const __fp16 *in, const int ldout, co } else { - const __fp16 *biasptr = nullbias; - if (bias) - { - biasptr = bias + i; - } + const __fp16 *biasptr = bias ? 
bias + i : nullbias; switch(height) { diff --git a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp index bb073e4868..b0d10c085d 100644 --- a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -1010,11 +1010,7 @@ void MergeResults<3, 8, true>(float *out, const float *in, const int ldout, cons } else { - const float *biasptr = nullbias; - if (bias) - { - biasptr = bias + i; - } + const float *biasptr = bias ? bias + i : nullbias; switch(height) { diff --git a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp index d4c5073f8d..34b6fe3ef5 100644 --- a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,13 +26,12 @@ #ifdef __ARM_FEATURE_SVE template<> -void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t *bias, Activation act, bool append) +void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t *bias, Activation , bool append) { - UNUSED(act); - const int32_t *inptr = in; int32_t nullbias[192]; + if (!append && !bias) { memset(nullbias, 0, (3 * get_vector_length() * sizeof(int32_t))); @@ -765,11 +764,7 @@ void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout, } else { - const int32_t *biasptr = nullbias; - if (bias) - { - biasptr = bias + i; - } + const int32_t *biasptr = bias ? bias + i : nullbias; switch(height) { diff --git a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_u32_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_u32_3VLx8.hpp index f2a28fa004..c4b2bb56d6 100644 --- a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_u32_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_u32_3VLx8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,13 +26,12 @@ #ifdef __ARM_FEATURE_SVE template<> -void MergeResults<3, 8, true>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t *bias, Activation act, bool append) +void MergeResults<3, 8, true>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t *bias, Activation , bool append) { - UNUSED(act); - const uint32_t *inptr = in; uint32_t nullbias[192]; + if (!append && !bias) { memset(nullbias, 0, (3 * get_vector_length() * sizeof(uint32_t))); @@ -765,11 +764,7 @@ void MergeResults<3, 8, true>(uint32_t *out, const uint32_t *in, const int ldout } else { - const uint32_t *biasptr = nullbias; - if (bias) - { - biasptr = bias + i; - } + const uint32_t *biasptr = bias ? 
bias + i : nullbias; switch(height) { diff --git a/src/core/NEON/kernels/arm_gemm/misc.cpp b/src/core/NEON/kernels/arm_gemm/misc.cpp index 6758a88c65..a1892dc8d5 100644 --- a/src/core/NEON/kernels/arm_gemm/misc.cpp +++ b/src/core/NEON/kernels/arm_gemm/misc.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -21,6 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ + #ifndef NO_MULTI_THREADING #include #endif diff --git a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp index 995716575a..5c58c585d7 100644 --- a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp +++ b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -149,7 +149,7 @@ public: } ndrange_t get_window_size() const override { - return _subgemm->get_window_size(); + return { _subgemm->get_window_size() }; } void set_nthreads(int nthreads) override { @@ -158,8 +158,7 @@ public: _args._maxthreads = nthreads; } - // Execute - void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { + void execute(const ndcoord_t &work_range, const ndcoord_t &thread_locator, int threadid) override { _subgemm->execute(work_range, thread_locator, threadid); if (!_args._pretransposed_hint) { col_sums_runtime(threadid); diff --git a/src/core/NEON/kernels/arm_gemm/quantized.cpp b/src/core/NEON/kernels/arm_gemm/quantized.cpp index 53e5527a8d..fbf49c8a31 100644 --- a/src/core/NEON/kernels/arm_gemm/quantized.cpp +++ b/src/core/NEON/kernels/arm_gemm/quantized.cpp @@ -24,6 +24,7 @@ #ifdef __aarch64__ #include "arm_gemm.hpp" +#include "utils.hpp" #include @@ -283,7 +284,6 @@ void requantize_block_32_int(const Requantize32 &qp, unsigned int width, unsigne v_mul0=v_mul; v_shf0=v_shift; } - // Load column pointers int32x4_t v_col0 = vld1q_s32(colptr); colptr += 4; @@ -604,7 +604,6 @@ namespace { * that the terms can simply be added in the requantize code. * */ switch (rows) { - default: case 1: /* If we only have one output, just use ADDV. 
Multiply * the offset into all four components separately so it @@ -646,6 +645,9 @@ namespace { vst1q_s32(row_bias, t0); break; + default: + UNREACHABLE("Impossible."); + } } @@ -836,7 +838,6 @@ void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int h if (numcols==16) { switch(numrows) { - default: case 1: add_block<1>(input + row * in_stride + col, in_stride, col_bias + col); break; @@ -852,6 +853,9 @@ void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int h case 4: add_block<4>(input + row * in_stride + col, in_stride, col_bias + col); break; + + default: + UNREACHABLE("Impossible."); } } else { for (; col void PrepareA(TOperand *out, const TIn *in, const int stride, const int y0, - const int ymax, const int k0, const int kmax, bool transposed) { + const int ymax, const int k0, const int kmax, bool transposed) const { if (transposed) { Transform(out, in, stride, y0, ymax, k0, kmax); } else { @@ -55,7 +55,7 @@ public: template void PrepareB(TOperand *out, const TIn *in, const int stride, const int x0, - const int xmax, const int k0, const int kmax, bool transposed) { + const int xmax, const int k0, const int kmax, bool transposed) const { if (transposed) { Transform(out, in, stride, x0, xmax, k0, kmax); } else { @@ -64,7 +64,7 @@ public: } template - void Merge(TOut *out, const TResult *in, int stride, int y0, int ymax, int x0, int xmax, const TOut *bias, const Activation act, bool append) { + void Merge(TOut *out, const TResult *in, int stride, int y0, int ymax, int x0, int xmax, const TOut *bias, const Activation act, bool append) const { MergeResults(out, in, stride, y0, ymax, x0, xmax, bias, act, append); } }; diff --git a/src/core/NEON/kernels/arm_gemm/transform.hpp b/src/core/NEON/kernels/arm_gemm/transform.hpp index bdae90300b..c6ea079882 100644 --- a/src/core/NEON/kernels/arm_gemm/transform.hpp +++ b/src/core/NEON/kernels/arm_gemm/transform.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp index 543664bb0e..5e5f65183c 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -59,7 +59,6 @@ inline void TransformImpl<6, 1, false, 4, 4, false>::Transform(T *out, const T * /* 'first' forces this to always run at least once, needed if the total size is <=7. */ if ((y + 5) >= ymax) { switch ((y + 5) - ymax) { - /* Everything falls through in here */ case 4: inptr1 = zerobuff; // fall through diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp index 6b742c8776..9b6f4de543 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -38,8 +38,8 @@ void TransformImpl<4, 16, false, 1, 1, false>::Transform(T *out, const T *in, in uint8_t zerobuff[16] = { 0 }; - for (uint64_t y = y0 ; y < static_cast(ymax) ; y+=4) { - const uint8_t *inptr0 = inptr + y * ldin + k0; + for (int y=y0; y(y) * ldin + k0; const uint8_t *inptr1 = inptr0 + ldin; const uint8_t *inptr2 = inptr1 + ldin; const uint8_t *inptr3 = inptr2 + ldin; @@ -52,9 +52,8 @@ void TransformImpl<4, 16, false, 1, 1, false>::Transform(T *out, const T *in, in int x=(kmax-k0); for (;x>15;x-=16) { /* Cope with ragged cases by copying from a buffer of zeroes instead */ - if ((y + 3) >= static_cast(ymax)) { + if ((y + 3) >= ymax) { switch ((y + 3) - ymax) { - /* Everything falls through in here */ case 2: inptr1 = zerobuff; // fall through @@ -90,9 +89,8 @@ void TransformImpl<4, 16, false, 1, 1, false>::Transform(T *out, const T *in, in if (x>0) { /* Need to duplicate this here, in case we didn't run the main loop. */ - if ((y + 3) >= static_cast(ymax)) { + if ((y + 3) >= ymax) { switch ((y + 3) - ymax) { - /* Everything falls through in here */ case 2: inptr1 = zerobuff; // fall through diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp index 80dd6c5e25..3d912c4675 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -63,7 +63,6 @@ void TransformImpl<8, 1, false, 2, 2, false>::Transform(T *out, const T *in, int /* 'first' forces this to always run at least once, needed if the total size is <=7. */ if ((y + 7) >= ymax) { switch ((y + 7) - ymax) { - /* Everything falls through in here */ case 6: inptr1 = zerobuff; // fall through diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp index 9dfc1346e6..701d688af2 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -63,7 +63,6 @@ inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T * /* 'first' forces this to always run at least once, needed if the total size is <=7. */ if ((y + 7) >= ymax) { switch ((y + 7) - ymax) { - /* Everything falls through in here */ case 6: inptr1 = zerobuff; // fall through diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_block4_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_block4_8bit.hpp index 2bc7801b15..2546cc571a 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_block4_8bit.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_block4_8bit.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -60,7 +60,7 @@ inline void TransformImpl<8, 4, false, 1, 1, false>::Transform(T *out, const T * } }; - uint8_t zerobuff[64]; // 32 for asm loop plus up to 31 for overflow loop + uint8_t zerobuff[64] = { 0 }; // 32 for asm loop plus up to 31 for overflow loop for (int y=y0; y::Transform(T *out, const T * /* 'first' forces this to always run at least once, needed if the total size is <=32. */ if ((y + 7) >= ymax) { switch ((y + 7) - ymax) { - /* Everything falls through in here */ case 6: inptr1 = zerobuff; // fall through diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp index bde3274926..a342d6c3d1 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -63,7 +63,6 @@ inline void TransformImpl<8, 1, false, 4, 2, false>::Transform(float *out, const /* 'first' forces this to always run at least once, needed if the total size is <=7. */ if ((y + 7) >= ymax) { switch ((y + 7) - ymax) { - /* Everything falls through in here */ case 6: inptr1 = zerobuff; // fall through diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp index 8992c1010d..d7de9ff934 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp index 6d627334cd..a137f9360a 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp index be66cd42ff..2c698b2576 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -34,6 +34,7 @@ #include "a64_transpose_interleave_8way_32bit.hpp" #include "sve_interleave_8way_32bit.hpp" #include "sve_interleave_8way_block2_16bit.hpp" +#include "sve_interleave_8way_block2_32bit.hpp" #include "sve_interleave_8way_block4_16bit.hpp" #include "sve_interleave_8way_block4_8bit.hpp" #include "sve_interleave_8way_block8_8bit.hpp" diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp index 881dc7bb72..348d78e3f5 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2019 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_32bit.hpp index 4cc4311cee..f21933b8de 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_32bit.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_32bit.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -36,12 +36,12 @@ inline void TransformImpl<8, 2, false, 4, 4, false>::Transform(T *out, const T * { const int height = ymax-y; const long inwidth = (kmax - k0); - const long outwidth = (inwidth * 8 + 1) / 2; + const long outwidth = ((inwidth + 1) / 2) * 16; long inpos = 0; long outpos = 0; uint32_t *outptr = master_outptr; - master_outptr += (outwidth * 2); + master_outptr += outwidth; const uint32_t *inptr0 = inptr + y * ldin + k0; const uint32_t *inptr1 = inptr0 + ldin; @@ -60,571 +60,535 @@ inline void TransformImpl<8, 2, false, 4, 4, false>::Transform(T *out, const T * "whilelt p0.s, %[inpos], %[inwidth]\n" "b.none 2f\n" "mov z4.s, #0\n" - "ld1w z0.s, p0/z, [%[inptr0]]\n" - "zip1 z8.d, z0.d, z4.d\n" + "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n" "incw %[inpos], all, mul #1\n" + "whilelt p0.s, %[outpos], %[outwidth]\n" + "incw %[outpos], all, mul #1\n" + "zip1 z8.d, z0.d, z4.d\n" "zip2 z9.d, z0.d, z4.d\n" - "addvl %[inptr0], %[inptr0], #1\n" + "whilelt p1.s, %[outpos], %[outwidth]\n" "zip1 z0.d, z8.d, z4.d\n" - "whilelt p0.d, %[outpos], %[outwidth]\n" + "incw %[outpos], all, mul #1\n" "zip2 z1.d, z8.d, z4.d\n" - "incd %[outpos], all, mul #1\n" "zip1 z2.d, z9.d, z4.d\n" - "whilelt p1.d, %[outpos], %[outwidth]\n" "zip2 z3.d, z9.d, z4.d\n" - "incd %[outpos], all, mul #1\n" + "whilelt p2.s, %[outpos], %[outwidth]\n" "zip1 z8.d, z0.d, z4.d\n" - "st1d z8.d, p0, [%[outptr]]\n" + "incw %[outpos], all, mul #1\n" "zip2 z9.d, z0.d, z4.d\n" - "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n" "zip1 z10.d, z1.d, z4.d\n" - "whilelt p2.d, %[outpos], %[outwidth]\n" + "st1w z8.s, p0, [%[outptr]]\n" "zip2 z11.d, z1.d, z4.d\n" - "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n" + "whilelt p3.s, %[outpos], %[outwidth]\n" "zip1 z12.d, z2.d, z4.d\n" - "incd %[outpos], all, mul #1\n" + "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n" "zip2 z13.d, z2.d, z4.d\n" - "whilelt p3.d, %[outpos], %[outwidth]\n" + "incw %[outpos], all, mul #1\n" "zip1 z14.d, z3.d, z4.d\n" - "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n" + "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n" "zip2 z15.d, z3.d, z4.d\n" - "incd %[outpos], all, mul #1\n" - "whilelt p0.d, %[outpos], %[outwidth]\n" - "st1d z12.d, p0, 
[%[outptr], #4, MUL VL]\n" - "incd %[outpos], all, mul #1\n" - "whilelt p1.d, %[outpos], %[outwidth]\n" - "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n" - "incd %[outpos], all, mul #1\n" - "whilelt p2.d, %[outpos], %[outwidth]\n" - "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n" - "incd %[outpos], all, mul #1\n" - "whilelt p3.d, %[outpos], %[outwidth]\n" - "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n" - "incd %[outpos], all, mul #1\n" + "whilelt p4.s, %[outpos], %[outwidth]\n" + "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n" + "incw %[outpos], all, mul #1\n" + "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n" + "whilelt p5.s, %[outpos], %[outwidth]\n" + "incw %[outpos], all, mul #1\n" + "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n" + "whilelt p6.s, %[outpos], %[outwidth]\n" + "incw %[outpos], all, mul #1\n" + "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n" + "whilelt p7.s, %[outpos], %[outwidth]\n" + "incw %[outpos], all, mul #1\n" + "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n" "addvl %[outptr], %[outptr], #8\n" "b 1b\n" "2:\n" : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0) : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth) - : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" + : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" ); break; - + case 2: __asm __volatile( "1:\n" "whilelt p0.s, %[inpos], %[inwidth]\n" "b.none 2f\n" "mov z4.s, #0\n" - "ld1w z0.s, p0/z, [%[inptr0]]\n" + "mov z14.s, #0\n" + "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n" + "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n" + "incw %[inpos], all, mul #1\n" + "whilelt p0.s, %[outpos], %[outwidth]\n" "zip1 z8.d, z0.d, z4.d\n" - "ld1w z1.s, p0/z, [%[inptr1]]\n" + "incw %[outpos], all, mul #1\n" "zip2 z9.d, z0.d, z4.d\n" - "incw %[inpos], all, mul #1\n" "zip1 z10.d, z1.d, z4.d\n" - "addvl %[inptr0], %[inptr0], #1\n" "zip2 z11.d, z1.d, z4.d\n" - "addvl %[inptr1], %[inptr1], #1\n" + "whilelt p1.s, %[outpos], %[outwidth]\n" "zip1 z0.d, z8.d, z4.d\n" - "whilelt p0.d, %[outpos], %[outwidth]\n" + "incw %[outpos], all, mul #1\n" "zip2 z1.d, z8.d, z4.d\n" - "incd %[outpos], all, mul #1\n" "zip1 z2.d, z9.d, z4.d\n" - "whilelt p1.d, %[outpos], %[outwidth]\n" "zip2 z3.d, z9.d, z4.d\n" - "incd %[outpos], all, mul #1\n" - "mov z14.s, #0\n" - "whilelt p2.d, %[outpos], %[outwidth]\n" + "whilelt p2.s, %[outpos], %[outwidth]\n" "zip1 z4.d, z10.d, z14.d\n" - "incd %[outpos], all, mul #1\n" + "incw %[outpos], all, mul #1\n" "zip2 z5.d, z10.d, z14.d\n" - "whilelt p3.d, %[outpos], %[outwidth]\n" "zip1 z6.d, z11.d, z14.d\n" - "incd %[outpos], all, mul #1\n" "zip2 z7.d, z11.d, z14.d\n" + "whilelt p3.s, %[outpos], %[outwidth]\n" "zip1 z8.d, z0.d, z4.d\n" - "st1d z8.d, p0, [%[outptr]]\n" + "incw %[outpos], all, mul #1\n" "zip2 z9.d, z0.d, z4.d\n" - "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n" "zip1 z10.d, z1.d, z5.d\n" - "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n" + "st1w z8.s, p0, [%[outptr]]\n" "zip2 z11.d, z1.d, z5.d\n" - "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n" + "whilelt p4.s, %[outpos], %[outwidth]\n" "zip1 z12.d, z2.d, z6.d\n" - "whilelt p0.d, %[outpos], %[outwidth]\n" + "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n" "zip2 z13.d, z2.d, z6.d\n" - "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n" + "incw %[outpos], all, mul #1\n" "zip1 z14.d, z3.d, z7.d\n" - "incd %[outpos], all, mul #1\n" + "st1w z10.s, p2, 
[%[outptr], #2, MUL VL]\n" "zip2 z15.d, z3.d, z7.d\n" - "whilelt p1.d, %[outpos], %[outwidth]\n" - "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n" - "incd %[outpos], all, mul #1\n" - "whilelt p2.d, %[outpos], %[outwidth]\n" - "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n" - "incd %[outpos], all, mul #1\n" - "whilelt p3.d, %[outpos], %[outwidth]\n" - "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n" - "incd %[outpos], all, mul #1\n" + "whilelt p5.s, %[outpos], %[outwidth]\n" + "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n" + "incw %[outpos], all, mul #1\n" + "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n" + "whilelt p6.s, %[outpos], %[outwidth]\n" + "incw %[outpos], all, mul #1\n" + "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n" + "whilelt p7.s, %[outpos], %[outwidth]\n" + "incw %[outpos], all, mul #1\n" + "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n" + "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n" "addvl %[outptr], %[outptr], #8\n" "b 1b\n" "2:\n" : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1) : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth) - : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" + : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" ); break; - + case 3: __asm __volatile( "1:\n" "whilelt p0.s, %[inpos], %[inwidth]\n" "b.none 2f\n" "mov z4.s, #0\n" - "ld1w z0.s, p0/z, [%[inptr0]]\n" + "mov z14.s, #0\n" + "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n" + "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n" + "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n" + "incw %[inpos], all, mul #1\n" "zip1 z8.d, z0.d, z4.d\n" - "ld1w z1.s, p0/z, [%[inptr1]]\n" + "whilelt p0.s, %[outpos], %[outwidth]\n" "zip2 z9.d, z0.d, z4.d\n" - "ld1w z2.s, p0/z, [%[inptr2]]\n" + "incw %[outpos], all, mul #1\n" "zip1 z10.d, z1.d, z4.d\n" - "incw %[inpos], all, mul #1\n" "zip2 z11.d, z1.d, z4.d\n" - "addvl %[inptr0], %[inptr0], #1\n" "zip1 z12.d, z2.d, z4.d\n" - "addvl %[inptr1], %[inptr1], #1\n" + "whilelt p1.s, %[outpos], %[outwidth]\n" "zip2 z13.d, z2.d, z4.d\n" - "addvl %[inptr2], %[inptr2], #1\n" + "incw %[outpos], all, mul #1\n" "zip1 z0.d, z8.d, z12.d\n" - "whilelt p0.d, %[outpos], %[outwidth]\n" "zip2 z1.d, z8.d, z12.d\n" - "incd %[outpos], all, mul #1\n" "zip1 z2.d, z9.d, z13.d\n" - "whilelt p1.d, %[outpos], %[outwidth]\n" + "whilelt p2.s, %[outpos], %[outwidth]\n" "zip2 z3.d, z9.d, z13.d\n" - "incd %[outpos], all, mul #1\n" - "mov z14.s, #0\n" - "whilelt p2.d, %[outpos], %[outwidth]\n" + "incw %[outpos], all, mul #1\n" "zip1 z4.d, z10.d, z14.d\n" - "incd %[outpos], all, mul #1\n" "zip2 z5.d, z10.d, z14.d\n" - "whilelt p3.d, %[outpos], %[outwidth]\n" "zip1 z6.d, z11.d, z14.d\n" - "incd %[outpos], all, mul #1\n" + "whilelt p3.s, %[outpos], %[outwidth]\n" "zip2 z7.d, z11.d, z14.d\n" + "incw %[outpos], all, mul #1\n" "zip1 z8.d, z0.d, z4.d\n" - "st1d z8.d, p0, [%[outptr]]\n" "zip2 z9.d, z0.d, z4.d\n" - "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n" "zip1 z10.d, z1.d, z5.d\n" - "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n" + "whilelt p4.s, %[outpos], %[outwidth]\n" "zip2 z11.d, z1.d, z5.d\n" - "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n" + "st1w z8.s, p0, [%[outptr]]\n" "zip1 z12.d, z2.d, z6.d\n" - "whilelt p0.d, %[outpos], %[outwidth]\n" + "incw %[outpos], all, mul #1\n" "zip2 z13.d, z2.d, z6.d\n" - "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n" + "st1w 
z9.s, p1, [%[outptr], #1, MUL VL]\n" "zip1 z14.d, z3.d, z7.d\n" - "incd %[outpos], all, mul #1\n" "zip2 z15.d, z3.d, z7.d\n" - "whilelt p1.d, %[outpos], %[outwidth]\n" - "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n" - "incd %[outpos], all, mul #1\n" - "whilelt p2.d, %[outpos], %[outwidth]\n" - "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n" - "incd %[outpos], all, mul #1\n" - "whilelt p3.d, %[outpos], %[outwidth]\n" - "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n" - "incd %[outpos], all, mul #1\n" + "whilelt p5.s, %[outpos], %[outwidth]\n" + "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n" + "incw %[outpos], all, mul #1\n" + "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n" + "whilelt p6.s, %[outpos], %[outwidth]\n" + "incw %[outpos], all, mul #1\n" + "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n" + "whilelt p7.s, %[outpos], %[outwidth]\n" + "incw %[outpos], all, mul #1\n" + "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n" + "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n" + "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n" "addvl %[outptr], %[outptr], #8\n" "b 1b\n" "2:\n" : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2) : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth) - : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" + : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" ); break; - + case 4: __asm __volatile( "1:\n" "whilelt p0.s, %[inpos], %[inwidth]\n" "b.none 2f\n" "mov z4.s, #0\n" - "ld1w z0.s, p0/z, [%[inptr0]]\n" + "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n" + "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n" + "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n" + "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n" + "incw %[inpos], all, mul #1\n" "zip1 z8.d, z0.d, z4.d\n" - "ld1w z1.s, p0/z, [%[inptr1]]\n" + "whilelt p0.s, %[outpos], %[outwidth]\n" "zip2 z9.d, z0.d, z4.d\n" - "ld1w z2.s, p0/z, [%[inptr2]]\n" + "incw %[outpos], all, mul #1\n" "zip1 z10.d, z1.d, z4.d\n" - "ld1w z3.s, p0/z, [%[inptr3]]\n" "zip2 z11.d, z1.d, z4.d\n" - "incw %[inpos], all, mul #1\n" "zip1 z12.d, z2.d, z4.d\n" - "addvl %[inptr0], %[inptr0], #1\n" + "whilelt p1.s, %[outpos], %[outwidth]\n" "zip2 z13.d, z2.d, z4.d\n" - "addvl %[inptr1], %[inptr1], #1\n" + "incw %[outpos], all, mul #1\n" "zip1 z14.d, z3.d, z4.d\n" - "addvl %[inptr2], %[inptr2], #1\n" "zip2 z15.d, z3.d, z4.d\n" - "addvl %[inptr3], %[inptr3], #1\n" "zip1 z0.d, z8.d, z12.d\n" - "whilelt p0.d, %[outpos], %[outwidth]\n" + "whilelt p2.s, %[outpos], %[outwidth]\n" "zip2 z1.d, z8.d, z12.d\n" - "incd %[outpos], all, mul #1\n" + "incw %[outpos], all, mul #1\n" "zip1 z2.d, z9.d, z13.d\n" - "whilelt p1.d, %[outpos], %[outwidth]\n" "zip2 z3.d, z9.d, z13.d\n" - "incd %[outpos], all, mul #1\n" "zip1 z4.d, z10.d, z14.d\n" - "whilelt p2.d, %[outpos], %[outwidth]\n" + "whilelt p3.s, %[outpos], %[outwidth]\n" "zip2 z5.d, z10.d, z14.d\n" - "incd %[outpos], all, mul #1\n" + "incw %[outpos], all, mul #1\n" "zip1 z6.d, z11.d, z15.d\n" - "whilelt p3.d, %[outpos], %[outwidth]\n" "zip2 z7.d, z11.d, z15.d\n" - "incd %[outpos], all, mul #1\n" "zip1 z8.d, z0.d, z4.d\n" - "st1d z8.d, p0, [%[outptr]]\n" + "whilelt p4.s, %[outpos], %[outwidth]\n" "zip2 z9.d, z0.d, z4.d\n" - "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n" + "incw %[outpos], all, mul #1\n" "zip1 z10.d, z1.d, z5.d\n" - "st1d z10.d, p2, 
[%[outptr], #2, MUL VL]\n" + "st1w z8.s, p0, [%[outptr]]\n" "zip2 z11.d, z1.d, z5.d\n" - "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n" "zip1 z12.d, z2.d, z6.d\n" - "whilelt p0.d, %[outpos], %[outwidth]\n" + "whilelt p5.s, %[outpos], %[outwidth]\n" "zip2 z13.d, z2.d, z6.d\n" - "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n" + "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n" "zip1 z14.d, z3.d, z7.d\n" - "incd %[outpos], all, mul #1\n" + "incw %[outpos], all, mul #1\n" "zip2 z15.d, z3.d, z7.d\n" - "whilelt p1.d, %[outpos], %[outwidth]\n" - "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n" - "incd %[outpos], all, mul #1\n" - "whilelt p2.d, %[outpos], %[outwidth]\n" - "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n" - "incd %[outpos], all, mul #1\n" - "whilelt p3.d, %[outpos], %[outwidth]\n" - "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n" - "incd %[outpos], all, mul #1\n" + "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n" + "whilelt p6.s, %[outpos], %[outwidth]\n" + "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n" + "incw %[outpos], all, mul #1\n" + "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n" + "whilelt p7.s, %[outpos], %[outwidth]\n" + "incw %[outpos], all, mul #1\n" + "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n" + "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n" + "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n" "addvl %[outptr], %[outptr], #8\n" "b 1b\n" "2:\n" : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3) : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth) - : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" + : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" ); break; - + case 5: __asm __volatile( "1:\n" "whilelt p0.s, %[inpos], %[inwidth]\n" "b.none 2f\n" "mov z5.s, #0\n" - "ld1w z0.s, p0/z, [%[inptr0]]\n" - "ld1w z1.s, p0/z, [%[inptr1]]\n" + "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n" + "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n" + "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n" + "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n" + "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n" "incw %[inpos], all, mul #1\n" "zip1 z10.d, z1.d, z5.d\n" - "ld1w z2.s, p0/z, [%[inptr2]]\n" - "zip2 z11.d, z1.d, z5.d\n" - "ld1w z3.s, p0/z, [%[inptr3]]\n" - "zip1 z12.d, z2.d, z5.d\n" - "ld1w z4.s, p0/z, [%[inptr4]]\n" + "whilelt p0.s, %[outpos], %[outwidth]\n" "zip1 z8.d, z0.d, z4.d\n" - "addvl %[inptr0], %[inptr0], #1\n" + "incw %[outpos], all, mul #1\n" "zip2 z9.d, z0.d, z4.d\n" - "addvl %[inptr1], %[inptr1], #1\n" + "zip2 z11.d, z1.d, z5.d\n" + "zip1 z12.d, z2.d, z5.d\n" + "whilelt p1.s, %[outpos], %[outwidth]\n" "zip2 z13.d, z2.d, z5.d\n" - "addvl %[inptr2], %[inptr2], #1\n" + "incw %[outpos], all, mul #1\n" "zip1 z14.d, z3.d, z5.d\n" - "addvl %[inptr3], %[inptr3], #1\n" "zip2 z15.d, z3.d, z5.d\n" - "addvl %[inptr4], %[inptr4], #1\n" "zip1 z0.d, z8.d, z12.d\n" - "whilelt p0.d, %[outpos], %[outwidth]\n" + "whilelt p2.s, %[outpos], %[outwidth]\n" "zip2 z1.d, z8.d, z12.d\n" - "incd %[outpos], all, mul #1\n" + "incw %[outpos], all, mul #1\n" "zip1 z2.d, z9.d, z13.d\n" - "whilelt p1.d, %[outpos], %[outwidth]\n" "zip2 z3.d, z9.d, z13.d\n" - "incd %[outpos], all, mul #1\n" "zip1 z4.d, z10.d, z14.d\n" - "whilelt p2.d, %[outpos], %[outwidth]\n" + "whilelt p3.s, %[outpos], %[outwidth]\n" "zip2 z5.d, z10.d, z14.d\n" 
- "incd %[outpos], all, mul #1\n" + "incw %[outpos], all, mul #1\n" "zip1 z6.d, z11.d, z15.d\n" - "whilelt p3.d, %[outpos], %[outwidth]\n" "zip2 z7.d, z11.d, z15.d\n" - "incd %[outpos], all, mul #1\n" "zip1 z8.d, z0.d, z4.d\n" - "st1d z8.d, p0, [%[outptr]]\n" + "whilelt p4.s, %[outpos], %[outwidth]\n" "zip2 z9.d, z0.d, z4.d\n" - "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n" + "incw %[outpos], all, mul #1\n" "zip1 z10.d, z1.d, z5.d\n" - "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n" + "st1w z8.s, p0, [%[outptr]]\n" "zip2 z11.d, z1.d, z5.d\n" - "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n" "zip1 z12.d, z2.d, z6.d\n" - "whilelt p0.d, %[outpos], %[outwidth]\n" + "whilelt p5.s, %[outpos], %[outwidth]\n" "zip2 z13.d, z2.d, z6.d\n" - "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n" + "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n" "zip1 z14.d, z3.d, z7.d\n" - "incd %[outpos], all, mul #1\n" + "incw %[outpos], all, mul #1\n" "zip2 z15.d, z3.d, z7.d\n" - "whilelt p1.d, %[outpos], %[outwidth]\n" - "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n" - "incd %[outpos], all, mul #1\n" - "whilelt p2.d, %[outpos], %[outwidth]\n" - "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n" - "incd %[outpos], all, mul #1\n" - "whilelt p3.d, %[outpos], %[outwidth]\n" - "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n" - "incd %[outpos], all, mul #1\n" + "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n" + "whilelt p6.s, %[outpos], %[outwidth]\n" + "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n" + "incw %[outpos], all, mul #1\n" + "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n" + "whilelt p7.s, %[outpos], %[outwidth]\n" + "incw %[outpos], all, mul #1\n" + "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n" + "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n" + "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n" "addvl %[outptr], %[outptr], #8\n" "b 1b\n" "2:\n" : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4) : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth) - : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" + : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" ); break; - + case 6: __asm __volatile( "1:\n" "whilelt p0.s, %[inpos], %[inwidth]\n" "b.none 2f\n" "mov z6.s, #0\n" - "ld1w z0.s, p0/z, [%[inptr0]]\n" - "ld1w z1.s, p0/z, [%[inptr1]]\n" + "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n" + "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n" + "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n" + "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n" + "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n" + "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n" "incw %[inpos], all, mul #1\n" - "ld1w z2.s, p0/z, [%[inptr2]]\n" - "addvl %[inptr0], %[inptr0], #1\n" "zip1 z12.d, z2.d, z6.d\n" - "ld1w z3.s, p0/z, [%[inptr3]]\n" - "zip2 z13.d, z2.d, z6.d\n" - "ld1w z4.s, p0/z, [%[inptr4]]\n" + "whilelt p0.s, %[outpos], %[outwidth]\n" "zip1 z8.d, z0.d, z4.d\n" - "ld1w z5.s, p0/z, [%[inptr5]]\n" + "incw %[outpos], all, mul #1\n" "zip2 z9.d, z0.d, z4.d\n" - "addvl %[inptr1], %[inptr1], #1\n" "zip1 z10.d, z1.d, z5.d\n" - "addvl %[inptr2], %[inptr2], #1\n" "zip2 z11.d, z1.d, z5.d\n" - "addvl %[inptr3], %[inptr3], #1\n" + "whilelt p1.s, %[outpos], %[outwidth]\n" + "zip2 z13.d, z2.d, z6.d\n" + "incw %[outpos], all, mul #1\n" "zip1 z14.d, z3.d, z6.d\n" - "addvl 
%[inptr4], %[inptr4], #1\n" "zip2 z15.d, z3.d, z6.d\n" - "addvl %[inptr5], %[inptr5], #1\n" "zip1 z0.d, z8.d, z12.d\n" - "whilelt p0.d, %[outpos], %[outwidth]\n" + "whilelt p2.s, %[outpos], %[outwidth]\n" "zip2 z1.d, z8.d, z12.d\n" - "incd %[outpos], all, mul #1\n" + "incw %[outpos], all, mul #1\n" "zip1 z2.d, z9.d, z13.d\n" - "whilelt p1.d, %[outpos], %[outwidth]\n" "zip2 z3.d, z9.d, z13.d\n" - "incd %[outpos], all, mul #1\n" "zip1 z4.d, z10.d, z14.d\n" - "whilelt p2.d, %[outpos], %[outwidth]\n" + "whilelt p3.s, %[outpos], %[outwidth]\n" "zip2 z5.d, z10.d, z14.d\n" - "incd %[outpos], all, mul #1\n" + "incw %[outpos], all, mul #1\n" "zip1 z6.d, z11.d, z15.d\n" - "whilelt p3.d, %[outpos], %[outwidth]\n" "zip2 z7.d, z11.d, z15.d\n" - "incd %[outpos], all, mul #1\n" "zip1 z8.d, z0.d, z4.d\n" - "st1d z8.d, p0, [%[outptr]]\n" + "whilelt p4.s, %[outpos], %[outwidth]\n" "zip2 z9.d, z0.d, z4.d\n" - "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n" + "incw %[outpos], all, mul #1\n" "zip1 z10.d, z1.d, z5.d\n" - "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n" + "st1w z8.s, p0, [%[outptr]]\n" "zip2 z11.d, z1.d, z5.d\n" - "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n" "zip1 z12.d, z2.d, z6.d\n" - "whilelt p0.d, %[outpos], %[outwidth]\n" + "whilelt p5.s, %[outpos], %[outwidth]\n" "zip2 z13.d, z2.d, z6.d\n" - "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n" + "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n" "zip1 z14.d, z3.d, z7.d\n" - "incd %[outpos], all, mul #1\n" + "incw %[outpos], all, mul #1\n" "zip2 z15.d, z3.d, z7.d\n" - "whilelt p1.d, %[outpos], %[outwidth]\n" - "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n" - "incd %[outpos], all, mul #1\n" - "whilelt p2.d, %[outpos], %[outwidth]\n" - "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n" - "incd %[outpos], all, mul #1\n" - "whilelt p3.d, %[outpos], %[outwidth]\n" - "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n" - "incd %[outpos], all, mul #1\n" + "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n" + "whilelt p6.s, %[outpos], %[outwidth]\n" + "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n" + "incw %[outpos], all, mul #1\n" + "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n" + "whilelt p7.s, %[outpos], %[outwidth]\n" + "incw %[outpos], all, mul #1\n" + "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n" + "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n" + "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n" "addvl %[outptr], %[outptr], #8\n" "b 1b\n" "2:\n" : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5) : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth) - : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" + : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" ); break; - + case 7: __asm __volatile( "1:\n" "whilelt p0.s, %[inpos], %[inwidth]\n" "b.none 2f\n" "mov z7.s, #0\n" - "ld1w z0.s, p0/z, [%[inptr0]]\n" - "ld1w z1.s, p0/z, [%[inptr1]]\n" + "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n" + "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n" + "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n" + "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n" + "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n" + "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n" + "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n" "incw %[inpos], all, mul #1\n" - "ld1w z2.s, p0/z, 
[%[inptr2]]\n" - "addvl %[inptr0], %[inptr0], #1\n" - "ld1w z3.s, p0/z, [%[inptr3]]\n" - "addvl %[inptr1], %[inptr1], #1\n" - "zip1 z14.d, z3.d, z7.d\n" - "ld1w z4.s, p0/z, [%[inptr4]]\n" "zip1 z8.d, z0.d, z4.d\n" - "ld1w z5.s, p0/z, [%[inptr5]]\n" + "whilelt p0.s, %[outpos], %[outwidth]\n" "zip2 z9.d, z0.d, z4.d\n" - "ld1w z6.s, p0/z, [%[inptr6]]\n" + "incw %[outpos], all, mul #1\n" "zip1 z10.d, z1.d, z5.d\n" - "addvl %[inptr2], %[inptr2], #1\n" "zip2 z11.d, z1.d, z5.d\n" - "addvl %[inptr3], %[inptr3], #1\n" "zip1 z12.d, z2.d, z6.d\n" - "addvl %[inptr4], %[inptr4], #1\n" + "whilelt p1.s, %[outpos], %[outwidth]\n" "zip2 z13.d, z2.d, z6.d\n" - "addvl %[inptr5], %[inptr5], #1\n" + "incw %[outpos], all, mul #1\n" + "zip1 z14.d, z3.d, z7.d\n" "zip2 z15.d, z3.d, z7.d\n" - "addvl %[inptr6], %[inptr6], #1\n" "zip1 z0.d, z8.d, z12.d\n" - "whilelt p0.d, %[outpos], %[outwidth]\n" + "whilelt p2.s, %[outpos], %[outwidth]\n" "zip2 z1.d, z8.d, z12.d\n" - "incd %[outpos], all, mul #1\n" + "incw %[outpos], all, mul #1\n" "zip1 z2.d, z9.d, z13.d\n" - "whilelt p1.d, %[outpos], %[outwidth]\n" "zip2 z3.d, z9.d, z13.d\n" - "incd %[outpos], all, mul #1\n" "zip1 z4.d, z10.d, z14.d\n" - "whilelt p2.d, %[outpos], %[outwidth]\n" + "whilelt p3.s, %[outpos], %[outwidth]\n" "zip2 z5.d, z10.d, z14.d\n" - "incd %[outpos], all, mul #1\n" + "incw %[outpos], all, mul #1\n" "zip1 z6.d, z11.d, z15.d\n" - "whilelt p3.d, %[outpos], %[outwidth]\n" "zip2 z7.d, z11.d, z15.d\n" - "incd %[outpos], all, mul #1\n" "zip1 z8.d, z0.d, z4.d\n" - "st1d z8.d, p0, [%[outptr]]\n" + "whilelt p4.s, %[outpos], %[outwidth]\n" "zip2 z9.d, z0.d, z4.d\n" - "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n" + "incw %[outpos], all, mul #1\n" "zip1 z10.d, z1.d, z5.d\n" - "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n" + "st1w z8.s, p0, [%[outptr]]\n" "zip2 z11.d, z1.d, z5.d\n" - "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n" "zip1 z12.d, z2.d, z6.d\n" - "whilelt p0.d, %[outpos], %[outwidth]\n" + "whilelt p5.s, %[outpos], %[outwidth]\n" "zip2 z13.d, z2.d, z6.d\n" - "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n" + "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n" "zip1 z14.d, z3.d, z7.d\n" - "incd %[outpos], all, mul #1\n" + "incw %[outpos], all, mul #1\n" "zip2 z15.d, z3.d, z7.d\n" - "whilelt p1.d, %[outpos], %[outwidth]\n" - "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n" - "incd %[outpos], all, mul #1\n" - "whilelt p2.d, %[outpos], %[outwidth]\n" - "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n" - "incd %[outpos], all, mul #1\n" - "whilelt p3.d, %[outpos], %[outwidth]\n" - "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n" - "incd %[outpos], all, mul #1\n" + "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n" + "whilelt p6.s, %[outpos], %[outwidth]\n" + "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n" + "incw %[outpos], all, mul #1\n" + "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n" + "whilelt p7.s, %[outpos], %[outwidth]\n" + "incw %[outpos], all, mul #1\n" + "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n" + "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n" + "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n" "addvl %[outptr], %[outptr], #8\n" "b 1b\n" "2:\n" : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6) : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth) - : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" + : "p0", "p1", "p2", "p3", "p4", 
"p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" ); break; - + default: case 8: __asm __volatile( "1:\n" "whilelt p0.s, %[inpos], %[inwidth]\n" "b.none 2f\n" - "ld1w z0.s, p0/z, [%[inptr0]]\n" + "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n" + "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n" + "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n" + "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n" + "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n" + "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n" + "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n" + "ld1w z7.s, p0/z, [%[inptr7], %[inpos], LSL #2]\n" "incw %[inpos], all, mul #1\n" - "ld1w z1.s, p0/z, [%[inptr1]]\n" - "addvl %[inptr0], %[inptr0], #1\n" - "ld1w z2.s, p0/z, [%[inptr2]]\n" - "addvl %[inptr1], %[inptr1], #1\n" - "ld1w z3.s, p0/z, [%[inptr3]]\n" - "addvl %[inptr2], %[inptr2], #1\n" - "ld1w z4.s, p0/z, [%[inptr4]]\n" - "addvl %[inptr3], %[inptr3], #1\n" "zip1 z8.d, z0.d, z4.d\n" - "ld1w z5.s, p0/z, [%[inptr5]]\n" + "whilelt p0.s, %[outpos], %[outwidth]\n" "zip2 z9.d, z0.d, z4.d\n" - "ld1w z6.s, p0/z, [%[inptr6]]\n" + "incw %[outpos], all, mul #1\n" "zip1 z10.d, z1.d, z5.d\n" - "ld1w z7.s, p0/z, [%[inptr7]]\n" "zip2 z11.d, z1.d, z5.d\n" - "addvl %[inptr4], %[inptr4], #1\n" "zip1 z12.d, z2.d, z6.d\n" - "addvl %[inptr5], %[inptr5], #1\n" + "whilelt p1.s, %[outpos], %[outwidth]\n" "zip2 z13.d, z2.d, z6.d\n" - "addvl %[inptr6], %[inptr6], #1\n" + "incw %[outpos], all, mul #1\n" "zip1 z14.d, z3.d, z7.d\n" - "addvl %[inptr7], %[inptr7], #1\n" "zip2 z15.d, z3.d, z7.d\n" - "whilelt p0.d, %[outpos], %[outwidth]\n" "zip1 z0.d, z8.d, z12.d\n" - "incd %[outpos], all, mul #1\n" + "whilelt p2.s, %[outpos], %[outwidth]\n" "zip2 z1.d, z8.d, z12.d\n" - "whilelt p1.d, %[outpos], %[outwidth]\n" + "incw %[outpos], all, mul #1\n" "zip1 z2.d, z9.d, z13.d\n" - "incd %[outpos], all, mul #1\n" "zip2 z3.d, z9.d, z13.d\n" - "whilelt p2.d, %[outpos], %[outwidth]\n" "zip1 z4.d, z10.d, z14.d\n" - "incd %[outpos], all, mul #1\n" + "whilelt p3.s, %[outpos], %[outwidth]\n" "zip2 z5.d, z10.d, z14.d\n" - "whilelt p3.d, %[outpos], %[outwidth]\n" + "incw %[outpos], all, mul #1\n" "zip1 z6.d, z11.d, z15.d\n" - "incd %[outpos], all, mul #1\n" "zip2 z7.d, z11.d, z15.d\n" "zip1 z8.d, z0.d, z4.d\n" - "st1d z8.d, p0, [%[outptr]]\n" + "whilelt p4.s, %[outpos], %[outwidth]\n" "zip2 z9.d, z0.d, z4.d\n" - "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n" + "incw %[outpos], all, mul #1\n" "zip1 z10.d, z1.d, z5.d\n" - "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n" + "st1w z8.s, p0, [%[outptr]]\n" "zip2 z11.d, z1.d, z5.d\n" - "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n" "zip1 z12.d, z2.d, z6.d\n" - "whilelt p0.d, %[outpos], %[outwidth]\n" + "whilelt p5.s, %[outpos], %[outwidth]\n" "zip2 z13.d, z2.d, z6.d\n" - "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n" + "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n" "zip1 z14.d, z3.d, z7.d\n" - "incd %[outpos], all, mul #1\n" + "incw %[outpos], all, mul #1\n" "zip2 z15.d, z3.d, z7.d\n" - "whilelt p1.d, %[outpos], %[outwidth]\n" - "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n" - "incd %[outpos], all, mul #1\n" - "whilelt p2.d, %[outpos], %[outwidth]\n" - "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n" - "incd %[outpos], all, mul #1\n" - "whilelt p3.d, %[outpos], %[outwidth]\n" - "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n" - "incd %[outpos], all, mul #1\n" + "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n" + "whilelt p6.s, %[outpos], %[outwidth]\n" + "st1w z11.s, p3, [%[outptr], 
#3, MUL VL]\n" + "incw %[outpos], all, mul #1\n" + "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n" + "whilelt p7.s, %[outpos], %[outwidth]\n" + "incw %[outpos], all, mul #1\n" + "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n" + "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n" + "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n" "addvl %[outptr], %[outptr], #8\n" "b 1b\n" "2:\n" : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7) : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth) - : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" + : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" ); break; - - + + } } } diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_8bit.hpp index a96a43cbeb..ed0d58aa91 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_8bit.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_8bit.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 - 2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp index 7dbbe91ba2..c49db2cb16 100644 --- a/src/core/NEON/kernels/arm_gemm/utils.hpp +++ b/src/core/NEON/kernels/arm_gemm/utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -32,8 +32,6 @@ // Paranoid option for the above with assert // #define UNREACHABLE(why) assert(0 && why) -#define UNUSED(x) (void)(x) - template inline T iceildiv(const T a, const T b) { return (a + b - 1) / b; diff --git a/src/core/NEON/kernels/assembly/arm_gemm.hpp b/src/core/NEON/kernels/assembly/arm_gemm.hpp index 7723224ec8..2df7132500 100644 --- a/src/core/NEON/kernels/assembly/arm_gemm.hpp +++ b/src/core/NEON/kernels/assembly/arm_gemm.hpp @@ -23,14 +23,14 @@ */ #pragma once -#include #include +#include #include "arm_gemm_local.hpp" #include "gemm_common.hpp" -namespace arm_gemm { - +namespace arm_gemm +{ enum class GemmMethod { DEFAULT, @@ -47,12 +47,17 @@ enum class GemmMethod struct KernelDescription { - GemmMethod method = GemmMethod::DEFAULT; - std::string name = ""; - bool is_default = false; + GemmMethod method = GemmMethod::DEFAULT; + std::string name = ""; + bool is_default = false; - KernelDescription(GemmMethod m, std::string n, bool d=false) : method(m), name(n), is_default(d) { } - KernelDescription() noexcept { } + KernelDescription(GemmMethod m, std::string n, bool d = false) + : method(m), name(n), is_default(d) + { + } + KernelDescription() noexcept + { + } }; struct GemmConfig @@ -62,23 +67,32 @@ struct GemmConfig unsigned int inner_block_size = 0; unsigned int outer_block_size = 0; - GemmConfig(GemmMethod method) : method(method) { } - GemmConfig() { } + GemmConfig(GemmMethod method) + : method(method) + { + } + GemmConfig() + { + } }; struct Activation { - enum class Type { + enum class Type + { None, ReLU, BoundedReLU }; - Type type; - float param1; - float param2; + Type type; + float param1; + float param2; - Activation(Type type=Type::None, float p1=0.0f, float p2=0.0f) : type(type), param1(p1), param2(p2) { } + Activation(Type type = Type::None, float p1 = 0.0f, float p2 = 0.0f) + : type(type), param1(p1), param2(p2) + { + } }; struct GemmArgs @@ -101,10 +115,8 @@ public: const unsigned int K, const unsigned int nbatches, const unsigned int nmulti, const bool trA, const bool trB, Activation act, const int maxthreads, - const bool pretransposed_hint, const GemmConfig *cfg=nullptr ) : - _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmulti(nmulti), - _trA(trA), _trB(trB), _act(act), _maxthreads(maxthreads), - _pretransposed_hint(pretransposed_hint), _cfg(cfg) + const bool pretransposed_hint, const GemmConfig *cfg = nullptr) + : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmulti(nmulti), _trA(trA), _trB(trB), _act(act), _maxthreads(maxthreads), _pretransposed_hint(pretransposed_hint), _cfg(cfg) { } }; @@ -112,18 +124,18 @@ public: struct Requantize32 { public: - const int32_t *bias = nullptr; - size_t bias_multi_stride = 0; - int32_t a_offset = 0; - int32_t b_offset = 0; - int32_t c_offset = 0; - bool per_channel_requant = false; - int32_t per_layer_shift = 0; - int32_t per_layer_mul = 0; - const int32_t *per_channel_shifts = nullptr; - const int32_t *per_channel_muls = nullptr; - int32_t minval = 0; - int32_t maxval = 0; + const int32_t *bias = nullptr; + size_t bias_multi_stride = 0; + int32_t a_offset = 0; + int32_t b_offset = 0; + int32_t c_offset = 0; + bool per_channel_requant = false; + int32_t per_layer_shift = 0; + int32_t per_layer_mul = 0; + const int32_t *per_channel_shifts = nullptr; + const int32_t *per_channel_muls = nullptr; + int32_t minval = 0; + int32_t maxval = 0; Requantize32() = default; @@ -131,11 +143,9 @@ public: Requantize32(const int32_t *bias, size_t 
bias_multi_stride, int32_t a_offset, int32_t b_offset, int32_t c_offset, int32_t requant_shift, int32_t requant_mul, - int32_t minv, int32_t maxv) : - bias(bias), bias_multi_stride(bias_multi_stride), - a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), - per_channel_requant(false), per_layer_shift(requant_shift), per_layer_mul(requant_mul), - minval(minv), maxval(maxv) + int32_t minv, int32_t maxv) + : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(false), per_layer_shift(requant_shift), per_layer_mul(requant_mul), + minval(minv), maxval(maxv) { } @@ -143,11 +153,9 @@ public: Requantize32(const int32_t *bias, size_t bias_multi_stride, int32_t a_offset, int32_t b_offset, int32_t c_offset, const int32_t *requant_shifts, const int32_t *requant_muls, - int32_t minv, int32_t maxv) : - bias(bias), bias_multi_stride(bias_multi_stride), - a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), - per_channel_requant(true), per_channel_shifts(requant_shifts), per_channel_muls(requant_muls), - minval(minv), maxval(maxv) + int32_t minv, int32_t maxv) + : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(true), per_channel_shifts(requant_shifts), + per_channel_muls(requant_muls), minval(minv), maxval(maxv) { } }; @@ -156,21 +164,21 @@ struct Nothing { }; -template -using UniqueGemmCommon = std::unique_ptr >; +template +using UniqueGemmCommon = std::unique_ptr>; /* Low level API calls. * These are implemented as 'GemmArgs' versions, or with the arguments explicitly listed. */ /* get_gemm_method(): Given the templated types and provided parameters, * which is the preferred method to implement this GEMM? */ -template -KernelDescription get_gemm_method(const GemmArgs &args, const OutputStage & ={}); +template +KernelDescription get_gemm_method(const GemmArgs &args, const OutputStage & = {}); -template -UniqueGemmCommon gemm(const GemmArgs &args, const OutputStage & ={}); +template +UniqueGemmCommon gemm(const GemmArgs &args, const OutputStage & = {}); -template -std::vector get_compatible_kernels(const GemmArgs &args, const OutputStage & ={}); +template +std::vector get_compatible_kernels(const GemmArgs &args, const OutputStage & = {}); } // namespace arm_gemm diff --git a/src/core/NEON/kernels/assembly/gemm_common.hpp b/src/core/NEON/kernels/assembly/gemm_common.hpp index a44b774b9d..3b4c025371 100644 --- a/src/core/NEON/kernels/assembly/gemm_common.hpp +++ b/src/core/NEON/kernels/assembly/gemm_common.hpp @@ -23,15 +23,12 @@ */ #pragma once -#include "arm_gemm_compute_iface.hpp" +#include "ndrange.hpp" #include -#include - -#define UNUSED(x) (void)(x) - -namespace arm_gemm { +namespace arm_gemm +{ // Abstract class for the GEMM/GEMV functions. // // GEMM implementations may be "native" (never require any input @@ -41,7 +38,8 @@ namespace arm_gemm { // The real GemmCommon class is templated based on the operand and return // type. This is an interface class which is independent of those types. -class IGemmCommon { +class IGemmCommon +{ public: /* Pass in the pointers to the arrays to be operated on and their * strides. This "generic" version uses void *s, the preferred version @@ -50,9 +48,9 @@ public: * the settings for B here are ignored. 
*/ virtual void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride, - const void *B, const int ldb, /* batches share B */ const int B_multi_stride, - void *C, const int ldc, const int C_batch_stride, const int C_multi_stride, - const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) = 0; + const void *B, const int ldb, /* batches share B */ const int B_multi_stride, + void *C, const int ldc, const int C_batch_stride, const int C_multi_stride, + const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) = 0; /** @returns an ndrange containing ranges of the compute space which can be * broken up and parallelised over @@ -71,47 +69,64 @@ public: * This has an empty default implementation, as GEMMs which don't care * about thread count can safely ignore this. */ - virtual void set_nthreads(int) { }; + virtual void set_nthreads(int) {}; /* Whether this GEMM can be dynamically scheduled or not. */ - virtual bool supports_dynamic_scheduling() const { return false; } + virtual bool supports_dynamic_scheduling() const + { + return false; + } /** Main execute member fucntion * @param [in] work_range specifies the range of work we want to be computed, total range defined by get_window_size() * @param [in] thread_locator where are we inside of the thread space * @naram [in] threadid a unique threadid */ - virtual void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) = 0; + virtual void execute(const ndcoord_t &work_range, const ndcoord_t &thread_locator, int threadid) = 0; /*** Working space interface (optional) ***/ /* Total number of bytes of temporary working space needed. If zero, it's not necessary to call set_working_space(). */ - virtual size_t get_working_size() const { return 0; } + virtual size_t get_working_size() const + { + return 0; + } /* Provide working space buffer - the void * passed in must remain allocated for the duration of any execute calls. */ - virtual void set_working_space(void *) { }; + virtual void set_working_space(void *) {}; /*** "Pretransposed" interface (optional) ***/ /* Is this object set up for pretranspose? If so, pretranspose_array() needs to be called before execute(); */ - virtual bool B_is_pretransposed() const { return false; } + virtual bool B_is_pretransposed() const + { + return false; + } /* Does pretranspose still need to be done? */ - virtual bool B_pretranspose_required() const { return false; } + virtual bool B_pretranspose_required() const + { + return false; + } /* Total number of bytes of space needed for pretransposed arrays. */ - virtual size_t get_B_pretransposed_array_size() const { return 0; } + virtual size_t get_B_pretransposed_array_size() const + { + return 0; + } /* Perform pretranspose - arguments are output, input, input row stride and input multi stride. */ /* The "real" version of this depends on the templated operand type (see below). */ virtual void pretranspose_B_array_generic(void *, const void *, const int, const int) = 0; /* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. 
*/ - virtual void set_pretransposed_B_data(void *) { } + virtual void set_pretransposed_B_data(void *) + { + } /*** "Quantized bias" interface (optional) ***/ /* Set the bias vector for quantized GEMMs */ - virtual void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride) + virtual void set_quantized_bias(const int32_t *, size_t) { - UNUSED(bias); - UNUSED(bias_multi_stride); } // Destructor - virtual ~IGemmCommon() { } + virtual ~IGemmCommon() + { + } }; /* "Real" GemmCommon class which is templated on the operand and return types. @@ -121,50 +136,53 @@ public: * 'set_arrays' to capture the provided arguments in protected class * members, as essentially any implementation will need these. */ -template -class GemmCommon : public IGemmCommon { +template +class GemmCommon : public IGemmCommon +{ protected: - const To *_Aptr=nullptr; - int _lda=0; - int _A_batch_stride=0; - int _A_multi_stride=0; - const To *_Bptr=nullptr; - int _ldb=0; - int _B_multi_stride=0; - Tr *_Cptr=nullptr; - int _ldc=0; - int _C_batch_stride=0; - int _C_multi_stride=0; - const Tr *_bias=nullptr; - int _bias_multi_stride=0; + const To *_Aptr = nullptr; + int _lda = 0; + int _A_batch_stride = 0; + int _A_multi_stride = 0; + const To *_Bptr = nullptr; + int _ldb = 0; + int _B_multi_stride = 0; + Tr *_Cptr = nullptr; + int _ldc = 0; + int _C_batch_stride = 0; + int _C_multi_stride = 0; + const Tr *_bias = nullptr; + int _bias_multi_stride = 0; public: /* Pass in the pointers to the arrays to be operated on and their * strides (templated version with appropriate types). */ virtual void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride, - const To *B, const int ldb, /* batches share B */ const int B_multi_stride, - Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride, - const Tr *bias, /* no row or batch stride needed */ const int bias_multi_stride) { - _Aptr = A; - _lda = lda; - _A_batch_stride = A_batch_stride; - _A_multi_stride = A_multi_stride; - _Bptr = B; - _ldb = ldb; - _B_multi_stride = B_multi_stride; - _Cptr = C; - _ldc = ldc; - _C_batch_stride = C_batch_stride; - _C_multi_stride = C_multi_stride; - _bias = bias; + const To *B, const int ldb, /* batches share B */ const int B_multi_stride, + Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride, + const Tr *bias, /* no row or batch stride needed */ const int bias_multi_stride) + { + _Aptr = A; + _lda = lda; + _A_batch_stride = A_batch_stride; + _A_multi_stride = A_multi_stride; + _Bptr = B; + _ldb = ldb; + _B_multi_stride = B_multi_stride; + _Cptr = C; + _ldc = ldc; + _C_batch_stride = C_batch_stride; + _C_multi_stride = C_multi_stride; + _bias = bias; _bias_multi_stride = bias_multi_stride; } /* Implementation of the void * overload which casts its arguments to the appropriate type. 
*/ void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride, - const void *B, const int ldb, /* batches share B */ const int B_multi_stride, - void *C, const int ldc, const int C_batch_stride, const int C_multi_stride, - const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) override { + const void *B, const int ldb, /* batches share B */ const int B_multi_stride, + void *C, const int ldc, const int C_batch_stride, const int C_multi_stride, + const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) override + { set_arrays(static_cast(A), lda, A_batch_stride, A_multi_stride, static_cast(B), ldb, B_multi_stride, static_cast(C), ldc, C_batch_stride, C_multi_stride, @@ -175,27 +193,13 @@ public: /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */ /* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */ - virtual void pretranspose_B_array(void *, const To *, const int, const int) { }; + virtual void pretranspose_B_array(void *, const To *, const int, const int) {}; /* Implementation of the void * overload which casts its arguments to the appropriate type. */ - void pretranspose_B_array_generic(void *out, const void *in, const int row_stride, const int multi_stride) override { + void pretranspose_B_array_generic(void *out, const void *in, const int row_stride, const int multi_stride) override + { pretranspose_B_array(out, static_cast(in), row_stride, multi_stride); } }; -template -inline -int unsigned get_total_window_size(const GemmKernel& kernel) -{ - auto window=kernel.get_window_size(); - - unsigned int total = 1; - for(unsigned i = 0; i != arm_gemm::ndrange_max; ++i) - { - total *= window.get_size(i); - } - - return total; -} - } // namespace arm_gemm diff --git a/src/core/NEON/kernels/assembly/ndrange.hpp b/src/core/NEON/kernels/assembly/ndrange.hpp index d082a3e9b8..86638298ab 100644 --- a/src/core/NEON/kernels/assembly/ndrange.hpp +++ b/src/core/NEON/kernels/assembly/ndrange.hpp @@ -23,104 +23,123 @@ */ #pragma once -#include #include -#include - +#include #include +#include -namespace arm_gemm { - -template -class NDRange { +namespace arm_gemm +{ +template +class NDRange +{ private: - std::array m_sizes {}; - std::array m_totalsizes {}; + std::array m_sizes{}; + std::array m_totalsizes{}; - class NDRangeIterator { + class NDRangeIterator + { private: const NDRange &m_parent; - unsigned int m_pos = 0; - unsigned int m_end = 0; + unsigned int m_pos = 0; + unsigned int m_end = 0; public: - NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) : m_parent(p), m_pos(s), m_end(e) { } + NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) + : m_parent(p), m_pos(s), m_end(e) + { + } - bool done() const { + bool done() const + { return (m_pos >= m_end); } - unsigned int dim(unsigned int d) const { + unsigned int dim(unsigned int d) const + { unsigned int r = m_pos; - if (d < (D - 1)) { + if(d < (D - 1)) + { r %= m_parent.m_totalsizes[d]; } - if (d > 0) { - r /= m_parent.m_totalsizes[d-1]; + if(d > 0) + { + r /= m_parent.m_totalsizes[d - 1]; } return r; } - bool next_dim0() { + bool next_dim0() + { m_pos++; return !done(); } - bool next_dim1() { + bool next_dim1() + { m_pos += m_parent.m_sizes[0] - dim(0); return !done(); } - unsigned int dim0_max() const { + unsigned int dim0_max() const + { unsigned int offset = std::min(m_end - m_pos, m_parent.m_sizes[0] - dim(0)); 
return dim(0) + offset; } }; -public: - NDRange& operator=(const NDRange& rhs)=default; - NDRange(const NDRange& rhs) =default; - - template - NDRange(T... ts) - : m_sizes{ts...} + void set_totalsizes() { - unsigned int t=1; + unsigned int t = 1; + + for(unsigned int i = 0; i < D; i++) + { + if(m_sizes[i] == 0) + { + m_sizes[i] = 1; + } - for (unsigned int i=0; i& n) - : m_sizes(n) - { - unsigned int t=1; +public: + NDRange &operator=(const NDRange &rhs) = default; + NDRange(const NDRange &rhs) = default; - for (unsigned int i=0; i + NDRange(T... ts) + : m_sizes{ ts... } + { + set_totalsizes(); + } - m_totalsizes[i] = t; - } + NDRange(const std::array &n) + : m_sizes(n) + { + set_totalsizes(); } - NDRangeIterator iterator(unsigned int start, unsigned int end) const { + NDRangeIterator iterator(unsigned int start, unsigned int end) const + { return NDRangeIterator(*this, start, end); } - unsigned int total_size() const { + unsigned int total_size() const + { return m_totalsizes[D - 1]; } - unsigned int get_size(unsigned int v) const { + unsigned int get_size(unsigned int v) const + { return m_sizes[v]; } }; @@ -128,58 +147,53 @@ public: /** NDCoordinate builds upon a range, but specifies a starting position * in addition to a size which it inherits from NDRange */ -template -class NDCoordinate : public NDRange { - using int_t =unsigned int; +template +class NDCoordinate : public NDRange +{ + using int_t = unsigned int; using ndrange_t = NDRange; - std::array m_positions {}; + std::array m_positions{}; + public: - NDCoordinate& operator=(const NDCoordinate& rhs)=default; - NDCoordinate(const NDCoordinate& rhs) =default; - NDCoordinate(const std::initializer_list>& list) + NDCoordinate &operator=(const NDCoordinate &rhs) = default; + NDCoordinate(const NDCoordinate &rhs) = default; + NDCoordinate(const std::initializer_list> &list) { std::array sizes{}; std::size_t i = 0; - for(auto& p : list) { - m_positions[i]= p.first; - sizes[i++] = p.second; + for(auto &p : list) + { + m_positions[i] = p.first; + sizes[i++] = p.second; } //update the parents sizes - static_cast(*this) = ndrange_t(sizes); + static_cast(*this) = ndrange_t(sizes); } - int_t get_position(int_t d) const { - assert(d < m_positions.size()); + int_t get_position(int_t d) const + { + assert(d < N); + return m_positions[d]; } - void set_position(int_t d, int_t v) { - assert(d < size(m_positions)); - assert(v < ndrange_t::get_size(d)); + void set_position(int_t d, int_t v) + { + assert(d < N); m_positions[d] = v; } - int_t get_position_end(int_t d) const { - return get_position(d) + NDRange::get_size(d); + int_t get_position_end(int_t d) const + { + return get_position(d) + ndrange_t::get_size(d); } }; //class NDCoordinate -/** @returns the number of dimensions in the NDRange which have none-1 values - * IE there is actual work in these dimensions that can be broken up - */ -template -std::size_t ndrange_popcount(const NDRange& ndr) { - std::size_t count = 0; - - for(unsigned int d = 0; d != N; ++d) { - if(ndr.get_size(d) != 1) - ++count; - } - return count; -} +using ndrange_t = NDRange<6>; +using ndcoord_t = NDCoordinate<6>; } // namespace arm_gemm diff --git a/src/runtime/CPUUtils.cpp b/src/runtime/CPUUtils.cpp index d8f01a9066..e874f0f14b 100644 --- a/src/runtime/CPUUtils.cpp +++ b/src/runtime/CPUUtils.cpp @@ -73,6 +73,7 @@ bool model_supports_dot(CPUModel model) { case CPUModel::GENERIC_FP16_DOT: case CPUModel::A55r1: + case CPUModel::X1: return true; default: return false; @@ -86,6 +87,7 @@ bool model_supports_fp16(CPUModel 
model) case CPUModel::GENERIC_FP16: case CPUModel::GENERIC_FP16_DOT: case CPUModel::A55r1: + case CPUModel::X1: return true; default: return false; @@ -121,6 +123,9 @@ CPUModel midr_to_model(const unsigned int midr) model = CPUModel::A55r0; } break; + case 0xd44: // X1 + model = CPUModel::X1; + break; case 0xd0a: // A75 if(variant != 0) { diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp index 1fcac58e10..8a2506f39a 100644 --- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp +++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp @@ -284,7 +284,7 @@ void Fallback::configure(const ITensor *a, c //if we disable this code below in brackets then ConvLayer deadlocks when threads > 1 and //the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001 { - const unsigned int window_size = get_total_window_size(*_gemm_kernel_asm); + const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size(); if(window_size < static_cast(args._maxthreads)) { _gemm_kernel_asm->set_nthreads(window_size); @@ -408,7 +408,7 @@ void Fallback::run() if(_workspace.buffer() != nullptr) { _gemm_kernel_asm->set_working_space(reinterpret_cast(_workspace.buffer())); - const unsigned int window_size = get_total_window_size(*_gemm_kernel_asm); + const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size(); unsigned int num_threads = NEScheduler::get().num_threads(); if(window_size < num_threads) { -- cgit v1.2.1
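
Illustrative usage sketch (not part of the patch): after this change the free helper get_total_window_size() is removed from gemm_common.hpp and callers query the kernel's NDRange directly, as the NEGEMMAssemblyDispatch.cpp hunks show. The fragment below assumes the IGemmCommon, ndrange_t and ndcoord_t definitions introduced in the diff; the include paths, the clamp_threads name and the single-threaded execute call are illustrative assumptions rather than code taken from the patch.

    #include "gemm_common.hpp"   // IGemmCommon (include path assumed for this sketch)
    #include "ndrange.hpp"       // ndrange_t = NDRange<6>, ndcoord_t = NDCoordinate<6>

    void clamp_threads(arm_gemm::IGemmCommon &gemm, int max_threads)
    {
        // get_window_size() returns an ndrange_t; total_size() multiplies all six
        // dimension sizes, replacing the deleted get_total_window_size() helper.
        const unsigned int window_size = gemm.get_window_size().total_size();

        // Same clamping as in the NEGEMMAssemblyDispatch.cpp hunks: never request
        // more threads than there are work items.
        if(window_size < static_cast<unsigned int>(max_threads))
        {
            gemm.set_nthreads(window_size);
        }

        // One possible single-threaded invocation: an NDCoordinate pairs a start
        // position with a size for each of the six dimensions.
        arm_gemm::ndcoord_t work_range{ { 0, window_size }, { 0, 1 }, { 0, 1 }, { 0, 1 }, { 0, 1 }, { 0, 1 } };
        arm_gemm::ndcoord_t thread_locator{ { 0, 1 }, { 0, 1 }, { 0, 1 }, { 0, 1 }, { 0, 1 }, { 0, 1 } };
        gemm.execute(work_range, thread_locator, 0);
    }

The same total_size() call is what the dispatcher uses in both configure() and run() above to decide whether the window is smaller than the requested thread count.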