From eb82fd2aa786715c3b6a941dc6d6deac4ce8e2a0 Mon Sep 17 00:00:00 2001 From: Pablo Tello Date: Fri, 23 Feb 2018 13:43:50 +0000 Subject: COMPMID-881: RSH new arm_gemm interface. Change-Id: I1e2a1a77097d8017c274af3f97eba6964f80f5fa Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/122592 Tested-by: Jenkins Reviewed-by: Anthony Barbier --- SConscript | 7 +- SConstruct | 6 +- arm_compute/core/NEON/NEKernels.h | 8 - .../core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h | 48 -- .../core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h | 48 -- .../NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h | 48 -- .../kernels/arm64/NEGEMMLowpAArch64A53Kernel.h | 60 -- .../NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h | 61 -- .../kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h | 62 -- .../core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h | 48 -- .../NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h | 48 -- .../NEON/kernels/assembly/NEGEMMAssemblyWrapper.h | 91 ++ .../core/NEON/kernels/assembly/arm_gemm.hpp | 39 + .../core/NEON/kernels/assembly/arm_gemm_local.hpp | 29 + arm_compute/core/NEON/kernels/assembly/asmlib.hpp | 121 --- .../core/NEON/kernels/assembly/gemm_common.hpp | 79 +- .../NEON/kernels/assembly/gemm_interleaved.hpp | 177 ---- .../core/NEON/kernels/assembly/gemv_transposed.hpp | 101 --- .../kernels/assembly/kernels/a32_sgemm_8x6.hpp | 82 -- .../kernels/assembly/kernels/a32_sgemm_8x6/a53.hpp | 410 --------- .../assembly/kernels/a32_sgemm_8x6/a55r1.hpp | 413 ---------- .../assembly/kernels/a32_sgemm_8x6/generic.hpp | 350 -------- .../kernels/assembly/kernels/a64_gemm_s16_12x8.hpp | 68 -- .../assembly/kernels/a64_gemm_s16_12x8/generic.hpp | 313 ------- .../kernels/assembly/kernels/a64_gemm_s8_12x8.hpp | 61 -- .../assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp | 398 --------- .../a64_gemm_s8_12x8/dot_toolchain_support.h | 66 -- .../assembly/kernels/a64_gemm_s8_12x8/generic.hpp | 363 -------- .../kernels/assembly/kernels/a64_gemm_s8_4x4.hpp | 61 -- .../assembly/kernels/a64_gemm_s8_4x4/generic.hpp | 465 ----------- .../kernels/assembly/kernels/a64_gemm_u16_12x8.hpp | 68 -- .../assembly/kernels/a64_gemm_u16_12x8/generic.hpp | 314 ------- .../kernels/assembly/kernels/a64_gemm_u8_12x8.hpp | 65 -- .../assembly/kernels/a64_gemm_u8_12x8/a55r1.hpp | 396 --------- .../a64_gemm_u8_12x8/dot_toolchain_support.h | 66 -- .../assembly/kernels/a64_gemm_u8_12x8/generic.hpp | 354 -------- .../kernels/assembly/kernels/a64_gemm_u8_4x4.hpp | 61 -- .../assembly/kernels/a64_gemm_u8_4x4/generic.hpp | 281 ------- .../kernels/assembly/kernels/a64_hgemm_24x8.hpp | 67 -- .../assembly/kernels/a64_hgemm_24x8/a55r1.hpp | 384 --------- .../assembly/kernels/a64_hgemm_24x8/generic.hpp | 337 -------- .../kernels/assembly/kernels/a64_sgemm_12x8.hpp | 81 -- .../assembly/kernels/a64_sgemm_12x8/a53.hpp | 368 --------- .../assembly/kernels/a64_sgemm_12x8/a55.hpp | 368 --------- .../assembly/kernels/a64_sgemm_12x8/a55r1.hpp | 360 -------- .../assembly/kernels/a64_sgemm_12x8/generic.hpp | 358 -------- .../kernels/assembly/kernels/a64_sgemv_trans.hpp | 50 -- .../core/NEON/kernels/assembly/kernels/generic.hpp | 913 --------------------- .../core/NEON/kernels/assembly/mergeresults.hpp | 59 -- .../assembly/merges/a32_merge_float_8x6.hpp | 170 ---- .../assembly/merges/a64_merge_float_12x8.hpp | 236 ------ .../core/NEON/kernels/assembly/merges/list.hpp | 28 - .../core/NEON/kernels/assembly/newgemm_lib.hpp | 410 +++++++++ .../core/NEON/kernels/assembly/profiler.hpp | 103 --- .../core/NEON/kernels/assembly/transform.hpp | 110 --- .../transforms/a32_interleave_6way_32bit.hpp | 152 
---- .../a32_transpose_interleave_8way_32bit.hpp | 127 --- .../transforms/a64_block16_interleave4_8bit.hpp | 120 --- .../transforms/a64_interleave_8way_16bit.hpp | 162 ---- .../transforms/a64_interleave_8way_32bit.hpp | 173 ---- .../a64_interleave_8way_half_to_float.hpp | 189 ----- .../a64_transpose_interleave_12way_16bit.hpp | 145 ---- ...64_transpose_interleave_12way_half_to_float.hpp | 120 --- .../a64_transpose_interleave_24way_16bit.hpp | 130 --- .../core/NEON/kernels/assembly/transforms/list.hpp | 33 - .../transforms/transpose_interleave_common.hpp | 139 ---- arm_compute/runtime/NEON/AssemblyHelper.h | 173 ++++ arm_compute/runtime/NEON/functions/NEGEMM.h | 29 +- .../NEON/functions/NEGEMMConvolutionLayer.h | 11 +- .../NEGEMMLowpAssemblyMatrixMultiplyCore.h | 5 +- .../NEON/functions/NEGEMMLowpMatrixMultiplyCore.h | 6 +- docs/00_introduction.dox | 12 +- examples/graph_inception_v4.cpp | 12 +- scripts/check_bad_style.sh | 16 +- scripts/clang_tidy_rules.py | 5 + src/core/GLES_COMPUTE/GCKernelLibrary.cpp | 4 +- .../NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp | 127 --- .../NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp | 127 --- .../kernels/arm64/NEGEMMAArch64NativeKernel.cpp | 121 --- .../kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp | 201 ----- .../NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp | 199 ----- .../kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp | 198 ----- .../NEON/kernels/arm64/NEGEMVAArch64Kernel.cpp | 130 --- .../kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp | 132 --- src/core/NEON/kernels/arm_gemm/asmlib.hpp | 128 +++ src/core/NEON/kernels/arm_gemm/buffer_manager.hpp | 379 +++++++++ src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp | 65 ++ src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp | 85 ++ src/core/NEON/kernels/arm_gemm/gemm_int16.cpp | 48 ++ src/core/NEON/kernels/arm_gemm/gemm_int8.cpp | 63 ++ .../NEON/kernels/arm_gemm/gemm_interleaved.hpp | 535 ++++++++++++ src/core/NEON/kernels/arm_gemm/gemm_native.hpp | 102 +++ src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp | 48 ++ src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp | 64 ++ .../kernels/arm_gemm/gemv_native_transposed.hpp | 107 +++ .../NEON/kernels/arm_gemm/gemv_pretransposed.hpp | 146 ++++ .../kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp | 88 ++ .../kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp | 400 +++++++++ .../arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp | 398 +++++++++ .../arm_gemm/kernels/a32_sgemm_8x6/generic.cpp | 346 ++++++++ .../kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp | 73 ++ .../arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp | 309 +++++++ .../kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp | 72 ++ .../arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp | 356 ++++++++ .../a64_gemm_s8_12x8/dot_toolchain_support.h | 66 ++ .../arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp | 343 ++++++++ .../kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp | 67 ++ .../arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp | 456 ++++++++++ .../kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp | 73 ++ .../arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp | 309 +++++++ .../kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp | 72 ++ .../arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp | 356 ++++++++ .../a64_gemm_u8_12x8/dot_toolchain_support.h | 66 ++ .../arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp | 343 ++++++++ .../kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp | 66 ++ .../arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp | 273 ++++++ .../kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp | 74 ++ .../arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp | 360 ++++++++ 
.../arm_gemm/kernels/a64_hgemm_24x8/generic.cpp | 337 ++++++++ .../kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp | 95 +++ .../arm_gemm/kernels/a64_sgemm_12x8/a53.cpp | 363 ++++++++ .../arm_gemm/kernels/a64_sgemm_12x8/a55.cpp | 356 ++++++++ .../arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp | 342 ++++++++ .../arm_gemm/kernels/a64_sgemm_12x8/generic.cpp | 350 ++++++++ .../arm_gemm/kernels/a64_sgemm_native_16x4.hpp | 64 ++ .../kernels/a64_sgemm_native_16x4/generic.cpp | 734 +++++++++++++++++ .../arm_gemm/kernels/a64_sgemv_pretransposed.hpp | 67 ++ .../kernels/a64_sgemv_pretransposed/generic.cpp | 794 ++++++++++++++++++ .../kernels/arm_gemm/kernels/a64_sgemv_trans.hpp | 55 ++ .../arm_gemm/kernels/a64_sgemv_trans/generic.cpp | 913 +++++++++++++++++++++ src/core/NEON/kernels/arm_gemm/mergeresults.hpp | 75 ++ .../arm_gemm/merges/a32_merge_float_8x6.hpp | 167 ++++ .../arm_gemm/merges/a64_merge_float_12x8.hpp | 233 ++++++ .../merges/a64_merge_float_to_half_12x8.hpp | 273 ++++++ .../arm_gemm/merges/a64_merge_half_24x8.hpp | 233 ++++++ .../arm_gemm/merges/a64_merge_int32_12x8.hpp | 289 +++++++ src/core/NEON/kernels/arm_gemm/merges/list.hpp | 29 + src/core/NEON/kernels/arm_gemm/misc.cpp | 147 ++++ src/core/NEON/kernels/arm_gemm/profiler.hpp | 133 +++ src/core/NEON/kernels/arm_gemm/transform.hpp | 122 +++ .../transforms/a32_interleave_6way_32bit.hpp | 152 ++++ .../a32_transpose_interleave_8way_32bit.hpp | 116 +++ .../transforms/a64_block16_interleave4_8bit.hpp | 131 +++ .../transforms/a64_interleave_8way_16bit.hpp | 170 ++++ .../transforms/a64_interleave_8way_32bit.hpp | 175 ++++ .../a64_interleave_8way_half_to_float.hpp | 192 +++++ .../a64_transpose_interleave_12way_16bit.hpp | 135 +++ ...64_transpose_interleave_12way_half_to_float.hpp | 113 +++ .../a64_transpose_interleave_24way_16bit.hpp | 121 +++ src/core/NEON/kernels/arm_gemm/transforms/list.hpp | 33 + .../transforms/transpose_interleave_common.hpp | 160 ++++ src/core/NEON/kernels/arm_gemm/utils.hpp | 51 ++ src/runtime/NEON/functions/NEGEMM.cpp | 118 +-- .../NEON/functions/NEGEMMConvolutionLayer.cpp | 79 +- .../NEGEMMLowpAssemblyMatrixMultiplyCore.cpp | 117 +-- .../functions/NEGEMMLowpMatrixMultiplyCore.cpp | 120 ++- tests/validation/NEON/GEMMLowp.cpp | 3 +- 157 files changed, 15439 insertions(+), 12590 deletions(-) delete mode 100644 arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h delete mode 100644 arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h delete mode 100644 arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h delete mode 100644 arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h delete mode 100644 arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h delete mode 100644 arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h delete mode 100644 arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h delete mode 100644 arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h create mode 100644 arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapper.h create mode 100644 arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp create mode 100644 arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/asmlib.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp delete mode 100644 
arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a53.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a55r1.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/generic.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8/generic.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/generic.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4/generic.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8/generic.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/a55r1.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/generic.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4/generic.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/a55r1.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/generic.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55r1.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/kernels/generic.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/mergeresults.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/merges/a32_merge_float_8x6.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/merges/list.hpp create mode 100644 arm_compute/core/NEON/kernels/assembly/newgemm_lib.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/profiler.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/transform.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/transforms/a32_interleave_6way_32bit.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/transforms/a32_transpose_interleave_8way_32bit.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/transforms/a64_block16_interleave4_8bit.hpp delete mode 100644 
arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_16bit.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_half_to_float.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_16bit.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_half_to_float.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_24way_16bit.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/transforms/list.hpp delete mode 100644 arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp create mode 100644 arm_compute/runtime/NEON/AssemblyHelper.h delete mode 100644 src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp delete mode 100644 src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp delete mode 100644 src/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.cpp delete mode 100644 src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp delete mode 100644 src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp delete mode 100644 src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp delete mode 100644 src/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.cpp delete mode 100644 src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/asmlib.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/buffer_manager.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/gemm_int16.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/gemm_int8.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/gemm_native.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp create mode 100644 
src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/mergeresults.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_12x8.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_to_half_12x8.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/merges/a64_merge_half_24x8.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/merges/list.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/misc.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/profiler.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/transform.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/list.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/utils.hpp diff --git a/SConscript b/SConscript index dab807a47f..cf2dd878e5 100644 --- a/SConscript +++ b/SConscript @@ -194,6 +194,8 @@ 
if env['neon']: core_files += Glob('src/core/NEON/*.cpp') core_files += Glob('src/core/NEON/kernels/*.cpp') + core_files += Glob('src/core/NEON/kernels/arm_gemm/*.cpp') + # build winograd sources for either v7a / v8a core_files += Glob('src/core/NEON/kernels/convolution/*/*.cpp') core_files += Glob('src/core/NEON/kernels/convolution/winograd/*/*.cpp') @@ -202,10 +204,11 @@ if env['neon']: graph2_files += Glob('src/graph2/backends/NEON/*.cpp') if env['arch'] == "armv7a": - core_files += Glob('src/core/NEON/kernels/arm32/*.cpp') + core_files += Glob('src/core/NEON/kernels/arm_gemm/kernels/a32_*/*.cpp') + if "arm64-v8" in env['arch']: - core_files += Glob('src/core/NEON/kernels/arm64/*.cpp') + core_files += Glob('src/core/NEON/kernels/arm_gemm/kernels/a64_*/*.cpp') runtime_files += Glob('src/runtime/NEON/*.cpp') runtime_files += Glob('src/runtime/NEON/functions/*.cpp') diff --git a/SConstruct b/SConstruct index 7667132bd9..4bf90c107d 100644 --- a/SConstruct +++ b/SConstruct @@ -135,19 +135,19 @@ if env['arch'] == 'armv7a': env.Append(CXXFLAGS = ['-mfloat-abi=softfp']) elif env['arch'] == 'arm64-v8a': env.Append(CXXFLAGS = ['-march=armv8-a']) - env.Append(CPPDEFINES = ['ARM_COMPUTE_AARCH64_V8A']) + env.Append(CPPDEFINES = ['ARM_COMPUTE_AARCH64_V8A','NO_DOT_IN_TOOLCHAIN']) if env['os'] == 'linux': prefix = "aarch64-linux-gnu-" elif env['os'] == 'bare_metal': prefix = "aarch64-elf-" elif env['os'] == 'android': prefix = "aarch64-linux-android-" + env.Append(CXXFLAGS = ['-no-integrated-as']) elif env['arch'] == 'arm64-v8.2-a': env.Append(CXXFLAGS = ['-march=armv8.2-a+fp16']) # explicitly enable fp16 extension otherwise __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is undefined - env.Append(CPPDEFINES = ['ARM_COMPUTE_AARCH64_V8_2']) + env.Append(CPPDEFINES = ['ARM_COMPUTE_AARCH64_V8_2','NO_DOT_IN_TOOLCHAIN']) if 'clang++' in cpp_compiler: env.Append(CXXFLAGS = ['-fno-integrated-as']) - if env['os'] == 'linux': prefix = "aarch64-linux-gnu-" elif env['os'] == 'bare_metal': diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h index 5c15e5ecc4..7ec74eaccd 100644 --- a/arm_compute/core/NEON/NEKernels.h +++ b/arm_compute/core/NEON/NEKernels.h @@ -113,13 +113,5 @@ #include "arm_compute/core/NEON/kernels/NEWarpKernel.h" #include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h" #include "arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h" -#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h" -#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h" -#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h" -#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h" -#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h" -#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h" -#include "arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h" -#include "arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h" #endif /* __ARM_COMPUTE_NEKERNELS_H__ */ diff --git a/arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h b/arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h deleted file mode 100644 index 4868f83d74..0000000000 --- a/arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_NEGEMMAARCH32KERNEL_H__ -#define __ARM_COMPUTE_NEGEMMAARCH32KERNEL_H__ - -#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" - -namespace arm_compute -{ -class ITensor; - -/** AArch32/armv7a NEON kernel to multiply two input matrices "A" and "B". */ -class NEGEMMAArch32Kernel : public NEGEMMAssemblyBaseKernel -{ -public: - const char *name() const override - { - return "NEGEMMAArch32Kernel"; - } - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -protected: - void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_NEGEMMAARCH32KERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h deleted file mode 100644 index 5252378db7..0000000000 --- a/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef __ARM_COMPUTE_NEGEMMAARCH64KERNEL_H__ -#define __ARM_COMPUTE_NEGEMMAARCH64KERNEL_H__ - -#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" - -namespace arm_compute -{ -class ITensor; - -/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */ -class NEGEMMAArch64Kernel : public NEGEMMAssemblyBaseKernel -{ -public: - const char *name() const override - { - return "NEGEMMAArch64Kernel"; - } - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -protected: - void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_NEGEMMAARCH64KERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h deleted file mode 100644 index ba78aae9f4..0000000000 --- a/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_NEGEMMAARCH64NATIVEKERNEL_H__ -#define __ARM_COMPUTE_NEGEMMAARCH64NATIVEKERNEL_H__ - -#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" - -namespace arm_compute -{ -class ITensor; - -/** Native AArch64 NEON kernel to multiply two input matrices "A" and "B". */ -class NEGEMMAArch64NativeKernel : public NEGEMMAssemblyBaseKernel -{ -public: - const char *name() const override - { - return "NEGEMMAArch64NativeKernel"; - } - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -protected: - void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_NEGEMMAARCH64NATIVEKERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h deleted file mode 100644 index 83c209d48f..0000000000 --- a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_NEGEMMLOWPAARCH64A53KERNEL_H__ -#define __ARM_COMPUTE_NEGEMMLOWPAARCH64A53KERNEL_H__ - -#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" - -// Enable only if compiled for AArch64-V8A targets -#ifdef ARM_COMPUTE_AARCH64_V8A - -namespace arm_compute -{ -class ITensor; - -/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */ -class NEGEMMLowpAArch64A53Kernel : public NEGEMMAssemblyBaseKernel -{ -public: - const char *name() const override - { - return "NEGEMMLowpAArch64A53Kernel"; - } - /** Default constructor */ - NEGEMMLowpAArch64A53Kernel(); - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -protected: - void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override; - -private: - using NEGEMMLowpAArch64A53 = void(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1, - const Window &window, - const ThreadInfo &info); - NEGEMMLowpAArch64A53 *_func; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_AARCH64_V8A */ -#endif /*__ARM_COMPUTE_NEGEMMLOWPAARCH64A53KERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h deleted file mode 100644 index f813242fc9..0000000000 --- a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_NEGEMMLOWPAARCH64KERNEL_H__ -#define __ARM_COMPUTE_NEGEMMLOWPAARCH64KERNEL_H__ - -#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" - -// Enable only if compiled for AArch64-V8A targets -#ifdef ARM_COMPUTE_AARCH64_V8A - -namespace arm_compute -{ -class ITensor; - -/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */ -class NEGEMMLowpAArch64Kernel : public NEGEMMAssemblyBaseKernel -{ -public: - const char *name() const override - { - return "NEGEMMLowpAArch64Kernel"; - } - /** Default constructor */ - NEGEMMLowpAArch64Kernel(); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -protected: - void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override; - -private: - using NEGEMMLowpAArch64 = void(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, - bool is_transposed_1, const Window &window, - const ThreadInfo &info); - NEGEMMLowpAArch64 *_func; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_AARCH64_V8A */ -#endif /*__ARM_COMPUTE_NEGEMMLOWPAARCH64KERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h deleted file mode 100644 index b854d3a9aa..0000000000 --- a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef __ARM_COMPUTE_NEGEMMLOWPAARCH64V8P4KERNEL_H__ -#define __ARM_COMPUTE_NEGEMMLOWPAARCH64V8P4KERNEL_H__ - -#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" - -// Enable only if compiled for AArch64-V8.2-A targets -#ifdef ARM_COMPUTE_AARCH64_V8_2 - -namespace arm_compute -{ -class ITensor; - -/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */ -class NEGEMMLowpAArch64V8P4Kernel : public NEGEMMAssemblyBaseKernel -{ -public: - const char *name() const override - { - return "NEGEMMLowpAArch64V8P4Kernel"; - } - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMAssemblyBaseKernel - * - * The computed function is C = a * AxB + b * C. - * - * @param[in] input0 Input tensor info containing the Matrix A. Data types supported: QASYMM8 - * @param[in] input1 Input tensor info containing the Matrix B. Data types supported: same as @p input0 - * @param[in] output Output tensor info to store the result of matrix multiplication. - * If @p beta is not zero the values are multiplied by @p beta before the result is accumulated. Otherwise the values are overwritten by the result. Data types supported: S32 - */ - static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output); - -protected: - void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_AARCH64_V8_2 */ -#endif /*__ARM_COMPUTE_NEGEMMLOWPAARCH64V8P4KERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h deleted file mode 100644 index 9fb3ce415a..0000000000 --- a/arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_NEGEMVAARCH64KERNEL_H__ -#define __ARM_COMPUTE_NEGEMVAARCH64KERNEL_H__ - -#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" - -namespace arm_compute -{ -class ITensor; - -/** AArch64 NEON kernel to multiply an input vector "A" and a matrix "B". 
*/ -class NEGEMVAArch64Kernel : public NEGEMMAssemblyBaseKernel -{ -public: - const char *name() const override - { - return "NEGEMVAArch64Kernel"; - } - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -protected: - void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_NEGEMVAARCH64KERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h deleted file mode 100644 index 75c4dbdaa4..0000000000 --- a/arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_NEHGEMMAARCH64FP16KERNEL_H__ -#define __ARM_COMPUTE_NEHGEMMAARCH64FP16KERNEL_H__ - -#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" - -namespace arm_compute -{ -class ITensor; - -/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */ -class NEHGEMMAArch64FP16Kernel : public NEGEMMAssemblyBaseKernel -{ -public: - const char *name() const override - { - return "NEHGEMMAArch64FP16Kernel"; - } - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -protected: - void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_NEHGEMMAARCH64FP16KERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapper.h b/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapper.h new file mode 100644 index 0000000000..646cc7861a --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapper.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2018 ARM Limited. 
*
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_H__
+#define __ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Utils.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** This class is a wrapper for the assembly kernels.
+ *
+ * Some kernels were written in assembly and highly optimised for specific CPUs like A53 or A55.
+ * This class works as a wrapper for these assembly kernels. The arm compute library creates an instance
+ * of NEGEMMAssemblyWrapper and other auxiliary data structures to execute a single assembly kernel
+ * in the context of an NEFunction.
+ *
+ * The type T is the type of the actual kernel implemented in assembly, which is of type
+ *         template<typename To, typename Tr> class GemmCommon
+ */
+template <typename T>
+class NEGEMMAssemblyWrapper final : public INEKernel
+{
+public:
+    /** Constructor
+     */
+    NEGEMMAssemblyWrapper() : _kernel(nullptr) {}
+
+    NEGEMMAssemblyWrapper(NEGEMMAssemblyWrapper &) = delete;
+    NEGEMMAssemblyWrapper(NEGEMMAssemblyWrapper &&) = default;
+    NEGEMMAssemblyWrapper & operator=(NEGEMMAssemblyWrapper &) = delete;
+
+    const char *name() const override
+    {
+        return "NEGEMMAssemblyWrapper";
+    }
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel)));
+        ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+        auto first = window.x().start();
+        auto last  = window.x().end();
+        _kernel->execute(first, last, info.thread_id);
+    }
+    /** Initialise the kernel with the assembly kernel to be run.
+     *
+     * @param[in] kernel Pointer to an assembly kernel implementation.
+     */
+    void configure(T *kernel)
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(kernel)));
+        _kernel = kernel;
+        auto win_last = _kernel->get_window_size();
+        Window win;
+        win.set(Window::DimX, Window::Dimension(0, win_last, 1));
+        INEKernel::configure(win);
+    }
+private:
+    T* _kernel;
+};
+
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_H__*/
diff --git a/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp b/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp
new file mode 100644
index 0000000000..d6c9931a21
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <memory>
+
+#include "arm_gemm_local.hpp"
+#include "gemm_common.hpp"
+
+namespace arm_gemm {
+
+template<typename Top, typename Tret>
+using UniqueGemmCommon = std::unique_ptr<GemmCommon<Top, Tret> >;
+
+template<typename Top, typename Tret>
+UniqueGemmCommon<Top, Tret> gemm(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K, const bool trA, const bool trB, const Tret alpha, const Tret beta, const int maxthreads, const bool pretransposed_hint);
+
+} // namespace arm_gemm
diff --git a/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp b/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp
new file mode 100644
index 0000000000..a608566634
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +/* This file is used to configure integration-specific aspects of arm_gemm, this is the gemm-linux version */ + +/* Our CPUInfo is defined in newgemm_lib.hpp */ +#include "newgemm_lib.hpp" diff --git a/arm_compute/core/NEON/kernels/assembly/asmlib.hpp b/arm_compute/core/NEON/kernels/assembly/asmlib.hpp deleted file mode 100644 index fa1d6e37a9..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/asmlib.hpp +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ -// Macro to use in assembler to get a preload. Needed because of various -// workarounds needed to get working preload behaviour. -// -// Code using these macros needs to clobber x20 and x21 as they might be -// used by the workaround. - -#define ASM_PREFETCH(address) "PRFM PLDL1KEEP, " address "\n" -#define ASM_PREFETCHL2(address) "PRFM PLDL2KEEP, " address "\n" -#define ASM_PREFETCHW(address) "PRFM PSTL1KEEP, " address "\n" -#define ASM_PREFETCHWL2(address) "PRFM PSTL2KEEP, " address "\n" - -#else - -#define ASM_PREFETCH(address) "PLD " address "\n" -#define ASM_PREFETCHW(address) "PLDW " address "\n" - -#endif - -/* - * Do some prefetches. 
- */ -template -static inline void prefetch_6x(const T *pfp) { - __asm __volatile ( - ASM_PREFETCH("[%[pfp]]") - ASM_PREFETCH("[%[pfp], #64]") - ASM_PREFETCH("[%[pfp], #128]") - ASM_PREFETCH("[%[pfp], #192]") - ASM_PREFETCH("[%[pfp], #256]") - ASM_PREFETCH("[%[pfp], #320]") - : - : [pfp] "r" (pfp) - : "memory" - ); -} - -template -static inline void prefetch_5x(const T *pfp) { - __asm __volatile ( - ASM_PREFETCH("[%[pfp]]") - ASM_PREFETCH("[%[pfp], #64]") - ASM_PREFETCH("[%[pfp], #128]") - ASM_PREFETCH("[%[pfp], #192]") - ASM_PREFETCH("[%[pfp], #256]") - : - : [pfp] "r" (pfp) - : "memory" - ); -} - -template -static inline void prefetch_4x(const T *pfp) { - __asm __volatile ( - ASM_PREFETCH("[%[pfp]]") - ASM_PREFETCH("[%[pfp], #64]") - ASM_PREFETCH("[%[pfp], #128]") - ASM_PREFETCH("[%[pfp], #192]") - : - : [pfp] "r" (pfp) - : "memory" - ); -} - -template -static inline void prefetch_3x(const T *pfp) { - __asm __volatile ( - ASM_PREFETCH("[%[pfp]]") - ASM_PREFETCH("[%[pfp], #64]") - ASM_PREFETCH("[%[pfp], #128]") - : - : [pfp] "r" (pfp) - : "memory" - ); -} - -template -static inline void prefetch_2x(const T *pfp) { - __asm __volatile ( - ASM_PREFETCH("[%[pfp]]") - ASM_PREFETCH("[%[pfp], #64]") - : - : [pfp] "r" (pfp) - : "memory" - ); -} - -template -static inline void prefetch_1x(const T *pfp) { - __asm __volatile ( - ASM_PREFETCH("[%[pfp]]") - : - : [pfp] "r" (pfp) - : "memory" - ); -} diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp index ef89e3aac3..7f47abcbb9 100644 --- a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp +++ b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -23,11 +23,82 @@ */ #pragma once -// Abstract class for a GEMM function +#include + +namespace arm_gemm { + +// Abstract class for the GEMM/GEMV functions. +// +// GEMM implementations may be "native" (never require any input +// permutation), "pretransposed" (require permutation up-front) or require +// working space (permute as they go along). This interface should support +// all of them. + template class GemmCommon { +protected: + const To *_Aptr=nullptr; + int _lda=0; + const To *_Bptr=nullptr; + int _ldb=0; + Tr *_Cptr=nullptr; + int _ldc=0; + public: - virtual size_t get_working_size() const = 0; - virtual void execute(const To *, const int, const To *, const int, Tr *, const int, const Tr, const Tr, void *working_space) const = 0; + /* Pass in the pointers to the arrays to be operated on and their + * strides. This has a default implementation that just captures them + * all in protected members. If B is pretransposed (see below) then the + * settings for B here are ignored. */ + virtual void set_arrays(const To *A, const int lda, const To *B, const int ldb, Tr *C, const int ldc) { + _Aptr = A; + _lda = lda; + _Bptr = B; + _ldb = ldb; + _Cptr = C; + _ldc = ldc; + } + + /* For threading, we divide the work into some number of units and work + * out internally what unit corresponds to what work. This returns the + * total number of units. */ + virtual unsigned int get_window_size() const = 0; + + /* The maximum thread count is specified when the GEMM is created. Some + * implementations need to know how many threads will actually run in + * order to work properly. + * + * In some cases, after creating the GEMM the number of threads needs to + * be reduced (e.g. 
not enough work to split across threads). This + * method allows the number of actual threads to be run to be set (must + * be equal or lower). + * + * This has an empty default implementation, as GEMMs which don't care + * about thread count can safely ignore this. + */ + virtual void set_nthreads(int nthreads) { }; + + /* Actually do the work. Provide a threadid to index any per-thread + * buffers, and a start/end range to indicate which work to do. */ + virtual void execute(unsigned int start, unsigned int end, int threadid) = 0; + + /*** Working space interface (optional) ***/ + /* Total number of bytes of temporary working space needed. If zero, it's not necessary to call set_working_space(). */ + virtual size_t get_working_size() const { return 0; } + /* Provide working space buffer - the void * passed in must remain allocated for the duration of any execute calls. */ + virtual void set_working_space(void *) { }; + + /*** "Pretransposed" interface (optional) ***/ + /* Is this object set up for pretranspose? If so, pretranspose_array() needs to be called before execute(); */ + virtual bool B_is_pretransposed() const { return false; } + /* Does pretranspose still need to be done? */ + virtual bool B_pretranspose_required() const { return false; } + /* Total number of bytes of space needed for pretransposed arrays. */ + virtual size_t get_B_pretransposed_array_size() const { return 0; } + /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */ + virtual void pretranspose_B_array(void *buffer, const To *, const int) { }; + + // Destructor virtual ~GemmCommon() { } }; + +} // namespace arm_gemm diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp deleted file mode 100644 index 659ef837f5..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#include -#include - -#include "gemm_common.hpp" -#include "profiler.hpp" -#include "transform.hpp" -#include "mergeresults.hpp" - -// Some macros used to decide how much working space to allocate. -// Round allocations up to the next cache line. 
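/* Illustrative sketch (not part of this patch): one way a caller might drive the
 * GemmCommon interface defined in gemm_common.hpp above. The helper name is
 * hypothetical; the buffer handling simply follows the lifetime rules stated in
 * the interface comments (working space and pretransposed storage must remain
 * allocated for the duration of any execute() calls). */
#include <vector>
#include "gemm_common.hpp"

template <typename To, typename Tr>
void run_gemm_single_threaded(arm_gemm::GemmCommon<To, Tr> &gemm,
                              const To *A, int lda, const To *B, int ldb,
                              Tr *C, int ldc) {
    gemm.set_arrays(A, lda, B, ldb, C, ldc);

    // Optional temporary working space.
    std::vector<char> workspace(gemm.get_working_size());
    if (!workspace.empty()) {
        gemm.set_working_space(workspace.data());
    }

    // Optional up-front permutation of B.
    std::vector<char> pretransposed(gemm.get_B_pretransposed_array_size());
    if (gemm.B_pretranspose_required()) {
        gemm.pretranspose_B_array(pretransposed.data(), B, ldb);
    }

    // Single-threaded: run the whole window as thread 0.
    gemm.set_nthreads(1);
    gemm.execute(0, gemm.get_window_size(), 0);
}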
-#define ALLOC_ROUND 64 -#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND) - -// Implementation of the GemmCommon abstract class. -// -// This implementation interleaves the source matrices in blocks - good for -// larger matrices. -template -class GemmInterleaved : public GemmCommon { - typedef typename strategy::operand_type Toi; - typedef typename strategy::result_type Tri; - - const unsigned int M; - const unsigned int N; - const unsigned int K; - - const bool trA; - const bool trB; - - const strategy strat; - - unsigned int k_block = 0; - unsigned int x_block = 0; - unsigned int Mround = 0; - - size_t get_a_working_size() const { - return ROUND_UP(sizeof(Toi) * k_block * Mround); - } - - size_t get_b_working_size() const { - return ROUND_UP(sizeof(Toi) * x_block * k_block); - } - - size_t get_c_working_size() const { - return ROUND_UP(sizeof(Tri) * x_block * strat.out_height); - } - -public: - size_t get_working_size() const override { - return get_a_working_size() + get_b_working_size() + get_c_working_size(); - } - - GemmInterleaved(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K, const bool trA, const bool trB) : M(M), N(N), K(K), trA(trA), trB(trB), strat(ci) { - const unsigned int L1_size = ci->L1_size; - const unsigned int L2_size = ci->L2_size; - - // Work out blocking parameters - // k_block: Each iteration will consume (out_width + out_height) - // operands - so how many iterations will fill the L1? - k_block = L1_size / (sizeof(Toi) * (strat.out_width + strat.out_height)); - - // Needs to be a multiple of the K unroll level. - k_block /= strat.k_unroll; - k_block *= strat.k_unroll; - - // Now tune to presented problem size; this is how many blocks we need. - int num_k_blocks = (K + (k_block - 1)) / k_block; - - // So divide the space equally into that many blocks. - k_block = (K + num_k_blocks - 1) / num_k_blocks; - - // And round UP to the K unroll level required. - k_block = (k_block + strat.k_unroll - 1) / strat.k_unroll; - k_block *= strat.k_unroll; - - // x_block: Work out how many rows (of length k_block) will fit in the L2 - x_block = L2_size / (sizeof(Toi) * k_block); - - // Needs to be a multiple of the kernel output width. - x_block /= strat.out_width; - x_block *= strat.out_width; - - // And tune to the presented problem size. - int num_x_blocks = (N + (x_block - 1)) / x_block; - x_block = (N + num_x_blocks - 1) / num_x_blocks; - - x_block = (x_block + strat.out_width - 1) / strat.out_width; - x_block *= strat.out_width; - - // Work out the rounded size of M - needed for some buffers. - Mround = (M + (strat.out_height - 1)) / strat.out_height; - Mround *= strat.out_height; - - } - - // Actually execute the GEMM. 
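    /* Worked example of the blocking computed in the constructor above (the cache
     * size and problem size are illustrative assumptions, not values from this
     * patch). For float operands with a 12x8 kernel (out_width=12, out_height=8,
     * k_unroll=1), a 32 kB L1 and K=1000:
     *
     *   k_block      = 32768 / (4 * (12 + 8))  = 409   // fills the L1
     *   num_k_blocks = ceil(1000 / 409)        = 3
     *   k_block      = ceil(1000 / 3)          = 334   // evened out; already a multiple of k_unroll
     *
     * so K is swept in three roughly equal slices, each small enough that one
     * slice of A rows plus one slice of B columns fits in the L1 together. */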
- void execute(const To *A, const int lda, const To *B, const int ldb, Tr *C, const int ldc, const Tr alpha, const Tr beta, void *working_space) const override { - assert(working_space); - profiler prof; - int8_t *working_space_bytes = reinterpret_cast(working_space); - intptr_t working_space_int = reinterpret_cast(working_space_bytes); - size_t diff = 0; - - if (working_space_int & 0xF) { - diff = 0x10 - (working_space_int & 0xF); - } - - Toi * const a_panel = reinterpret_cast(working_space_bytes + diff); - Toi * const b_panel = reinterpret_cast(working_space_bytes + get_a_working_size() + diff); - Tri * const c_panel = reinterpret_cast(working_space_bytes + get_a_working_size() + get_b_working_size() + diff); - - for (unsigned int k0=0; k0 K) kmax = K; - - // Figure out how many "K" the kernel will actually process. - int kern_k = ((kmax - k0) + (strat.k_unroll - 1)) / strat.k_unroll; - kern_k *= strat.k_unroll; - - prof(PROFILE_PREPA, (M * (kmax-k0) * sizeof(Toi)), [&](void) { - if (trA ^ strategy::A_transpose) { - Transform(a_panel, A, lda, 0, M, k0, kmax); - } else { - Transform(a_panel, A, lda, 0, M, k0, kmax); - } - }); - - for (unsigned int x0=0; x0 N) xmax = N; - - int bblocks = (xmax - x0 + strat.out_width - 1) / strat.out_width; - - prof(PROFILE_PREPB, (xmax-x0) * (kmax-k0) * sizeof(Toi), [&](void) { - if (trB ^ strategy::B_transpose) { - Transform(b_panel, B, ldb, x0, xmax, k0, kmax); - } else { - Transform(b_panel, B, ldb, x0, xmax, k0, kmax); - } - }); - - for (unsigned int y=0; y M) ymax = M; - - prof(PROFILE_KERNEL, (strat.out_height * bblocks * strat.out_width * kern_k), [&](void) { strat.kernel(a_panel + (y * kern_k), b_panel, c_panel, 1, bblocks, kern_k); }); - prof(PROFILE_MERGE, (strat.out_height * bblocks * strat.out_width * sizeof(Tr)), [&](void) { MergeResults(C, c_panel, ldc, y, ymax, x0, xmax, alpha, (k0==0 ? beta : static_cast(1))); }); - } - } - } - } -}; diff --git a/arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp b/arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp deleted file mode 100644 index 098fdaa7ac..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#include - -#include "gemm_common.hpp" - -#include "profiler.hpp" -#include "transform.hpp" -#include "mergeresults.hpp" - -// Some macros used to decide how much working space to allocate. -// Round allocations up to the next cache line. -#define ALLOC_ROUND 64 -#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND) - -// Implementation of the GemmCommon abstract class. -// -// This is implementation is for GEMV with a transposed matrix. -// -// By default the source data is used in-place, but if type conversion is -// needed we need to allocate working space (CURRENTLY NOT IMPLEMENTED). - -template -class GemvTransposed : public GemmCommon { - typedef typename strategy::operand_type Toi; - typedef typename strategy::result_type Tri; - - const unsigned int N; - const unsigned int K; - - const strategy strat; - - unsigned int m_block; - unsigned int n_block; - - size_t get_a_working_size() const { - return ROUND_UP(sizeof(Toi) * m_block); - } - - size_t get_b_working_size() const { - return ROUND_UP(sizeof(Toi) * m_block * n_block); - } - - size_t get_c_working_size() const { - return ROUND_UP(sizeof(Tri) * n_block); - } - -public: - size_t get_working_size() const override { - return get_a_working_size() + get_b_working_size() + get_c_working_size(); - } - - GemvTransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K) : N(N), K(K), strat(ci) { - /* For now don't do any blocking. TODO: figure out if we should. */ - m_block = K; - n_block = N; - } - - // Actually execute the GEMV. - void execute(const To *A, const int lda, const To *B, const int ldb, Tr *C, const int ldc, const Tr alpha, const Tr beta, void *working_space) const override { - profiler prof; - - static_assert(std::is_same::value, "gemv_transposed: Operand types must be the same."); - static_assert(std::is_same::value, "gemv_transposed: Result types must be the same."); - - for (unsigned int m0=0; m0 K) mmax = K; - - for (unsigned int n0=0; n0 N) nmax = N; - - prof(PROFILE_KERNEL, ((mmax-m0) * (nmax-n0)), [&](void) { strat.kernel(B + (m0 * ldb) + n0, A + m0, C + n0, alpha, ldb, (mmax-m0), (nmax-n0)); }); - } - } - } -}; diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp deleted file mode 100644 index d78d33c647..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __arm__ - -// Actual kernel implementations -#include "a32_sgemm_8x6/a53.hpp" -#include "a32_sgemm_8x6/a55r1.hpp" -#include "a32_sgemm_8x6/generic.hpp" - -// 8x6 SGEMM "strategy" class. -// -// This describes the characteristics of a family of kernels, in terms of -// the required interleave properties and the output block size. -// -// All kernels in the family must share these characteristics. The actual -// kernel to be used can be chosen at runtime, based on the CPU_type -// structure. -class sgemm_8x6 { -public: - typedef float operand_type; - typedef float result_type; - - typedef void (*kern_type)(const float *, const float *, float *, int, int, int); - - /* Describes the data layout for A input */ - static const int A_interleave = 6; - static const int A_block = 1; - static const int A_transpose = 0; - - /* Same for B input */ - static const int B_interleave = 8; - static const int B_block = 1; - static const int B_transpose = 1; - - /* Kernel blocking parameters */ - static const int out_width = 8; - static const int out_height = 6; - static const int k_unroll = 1; - - kern_type kernel = nullptr; - - sgemm_8x6(const CPUInfo *ci) { - switch(ci->CPU) { - case CPUTarget::A53: - kernel = a32_sgemm_8x6_a53; - break; - - case CPUTarget::A55_DOT: - kernel = a32_sgemm_8x6_a55r1; - break; - - default: - kernel = a32_sgemm_8x6; - break; - } - } -}; - -#endif // __arm__ diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a53.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a53.hpp deleted file mode 100644 index 6bfbfc8742..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a53.hpp +++ /dev/null @@ -1,410 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __arm__ - -#include - -#include "../../asmlib.hpp" - -// Kernel implementation. -// -// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order. -// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order. 
-// Assume that "Cpanel" points to a chunk of C output blocks (each size -// 8x6), the chunks being arranged in a row major fashion. -// -// Note that the intent of this is that either ablocks or bblocks will be 1 -// - this construction allows the output loop to proceed in either order. - -inline void a32_sgemm_8x6_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const float *a_ptr = Apanel; - float *c_ptr = Cpanel; - - printf("CIAO SONO IO, AMORE MIO!\n"); - - for (int yb=0; yb - -#include "../../asmlib.hpp" - -// Kernel implementation. -// -// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order. -// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order. -// Assume that "Cpanel" points to a chunk of C output blocks (each size -// 8x6), the chunks being arranged in a row major fashion. -// -// Note that the intent of this is that either ablocks or bblocks will be 1 -// - this construction allows the output loop to proceed in either order. - -inline void a32_sgemm_8x6_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const float *a_ptr = Apanel; - float *c_ptr = Cpanel; - - /* Work out starting values for "k" and "tails" in the inner loop. */ - int tails_initial = (K & 3); - if (tails_initial == 0) { - tails_initial = 4; - } - - int k_initial = ((K+3)/4) - 1; - - for (int yb=0; yb - -// Kernel implementation. -// -// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order. -// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order. -// Assume that "Cpanel" points to a chunk of C output blocks (each size -// 8x6), the chunks being arranged in a row major fashion. -// -// Note that the intent of this is that either ablocks or bblocks will be 1 -// - this construction allows the output loop to proceed in either order. 
- -inline void a32_sgemm_8x6(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const float *a_ptr = Apanel; - float *c_ptr = Cpanel; - - for (int yb=0; yb - -inline void a64_gemm_s16_asimd_12x8(const int16_t *Apanel, const int16_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) -{ - const int16_t *a_ptr = Apanel; - int32_t *c_ptr = Cpanel; - for (int yb = 0; yb < ablocks; yb++) - { - const int16_t *a_ptr0 = a_ptr; - const int16_t *b_ptr = Bpanel; - - for (int xb = 0; xb < bblocks; xb++) - { - a_ptr = a_ptr0; - const bool odd_k = K & 0x1; - int k = (K+1)/2 - 1; - - register int16x8_t aa asm("v0"); - register int16x8_t ab asm("v1"); - register int16x8_t b0 asm("v2"); - register int16x8_t b1 asm("v3"); - register int16x8_t b2 asm("v4"); - - __asm __volatile ( - "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower - "movi v5.4s, #0\n" - "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper - "movi v6.4s, #0\n" - "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower - "ins %[aa].d[1], x20\n" // Merge A[A].lower and upper - "movi v7.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v8.4s, #0\n" - "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper - "movi v9.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v10.4s, #0\n" - "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower - "ins %[b0].d[1], x20\n" // Merge B[0].lower and upper - "movi v11.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #96]") - "movi v12.4s, #0\n" - "movi v13.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #96]") - "movi v14.4s, #0\n" - "movi v15.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #128]") - "movi v16.4s, #0\n" - "movi v17.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v18.4s, #0\n" - "movi v19.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #160]") - "movi v20.4s, #0\n" - "movi v21.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #160]") - "movi v22.4s, #0\n" - "movi v23.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #192]") - "movi v24.4s, #0\n" - "add %x[a_ptr], %x[a_ptr], #0x10\n" - "movi v25.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v26.4s, #0\n" - "add %x[b_ptr], %x[b_ptr], #0x18\n" - "movi v27.4s, #0\n" - "movi v28.4s, #0\n" - - "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations. 
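            // Loop structure: "1:" processes two K steps per iteration (hence
            // k = (K+1)/2 - 1 and odd_k = K & 1 above); "2:" is the even tail,
            // finishing the last pair of K steps, and "3:" is the odd tail,
            // handling the single leftover step before the write-out at "4:".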
- - "1:\n" // Main loop - // First unroll - "smlal v5.4s, %[b0].4h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper - "smlal v6.4s, %[b0].4h, %[aa].h[1]\n" - "smlal v7.4s, %[b0].4h, %[aa].h[2]\n" - "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower - "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper - "smlal v8.4s, %[b0].4h, %[aa].h[3]\n" - "smlal v9.4s, %[b0].4h, %[aa].h[4]\n" - "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper - "smlal v10.4s, %[b0].4h, %[aa].h[5]\n" - "smlal v11.4s, %[b0].4h, %[aa].h[6]\n" - "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower - "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper - "smlal v12.4s, %[b0].4h, %[aa].h[7]\n" - "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper - "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" - "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" - "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" - "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" - "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" - "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" - "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" - "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower - "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper - "smlal v21.4s, %[b1].4h, %[aa].h[0]\n" - "smlal v22.4s, %[b1].4h, %[aa].h[1]\n" - "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper - "smlal v23.4s, %[b1].4h, %[aa].h[2]\n" - "smlal v24.4s, %[b1].4h, %[aa].h[3]\n" - "smlal v25.4s, %[b1].4h, %[aa].h[4]\n" - "smlal v26.4s, %[b1].4h, %[aa].h[5]\n" - "smlal v27.4s, %[b1].4h, %[aa].h[6]\n" - "smlal v28.4s, %[b1].4h, %[aa].h[7]\n" - - // Second unroll - "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n" - "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower - "ins %[b0].d[1], x20\n" // Merge B[0].lower and .upper - "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n" - "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n" - "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper - "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n" - "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n" - "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n" - "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n" - "add %x[a_ptr], %x[a_ptr], #0x20\n" - "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n" - "smlal v13.4s, %[b2].4h, %[ab].h[0]\n" - ASM_PREFETCH("[%[b_ptr], #320]") - "smlal v14.4s, %[b2].4h, %[ab].h[1]\n" - "smlal v15.4s, %[b2].4h, %[ab].h[2]\n" - ASM_PREFETCH("[%[a_ptr], #320]") - "smlal v16.4s, %[b2].4h, %[ab].h[3]\n" - "smlal v17.4s, %[b2].4h, %[ab].h[4]\n" - ASM_PREFETCH("[%[b_ptr], #448]") - "smlal v18.4s, %[b2].4h, %[ab].h[5]\n" - "smlal v19.4s, %[b2].4h, %[ab].h[6]\n" - "smlal v20.4s, %[b2].4h, %[ab].h[7]\n" - "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n" - "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n" - "subs %x[k], %x[k], #0x1\n" - "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n" - "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n" - "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower - "ins %[aa].d[1], x20\n" // Merge A[A].lower and .upper - "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n" - "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n" - "add %x[b_ptr], %x[b_ptr], #0x30\n" - "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n" - "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n" - "bne 1b\n" - - "2:\n" // Even tail - "cbnz %x[odd_k], 3f\n" - - "smlal v5.4s, %[b0].4h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper - "smlal v6.4s, %[b0].4h, %[aa].h[1]\n" - "smlal v7.4s, %[b0].4h, %[aa].h[2]\n" - "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower - "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper - "smlal v8.4s, %[b0].4h, %[aa].h[3]\n" - "smlal v9.4s, %[b0].4h, %[aa].h[4]\n" - "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper - "smlal v10.4s, 
%[b0].4h, %[aa].h[5]\n" - "smlal v11.4s, %[b0].4h, %[aa].h[6]\n" - "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower - "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper - "smlal v12.4s, %[b0].4h, %[aa].h[7]\n" - "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper - "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" - "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" - "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" - "add %[a_ptr], %[a_ptr], #0x10\n" - "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" - "add %[b_ptr], %[b_ptr], #0x18\n" - "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" - "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" - "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" - "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper - "smlal v21.4s, %[b1].4h, %[aa].h[0]\n" - "smlal v22.4s, %[b1].4h, %[aa].h[1]\n" - "smlal v23.4s, %[b1].4h, %[aa].h[2]\n" - "smlal v24.4s, %[b1].4h, %[aa].h[3]\n" - "smlal v25.4s, %[b1].4h, %[aa].h[4]\n" - "smlal v26.4s, %[b1].4h, %[aa].h[5]\n" - "smlal v27.4s, %[b1].4h, %[aa].h[6]\n" - "smlal v28.4s, %[b1].4h, %[aa].h[7]\n" - - "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n" - "smlal v13.4s, %[b2].4h, %[ab].h[0]\n" - "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n" - "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n" - "smlal v14.4s, %[b2].4h, %[ab].h[1]\n" - "str q5, [%x[c_ptr]]\n" - "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n" - "str q13, [%x[c_ptr], #0x10]\n" - "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n" - "str q21, [%x[c_ptr], #0x20]\n" - "smlal v15.4s, %[b2].4h, %[ab].h[2]\n" - "str q6, [%x[c_ptr], #0x30]\n" - "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n" - "str q14, [%x[c_ptr], #0x40]\n" - "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n" - "str q22, [%x[c_ptr], #0x50]\n" - "smlal v16.4s, %[b2].4h, %[ab].h[3]\n" - "str q7, [%x[c_ptr], #0x60]\n" - "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n" - "str q15, [%x[c_ptr], #0x70]\n" - "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n" - "str q23, [%x[c_ptr], #0x80]\n" - "smlal v17.4s, %[b2].4h, %[ab].h[4]\n" - "str q8, [%x[c_ptr], #0x90]\n" - "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n" - "str q16, [%x[c_ptr], #0xa0]\n" - "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n" - "str q24, [%x[c_ptr], #0xb0]\n" - "smlal v18.4s, %[b2].4h, %[ab].h[5]\n" - "str q9, [%x[c_ptr], #0xc0]\n" - "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n" - "str q17, [%x[c_ptr], #0xd0]\n" - "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n" - "str q25, [%x[c_ptr], #0xe0]\n" - "smlal v19.4s, %[b2].4h, %[ab].h[6]\n" - "str q10, [%x[c_ptr], #0xf0]\n" - "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n" - "str q18, [%x[c_ptr], #0x100]\n" - "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n" - "str q26, [%x[c_ptr], #0x110]\n" - "smlal v20.4s, %[b2].4h, %[ab].h[7]\n" - "str q11, [%x[c_ptr], #0x120]\n" - "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n" - "str q19, [%x[c_ptr], #0x130]\n" - "b 4f\n" // Complete write out - - "3:\n" // Odd tail - "smlal v5.4s, %[b0].4h, %[aa].h[0]\n" - "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" - "smlal v21.4s, %[b1].4h, %[aa].h[0]\n" - "smlal v6.4s, %[b0].4h, %[aa].h[1]\n" - "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" - "smlal v22.4s, %[b1].4h, %[aa].h[1]\n" - "str q5, [%x[c_ptr]]\n" - "smlal v7.4s, %[b0].4h, %[aa].h[2]\n" - "str q13, [%x[c_ptr], #0x10]\n" - "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" - "str q21, [%x[c_ptr], #0x20]\n" - "smlal v23.4s, %[b1].4h, %[aa].h[2]\n" - "str q6, [%x[c_ptr], #0x30]\n" - "smlal v8.4s, %[b0].4h, %[aa].h[3]\n" - "str q14, [%x[c_ptr], #0x40]\n" - "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" - "str q22, [%x[c_ptr], #0x50]\n" - "smlal v24.4s, %[b1].4h, %[aa].h[3]\n" - "str q7, [%x[c_ptr], #0x60]\n" - "smlal v9.4s, %[b0].4h, %[aa].h[4]\n" - 
"str q15, [%x[c_ptr], #0x70]\n" - "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" - "str q23, [%x[c_ptr], #0x80]\n" - "smlal v25.4s, %[b1].4h, %[aa].h[4]\n" - "str q8, [%x[c_ptr], #0x90]\n" - "smlal v10.4s, %[b0].4h, %[aa].h[5]\n" - "str q16, [%x[c_ptr], #0xa0]\n" - "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" - "str q24, [%x[c_ptr], #0xb0]\n" - "smlal v26.4s, %[b1].4h, %[aa].h[5]\n" - "str q9, [%x[c_ptr], #0xc0]\n" - "smlal v11.4s, %[b0].4h, %[aa].h[6]\n" - "str q17, [%x[c_ptr], #0xd0]\n" - "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" - "str q25, [%x[c_ptr], #0xe0]\n" - "smlal v27.4s, %[b1].4h, %[aa].h[6]\n" - "str q10, [%x[c_ptr], #0xf0]\n" - "smlal v12.4s, %[b0].4h, %[aa].h[7]\n" - "str q18, [%x[c_ptr], #0x100]\n" - "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" - "str q26, [%x[c_ptr], #0x110]\n" - "smlal v28.4s, %[b1].4h, %[aa].h[7]\n" - "str q11, [%x[c_ptr], #0x120]\n" - - "4:\n" // End of function - "str q19, [%x[c_ptr], #0x130]\n" - "str q27, [%x[c_ptr], #0x140]\n" - "str q12, [%x[c_ptr], #0x150]\n" - "str q20, [%x[c_ptr], #0x160]\n" - "str q28, [%x[c_ptr], #0x170]\n" - "add %x[c_ptr], %x[c_ptr], #0x180\n" - : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), - [aa] "+w" (aa), [ab] "+w" (ab), [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2) - : [odd_k] "r" (odd_k) - : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc" - ); - } - } -} diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp deleted file mode 100644 index 88cbb361b3..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#ifdef __aarch64__ - -// Load the actual kernel -#include "a64_gemm_s8_12x8/generic.hpp" - -class gemm_s8_12x8 { -public: - typedef int8_t operand_type; - typedef int32_t result_type; - - typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); - - /* Describes the data layout for A input */ - static const int A_interleave = 8; - static const int A_block = 4; - static const bool A_transpose = false; - - /* Same for B input */ - static const int B_interleave = 12; - static const int B_block = 4; - static const bool B_transpose = true; - - /* Kernel blocking parameters */ - static const int out_width = 12; - static const int out_height = 8; - static const int k_unroll = 4; - - kern_type kernel = nullptr; - - gemm_s8_12x8(const CPUInfo *ci) { - kernel = a64_gemm_s8_12x8; - } -}; - -#endif // __aarch64__ - diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp deleted file mode 100644 index 4ac2ba4234..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp +++ /dev/null @@ -1,398 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include -#include "dot_toolchain_support.h" -#include - -void a64_gemm_s8_12x8_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { - assert(Apanel); - assert(Bpanel); - assert(Cpanel); - const int8_t *a_ptr = Apanel; - int32_t *c_ptr = Cpanel; - // We divide K by 4 because the sdot instruction processes 4 elements at a time. - const int W = K/4; - // Fix up for odd lengths - set a flag if K is odd, but make. - // sure we round up the iteration count. 
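    // For reference: each SDOT instruction used by the kernel below accumulates a
    // four-element signed 8-bit dot product into one 32-bit lane, roughly
    // acc += a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3]. That is why W = K/4
    // here and why the panels are blocked four deep (A_block = B_block = 4 in the
    // gemm_s8_12x8 strategy above).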
- const int oddk = (W & 1); - const int init_value_k = ((W+1)/2) - 1; - for (int yb=0; yb -#include "dot_toolchain_support.h" -#include - - -inline void a64_gemm_s8_12x8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { - assert(Apanel); - assert(Bpanel); - assert(Cpanel); - K/=4; - const long int row_jump=0; - const long int block_jump=0; - const int32_t *a_ptr = reinterpret_cast(Apanel); - int32_t *c_ptr = reinterpret_cast(Cpanel); - for (int yb=0; yb(Bpanel); - for (int xb=0; xb - -inline void a64_gemm_s8_4x4(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { - const int8_t *a_ptr = Apanel; - int32_t *c_ptr = Cpanel; - K /= 16; - int oddk = (K & 1); - - for (int yb=0; yb - -inline void a64_gemm_u16_asimd_12x8(const uint16_t *Apanel, const uint16_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) -{ - const uint16_t *a_ptr = Apanel; - uint32_t *c_ptr = Cpanel; - - for (int yb = 0; yb < ablocks; yb++) - { - const uint16_t *a_ptr0 = a_ptr; - const uint16_t *b_ptr = Bpanel; - - for (int xb = 0; xb < bblocks; xb++) - { - a_ptr = a_ptr0; - const bool odd_k = K & 0x1; - int k = (K+1)/2 - 1; - - register uint16x8_t aa asm("v0"); - register uint16x8_t ab asm("v1"); - register uint16x8_t b0 asm("v2"); - register uint16x8_t b1 asm("v3"); - register uint16x8_t b2 asm("v4"); - - __asm __volatile ( - "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower - "movi v5.4s, #0\n" - "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper - "movi v6.4s, #0\n" - "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower - "ins %[aa].d[1], x20\n" // Merge A[A].lower and upper - "movi v7.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #64]") - "movi v8.4s, #0\n" - "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper - "movi v9.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #64]") - "movi v10.4s, #0\n" - "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower - "ins %[b0].d[1], x20\n" // Merge B[0].lower and upper - "movi v11.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #96]") - "movi v12.4s, #0\n" - "movi v13.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #96]") - "movi v14.4s, #0\n" - "movi v15.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #128]") - "movi v16.4s, #0\n" - "movi v17.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #128]") - "movi v18.4s, #0\n" - "movi v19.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #160]") - "movi v20.4s, #0\n" - "movi v21.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #160]") - "movi v22.4s, #0\n" - "movi v23.4s, #0\n" - ASM_PREFETCH("[%[a_ptr], #192]") - "movi v24.4s, #0\n" - "add %x[a_ptr], %x[a_ptr], #0x10\n" - "movi v25.4s, #0\n" - ASM_PREFETCH("[%[b_ptr], #192]") - "movi v26.4s, #0\n" - "add %x[b_ptr], %x[b_ptr], #0x18\n" - "movi v27.4s, #0\n" - "movi v28.4s, #0\n" - - "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations. 
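            // Same two-K-steps-per-iteration structure as the signed 16-bit kernel
            // above, with UMLAL/UMLAL2 in place of SMLAL/SMLAL2; "2:" and "3:" are
            // again the even and odd K tails.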
- - "1:\n" // Main loop - // First unroll - "umlal v5.4s, %[b0].4h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper - "umlal v6.4s, %[b0].4h, %[aa].h[1]\n" - "umlal v7.4s, %[b0].4h, %[aa].h[2]\n" - "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower - "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper - "umlal v8.4s, %[b0].4h, %[aa].h[3]\n" - "umlal v9.4s, %[b0].4h, %[aa].h[4]\n" - "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper - "umlal v10.4s, %[b0].4h, %[aa].h[5]\n" - "umlal v11.4s, %[b0].4h, %[aa].h[6]\n" - "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower - "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper - "umlal v12.4s, %[b0].4h, %[aa].h[7]\n" - "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper - "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" - "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" - "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" - "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" - "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" - "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" - "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" - "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower - "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper - "umlal v21.4s, %[b1].4h, %[aa].h[0]\n" - "umlal v22.4s, %[b1].4h, %[aa].h[1]\n" - "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper - "umlal v23.4s, %[b1].4h, %[aa].h[2]\n" - "umlal v24.4s, %[b1].4h, %[aa].h[3]\n" - "umlal v25.4s, %[b1].4h, %[aa].h[4]\n" - "umlal v26.4s, %[b1].4h, %[aa].h[5]\n" - "umlal v27.4s, %[b1].4h, %[aa].h[6]\n" - "umlal v28.4s, %[b1].4h, %[aa].h[7]\n" - - // Second unroll - "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n" - "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower - "ins %[b0].d[1], x20\n" // Merge B[0].lower and .upper - "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n" - "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n" - "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper - "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n" - "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n" - "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n" - "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n" - "add %x[a_ptr], %x[a_ptr], #0x20\n" - "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n" - "umlal v13.4s, %[b2].4h, %[ab].h[0]\n" - ASM_PREFETCH("[%[b_ptr], #320]") - "umlal v14.4s, %[b2].4h, %[ab].h[1]\n" - "umlal v15.4s, %[b2].4h, %[ab].h[2]\n" - ASM_PREFETCH("[%[a_ptr], #320]") - "umlal v16.4s, %[b2].4h, %[ab].h[3]\n" - "umlal v17.4s, %[b2].4h, %[ab].h[4]\n" - ASM_PREFETCH("[%[b_ptr], #448]") - "umlal v18.4s, %[b2].4h, %[ab].h[5]\n" - "umlal v19.4s, %[b2].4h, %[ab].h[6]\n" - "umlal v20.4s, %[b2].4h, %[ab].h[7]\n" - "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n" - "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n" - "subs %x[k], %x[k], #0x1\n" - "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n" - "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n" - "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower - "ins %[aa].d[1], x20\n" // Merge A[A].lower and .upper - "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n" - "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n" - "add %x[b_ptr], %x[b_ptr], #0x30\n" - "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n" - "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n" - "bne 1b\n" - - "2:\n" // Even tail - "cbnz %x[odd_k], 3f\n" - - "umlal v5.4s, %[b0].4h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper - "umlal v6.4s, %[b0].4h, %[aa].h[1]\n" - "umlal v7.4s, %[b0].4h, %[aa].h[2]\n" - "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower - "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper - "umlal v8.4s, %[b0].4h, %[aa].h[3]\n" - "umlal v9.4s, %[b0].4h, %[aa].h[4]\n" - "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper - "umlal v10.4s, 
%[b0].4h, %[aa].h[5]\n" - "umlal v11.4s, %[b0].4h, %[aa].h[6]\n" - "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower - "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper - "umlal v12.4s, %[b0].4h, %[aa].h[7]\n" - "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" - "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper - "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" - "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" - "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" - "add %[a_ptr], %[a_ptr], #0x10\n" - "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" - "add %[b_ptr], %[b_ptr], #0x18\n" - "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" - "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" - "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" - "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper - "umlal v21.4s, %[b1].4h, %[aa].h[0]\n" - "umlal v22.4s, %[b1].4h, %[aa].h[1]\n" - "umlal v23.4s, %[b1].4h, %[aa].h[2]\n" - "umlal v24.4s, %[b1].4h, %[aa].h[3]\n" - "umlal v25.4s, %[b1].4h, %[aa].h[4]\n" - "umlal v26.4s, %[b1].4h, %[aa].h[5]\n" - "umlal v27.4s, %[b1].4h, %[aa].h[6]\n" - "umlal v28.4s, %[b1].4h, %[aa].h[7]\n" - - "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n" - "umlal v13.4s, %[b2].4h, %[ab].h[0]\n" - "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n" - "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n" - "umlal v14.4s, %[b2].4h, %[ab].h[1]\n" - "str q5, [%x[c_ptr]]\n" - "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n" - "str q13, [%x[c_ptr], #0x10]\n" - "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n" - "str q21, [%x[c_ptr], #0x20]\n" - "umlal v15.4s, %[b2].4h, %[ab].h[2]\n" - "str q6, [%x[c_ptr], #0x30]\n" - "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n" - "str q14, [%x[c_ptr], #0x40]\n" - "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n" - "str q22, [%x[c_ptr], #0x50]\n" - "umlal v16.4s, %[b2].4h, %[ab].h[3]\n" - "str q7, [%x[c_ptr], #0x60]\n" - "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n" - "str q15, [%x[c_ptr], #0x70]\n" - "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n" - "str q23, [%x[c_ptr], #0x80]\n" - "umlal v17.4s, %[b2].4h, %[ab].h[4]\n" - "str q8, [%x[c_ptr], #0x90]\n" - "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n" - "str q16, [%x[c_ptr], #0xa0]\n" - "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n" - "str q24, [%x[c_ptr], #0xb0]\n" - "umlal v18.4s, %[b2].4h, %[ab].h[5]\n" - "str q9, [%x[c_ptr], #0xc0]\n" - "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n" - "str q17, [%x[c_ptr], #0xd0]\n" - "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n" - "str q25, [%x[c_ptr], #0xe0]\n" - "umlal v19.4s, %[b2].4h, %[ab].h[6]\n" - "str q10, [%x[c_ptr], #0xf0]\n" - "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n" - "str q18, [%x[c_ptr], #0x100]\n" - "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n" - "str q26, [%x[c_ptr], #0x110]\n" - "umlal v20.4s, %[b2].4h, %[ab].h[7]\n" - "str q11, [%x[c_ptr], #0x120]\n" - "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n" - "str q19, [%x[c_ptr], #0x130]\n" - "b 4f\n" // Complete write out - - "3:\n" // Odd tail - "umlal v5.4s, %[b0].4h, %[aa].h[0]\n" - "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" - "umlal v21.4s, %[b1].4h, %[aa].h[0]\n" - "umlal v6.4s, %[b0].4h, %[aa].h[1]\n" - "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" - "umlal v22.4s, %[b1].4h, %[aa].h[1]\n" - "str q5, [%x[c_ptr]]\n" - "umlal v7.4s, %[b0].4h, %[aa].h[2]\n" - "str q13, [%x[c_ptr], #0x10]\n" - "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" - "str q21, [%x[c_ptr], #0x20]\n" - "umlal v23.4s, %[b1].4h, %[aa].h[2]\n" - "str q6, [%x[c_ptr], #0x30]\n" - "umlal v8.4s, %[b0].4h, %[aa].h[3]\n" - "str q14, [%x[c_ptr], #0x40]\n" - "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" - "str q22, [%x[c_ptr], #0x50]\n" - "umlal v24.4s, %[b1].4h, %[aa].h[3]\n" - "str q7, [%x[c_ptr], #0x60]\n" - "umlal v9.4s, %[b0].4h, %[aa].h[4]\n" - 
"str q15, [%x[c_ptr], #0x70]\n" - "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" - "str q23, [%x[c_ptr], #0x80]\n" - "umlal v25.4s, %[b1].4h, %[aa].h[4]\n" - "str q8, [%x[c_ptr], #0x90]\n" - "umlal v10.4s, %[b0].4h, %[aa].h[5]\n" - "str q16, [%x[c_ptr], #0xa0]\n" - "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" - "str q24, [%x[c_ptr], #0xb0]\n" - "umlal v26.4s, %[b1].4h, %[aa].h[5]\n" - "str q9, [%x[c_ptr], #0xc0]\n" - "umlal v11.4s, %[b0].4h, %[aa].h[6]\n" - "str q17, [%x[c_ptr], #0xd0]\n" - "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" - "str q25, [%x[c_ptr], #0xe0]\n" - "umlal v27.4s, %[b1].4h, %[aa].h[6]\n" - "str q10, [%x[c_ptr], #0xf0]\n" - "umlal v12.4s, %[b0].4h, %[aa].h[7]\n" - "str q18, [%x[c_ptr], #0x100]\n" - "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" - "str q26, [%x[c_ptr], #0x110]\n" - "umlal v28.4s, %[b1].4h, %[aa].h[7]\n" - "str q11, [%x[c_ptr], #0x120]\n" - - "4:\n" // End of function - "str q19, [%x[c_ptr], #0x130]\n" - "str q27, [%x[c_ptr], #0x140]\n" - "str q12, [%x[c_ptr], #0x150]\n" - "str q20, [%x[c_ptr], #0x160]\n" - "str q28, [%x[c_ptr], #0x170]\n" - "add %x[c_ptr], %x[c_ptr], #0x180\n" - : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), - [aa] "+w" (aa), [ab] "+w" (ab), [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2) - : [odd_k] "r" (odd_k) - : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc" - ); - } - } -} diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp deleted file mode 100644 index 62cd747d7c..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#ifdef __aarch64__ - -// Load the actual kernel -#include "a64_gemm_u8_12x8/generic.hpp" -#include "a64_gemm_u8_12x8/a55r1.hpp" - -class gemm_u8_12x8 { -public: - typedef uint8_t operand_type; - typedef uint32_t result_type; - - typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); - - /* Describes the data layout for A input */ - static const int A_interleave = 8; - static const int A_block = 4; - static const bool A_transpose = false; - - /* Same for B input */ - static const int B_interleave = 12; - static const int B_block = 4; - static const bool B_transpose = true; - - /* Kernel blocking parameters */ - static const int out_width = 12; - static const int out_height = 8; - static const int k_unroll = 4; - - kern_type kernel = nullptr; - - gemm_u8_12x8(const CPUInfo *ci) { - kernel = a64_gemm_u8_12x8; - if (ci->CPU == CPUTarget::A55_DOT) { - kernel = a64_gemm_u8_12x8_a55r1; - } - } -}; - -#endif // __aarch64__ - diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/a55r1.hpp deleted file mode 100644 index c7c2acbb49..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/a55r1.hpp +++ /dev/null @@ -1,396 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include -#include "dot_toolchain_support.h" -#include - -inline void a64_gemm_u8_12x8_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { - assert(Apanel); - assert(Bpanel); - assert(Cpanel); - const uint8_t *a_ptr = Apanel; - uint32_t *c_ptr = Cpanel; - // We divide K by 4 because the udot instruction processes 4 elements at a time. - const int W = K/4; - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. - const int oddk = (W & 1); - const int init_value_k = ((W+1)/2) - 1; - for (int yb=0; yb -#include "dot_toolchain_support.h" -#include - -inline void a64_gemm_u8_12x8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { - assert(Apanel); - assert(Bpanel); - assert(Cpanel); - const uint8_t *a_ptr = Apanel; - uint32_t *c_ptr = Cpanel; - // We divide K by 4 because the udot instruction processes 4 elements at a time. 
- const int W = K/4; - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. - const int oddk = (W & 1); - const int init_value_k = ((W+1)/2) - 1; - for (int yb=0; yb - -inline void a64_gemm_u8_4x4(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { - const uint8_t *a_ptr = Apanel; - uint32_t *c_ptr = Cpanel; - K /= 16; - - for (int yb=0; ybCPU == CPUTarget::A55_DOT) { - kernel = a64_hgemm_asimd_24x8_a55r1; - } - } - -}; - -#endif // __aarch64__ and FP16_VECTOR_ARITHMETIC diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/a55r1.hpp deleted file mode 100644 index 1789abb046..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/a55r1.hpp +++ /dev/null @@ -1,384 +0,0 @@ -/* - * Copyright (c) 201 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#include - -// Kernel implementation. -// -// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. -// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order. -// Assume that "Cpanel" points to a chunk of C output blocks (each size -// 12x8), the chunks being arranged in a row major fashion. -// -// Note that the intent of this is that either ablocks or bblocks will be 1 -// - this construction allows the output loop to proceed in either order. - -inline void a64_hgemm_asimd_24x8_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) { - const __fp16 *a_ptr = Apanel; - __fp16 *c_ptr = Cpanel; - - // Fix up for odd lengths - set a flag if K is odd, but make - // sure we round up the iteration count. - int oddk = (K & 1); - int k_iters = ((K+1)/2) - 1; - - for (int yb=0; yb - -// Kernel implementation. -// -// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. -// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order. -// Assume that "Cpanel" points to a chunk of C output blocks (each size -// 12x8), the chunks being arranged in a row major fashion. -// -// Note that the intent of this is that either ablocks or bblocks will be 1 -// - this construction allows the output loop to proceed in either order. 
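// Note: the A/B/C block sizes quoted in the kernel descriptions above appear to be
// carried over from the 12x8 SGEMM kernels. Going by the kernel name and the
// naming convention used elsewhere (sgemm_8x6, gemm_s8_12x8, ...), each output
// tile here is 24 results wide by 8 rows high, in half precision.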
- -inline void a64_hgemm_asimd_24x8(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) { - const __fp16 *a_ptr = Apanel; - __fp16 *c_ptr = Cpanel; - for (int yb=0; ybCPU == CPUTarget::A53) { - kernel = a64_sgemm_asimd_12x8_a53; - } - else if (ci->CPU == CPUTarget::A55) { - kernel = a64_sgemm_asimd_12x8_a55; - } - else if (ci->CPU == CPUTarget::A55_DOT) { - kernel = a64_sgemm_asimd_12x8_a55r1; - } - } -}; - -#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp deleted file mode 100644 index 1c9b4b38fc..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp +++ /dev/null @@ -1,368 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -inline void a64_sgemm_asimd_12x8_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { - const float *a_ptr = Apanel; - float *c_ptr = Cpanel; - - for (int yb=0; yb - -// Kernel implementation. -// -// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. -// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order. -// Assume that "Cpanel" points to a chunk of C output blocks (each size -// 12x8), the chunks being arranged in a row major fashion. -// -// Note that the intent of this is that either ablocks or bblocks will be 1 -// - this construction allows the output loop to proceed in either order. - -inline void a64_sgemm_asimd_12x8_jumps(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K, long int row_jump=0, long int block_jump=0) { - const float *a_ptr = Apanel; - float *c_ptr = Cpanel; - - for (int yb=0; yb -#include "asmlib.hpp" - -// Kernel implementation - transposed GEMV -// -// The kernel will process "M" rows of A (= steps of dot product) and "N" -// columns (= dot products total) -// -// General plan is to do as many columns simultaneously as possible - a -// reasonable limit is half the NEON regfile = 64 total accumulators. -// -// It's possible that messing around with sub-blocking M and N can yield -// higher performance, but that's left to the outer loop. In this kernel we -// process all of M at the same time. - - -// How far ahead to prefetch for the first and subsequent prefetches. 
-// These values work for A72 on JunoR2... - -#define FIRST_PFD 9 -#define PFD 6 - -inline void a64_sgemv_trans(const float *Astart, const float *Xstart, float *Ystart, float alpha, int lda, int M, int N) { - const float *a_ptr_base = Astart; - float *y_ptr = Ystart; - - register const float32x4_t va asm("v1") = vdupq_n_f32(alpha); - - int firstpfd=FIRST_PFD; - if (firstpfd > M) { - firstpfd = (M-1); - } - - int pfd = PFD; - if (pfd > M) { - pfd = (M-1); - } - - ptrdiff_t jump = lda * sizeof(int); - - for (;N>=96;N-=96) { - int k = M-1; - - const float *a_ptr = a_ptr_base; - const float *x_ptr = Xstart; - const float *pf_ptr = a_ptr; - const float *firstpf_ptr = a_ptr; - const float *pf_limit = a_ptr + (M * lda); - - for (int i=0; i0) { - // Handle N tail - up to 95 stragglers. - // This is 0-23 vectors, plus optionally an 64-bit vector and/or a - // single value for the remainder. - - // Independent pointers into the matrix for the odd 2 and odd 1. - // Double up as flag to indicate whether they are needed. - const float *odd2_aptr=NULL; - const float *odd1_aptr=NULL; - - // Figure out how much work we need to do. - int numvecs = N/4; - int rem = N%4; - int k=M; - - // Set up pointers for the odd 2/1 if needed. - if (rem >= 2) { - odd2_aptr = a_ptr_base + (numvecs * 4); - } - - if (rem & 1) { - odd1_aptr = a_ptr_base + (numvecs * 4) + (odd2_aptr==NULL ? 0 : 2); - } - - const float *a_ptr = a_ptr_base; - const float *firstpf_ptr = a_ptr_base; - const float *pf_ptr = a_ptr_base; - const float *pf_limit = a_ptr + (M * lda); - - const float *x_ptr = Xstart; - int vecs=0; // Working variable to count how many vectors to work on. - int dopf=1; // Track whether we are doing prefetches. - - // Figure out how many cache lines we need to prefetch each time. - int numpfs = (N + 15) / 16; - - // Do initial prefetches - for (int i=0; i 1) { - for (int i=0; i -void MergeResults(Tout *out, const Tin *in, int ldc, int y0, int ymax, int x0, int xmax, const Tout alpha, const Tout beta) { - int full_y_blocks = (ymax - y0) / height; - int y_remainder = (ymax - y0) % height; - int y_blocks = full_y_blocks + (y_remainder ? 1 : 0); - - int full_x_blocks = (xmax - x0) / width; - int x_remainder = (xmax - x0) % width; - int x_blocks = full_x_blocks + (x_remainder ? 1 : 0); - - for (int y_block = 0; y_block < y_blocks; y_block++) { - int ybase = y0 + (y_block * height); - - int fill_rows = (y_block < full_y_blocks) ? height : y_remainder; - - for (int x_block = 0; x_block < x_blocks; x_block++) { - int xbase = x0 + (x_block * width); - - int fill_cols = (x_block < full_x_blocks) ? width : x_remainder; - - for (int row=0; row < fill_rows; row++) { - for (int col=0; col < fill_cols; col++) { - Tout &p = out[(ybase + row) * ldc + xbase + col]; - - p = (p * alpha) + (beta * in[row * width + col]); - } - } - - in += (width * height); - } - } -} - -#include "merges/list.hpp" diff --git a/arm_compute/core/NEON/kernels/assembly/merges/a32_merge_float_8x6.hpp b/arm_compute/core/NEON/kernels/assembly/merges/a32_merge_float_8x6.hpp deleted file mode 100644 index ddd67e8ee2..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/merges/a32_merge_float_8x6.hpp +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. 
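Stepping back to the a64_sgemv_trans kernel above: stripped of prefetching and vectorisation, the arithmetic it performs is a transposed, strided matrix-vector product. The sketch below is a hypothetical scalar model, not the kernel itself; in particular, whether Y is overwritten or accumulated into is not visible in this excerpt, so a plain store is assumed.

    // Hypothetical scalar model of a64_sgemv_trans: M rows of A are the steps
    // of each dot product, N columns are the dot products, lda is the element
    // stride between rows.  Assumes Y is overwritten (not accumulated).
    static void sgemv_trans_reference(const float *A, const float *X, float *Y,
                                      float alpha, int lda, int M, int N) {
        for (int n = 0; n < N; n++) {
            float acc = 0.0f;
            for (int m = 0; m < M; m++) {
                acc += A[m * lda + n] * X[m];
            }
            Y[n] = alpha * acc;
        }
    }

The deleted kernel blocks this by 96 columns per outer iteration with explicit prefetching (FIRST_PFD and PFD rows ahead), and the long tail path handles the remaining 0 to 95 columns in progressively smaller chunks.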
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __arm__ - -#include "../asmlib.hpp" - -#include - -template<> -inline void MergeResults<8, 6>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta) { - const float *inptr = in; -// prefetch_6x(inptr); -// prefetch_6x(inptr + 96); - - float32x4_t av = vdupq_n_f32(alpha); - float32x4_t bv = vdupq_n_f32(beta); - - for (int y=y0; y= ymax) { - switch ((y + 5) - ymax) { - case 4: - outptr1 = dummyres; - case 3: - outptr2 = dummyres; - case 2: - outptr3 = dummyres; - case 1: - outptr4 = dummyres; - case 0: - outptr5 = dummyres; - default: - break; - } - } - - /* For ragged X, manually copy over the valid results. 
*/ - if ((i+7) >= xmax) { - for (int xi=0; xi<8; xi++) { - if ((i+xi) < xmax) { - *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta); - outptr0++; - *outptr1 = (alpha * inptr[xi + 8]) + (*outptr1 * beta); - outptr1++; - *outptr2 = (alpha * inptr[xi + 16]) + (*outptr2 * beta); - outptr2++; - *outptr3 = (alpha * inptr[xi + 24]) + (*outptr3 * beta); - outptr3++; - *outptr4 = (alpha * inptr[xi + 32]) + (*outptr4 * beta); - outptr4++; - *outptr5 = (alpha * inptr[xi + 40]) + (*outptr5 * beta); - outptr5++; - } - } - inptr += 48; - } else { - /* Optimized routine to copy an entire block */ - __asm __volatile ( - // Rows 0-1 - "VLD1.32 {d8-d11}, [%[outptr0]]\n" - "VMUL.f32 q4, q4, %q[bv]\n" - "VLD1.32 {d12-d15}, [%[outptr1]]\n" - "VMUL.f32 q5, q5, %q[bv]\n" - "VLD1.32 {d0-d3}, [%[inptr]]!\n" - "VMUL.f32 q6, q6, %q[bv]\n" - "VLD1.32 {d4-d7}, [%[inptr]]!\n" - "VMUL.f32 q7, q7, %q[bv]\n" - - "VMLA.f32 q4, q0, %q[av]\n" - ASM_PREFETCH("[%[inptr], #352]") - "VMLA.f32 q5, q1, %q[av]\n" - "VST1.32 {d8-d11}, [%[outptr0]]!\n" - ASM_PREFETCH("[%[inptr], #416]") - "VMLA.f32 q6, q2, %q[av]\n" - ASM_PREFETCH("[%[inptr], #480]") - "VMLA.f32 q7, q3, %q[av]\n" - "VST1.32 {d12-d15}, [%[outptr1]]!\n" - - // Rows 2-3 - "VLD1.32 {d8-d11}, [%[outptr2]]\n" - "VMUL.f32 q4, q4, %q[bv]\n" - "VLD1.32 {d12-d15}, [%[outptr3]]\n" - "VMUL.f32 q5, q5, %q[bv]\n" - "VLD1.32 {d0-d3}, [%[inptr]]!\n" - "VMUL.f32 q6, q6, %q[bv]\n" - "VLD1.32 {d4-d7}, [%[inptr]]!\n" - "VMUL.f32 q7, q7, %q[bv]\n" - - "VMLA.f32 q4, q0, %q[av]\n" - ASM_PREFETCH("[%[outptr0], #96]") - "VMLA.f32 q5, q1, %q[av]\n" - "VST1.32 {d8-d11}, [%[outptr2]]!\n" - ASM_PREFETCH("[%[outptr1], #96]") - "VMLA.f32 q6, q2, %q[av]\n" - ASM_PREFETCH("[%[outptr2], #96]") - "VMLA.f32 q7, q3, %q[av]\n" - "VST1.32 {d12-d15}, [%[outptr3]]!\n" - - // Rows 4-5 - "VLD1.32 {d8-d11}, [%[outptr4]]\n" - "VMUL.f32 q4, q4, %q[bv]\n" - "VLD1.32 {d12-d15}, [%[outptr5]]\n" - "VMUL.f32 q5, q5, %q[bv]\n" - "VLD1.32 {d0-d3}, [%[inptr]]!\n" - "VMUL.f32 q6, q6, %q[bv]\n" - "VLD1.32 {d4-d7}, [%[inptr]]!\n" - "VMUL.f32 q7, q7, %q[bv]\n" - - "VMLA.f32 q4, q0, %q[av]\n" - ASM_PREFETCH("[%[outptr3], #96]") - "VMLA.f32 q5, q1, %q[av]\n" - "VST1.32 {d8-d11}, [%[outptr4]]!\n" - ASM_PREFETCH("[%[outptr4], #96]") - "VMLA.f32 q6, q2, %q[av]\n" - ASM_PREFETCH("[%[outptr5], #128]") - "VMLA.f32 q7, q3, %q[av]\n" - "VST1.32 {d12-d15}, [%[outptr5]]!\n" - : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), - [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [inptr] "+r" (inptr) - : [av] "w" (av), [bv] "w" (bv) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7" - ); - } - } - } -} - -#endif // __arm__ diff --git a/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp deleted file mode 100644 index e8edddb4f4..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include "../asmlib.hpp" - -template<> -inline void MergeResults<12, 8>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta) { - const float *inptr = in; - prefetch_6x(inptr); - prefetch_6x(inptr + 96); - - float32x4_t av = vdupq_n_f32(alpha); - float32x4_t bv = vdupq_n_f32(beta); - - for (int y=y0; y= ymax) { - switch ((y + 7) - ymax) { - case 6: - outptr1 = dummyres; - case 5: - outptr2 = dummyres; - case 4: - outptr3 = dummyres; - case 3: - outptr4 = dummyres; - case 2: - outptr5 = dummyres; - case 1: - outptr6 = dummyres; - case 0: - outptr7 = dummyres; - default: - break; - } - } - - /* For ragged X, manually copy over the valid results. 
*/ - if ((i+11) >= xmax) { - for (int xi=0; xi<12; xi++) { - if ((i+xi) < xmax) { - *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta); - outptr0++; - *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta); - outptr1++; - *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta); - outptr2++; - *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta); - outptr3++; - *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta); - outptr4++; - *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta); - outptr5++; - *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta); - outptr6++; - *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta); - outptr7++; - } - } - inptr += 96; - } else { - /* Optimized routine to copy an entire block */ - __asm __volatile ( - // Rows 0-1 - "LDP q16, q17, [%[outptr0]]\n" - "FMUL v16.4s, v16.4s, %[bv].4s\n" - "LDR q18, [%[outptr0], #32]\n" - "FMUL v17.4s, v17.4s, %[bv].4s\n" - "LDP q19, q20, [%[outptr1]]\n" - "FMUL v18.4s, v18.4s, %[bv].4s\n" - "LDR q21, [%[outptr1], #32]\n" - ASM_PREFETCH("[%[inptr], #768]") - "FMUL v19.4s, v19.4s, %[bv].4s\n" - "LDP q0, q1, [%[inptr]]\n" - "FMUL v20.4s, v20.4s, %[bv].4s\n" - "LDP q2, q3, [%[inptr], #32]\n" - "FMUL v21.4s, v21.4s, %[bv].4s\n" - "LDP q4, q5, [%[inptr], #64]\n" - "FMLA v16.4s, v0.4s, %[av].4s\n" - ASM_PREFETCH("[%[inptr], #832]") - "FMLA v17.4s, v1.4s, %[av].4s\n" - "STP q16, q17, [%[outptr0]], #32\n" - "FMLA v18.4s, v2.4s, %[av].4s\n" - "STR q18, [%[outptr0]], #16\n" - "FMLA v19.4s, v3.4s, %[av].4s\n" - ASM_PREFETCH("[%[inptr], #896]") - "FMLA v20.4s, v4.4s, %[av].4s\n" - "STP q19, q20, [%[outptr1]], #32\n" - "FMLA v21.4s, v5.4s, %[av].4s\n" - "STR q21, [%[outptr1]], #16\n" - - // Rows 2-3 - "LDP q16, q17, [%[outptr2]]\n" - "FMUL v16.4s, v16.4s, %[bv].4s\n" - "LDR q18, [%[outptr2], #32]\n" - "FMUL v17.4s, v17.4s, %[bv].4s\n" - "LDP q19, q20, [%[outptr3]]\n" - "FMUL v18.4s, v18.4s, %[bv].4s\n" - "LDR q21, [%[outptr3], #32]\n" - ASM_PREFETCH("[%[inptr], #960]") - "FMUL v19.4s, v19.4s, %[bv].4s\n" - "LDP q0, q1, [%[inptr], #96]\n" - "FMUL v20.4s, v20.4s, %[bv].4s\n" - "LDP q2, q3, [%[inptr], #128]\n" - "FMUL v21.4s, v21.4s, %[bv].4s\n" - "LDP q4, q5, [%[inptr], #160]\n" - "FMLA v16.4s, v0.4s, %[av].4s\n" - ASM_PREFETCH("[%[inptr], #1024]") - "FMLA v17.4s, v1.4s, %[av].4s\n" - "STP q16, q17, [%[outptr2]], #32\n" - "FMLA v18.4s, v2.4s, %[av].4s\n" - "STR q18, [%[outptr2]], #16\n" - "FMLA v19.4s, v3.4s, %[av].4s\n" - ASM_PREFETCH("[%[inptr], #1088]") - "FMLA v20.4s, v4.4s, %[av].4s\n" - "STP q19, q20, [%[outptr3]], #32\n" - "FMLA v21.4s, v5.4s, %[av].4s\n" - "STR q21, [%[outptr3]], #16\n" - - // Rows 4-5 - ASM_PREFETCH("[%[outptr0], #80]") - "LDP q16, q17, [%[outptr4]]\n" - "FMUL v16.4s, v16.4s, %[bv].4s\n" - "LDR q18, [%[outptr4], #32]\n" - "FMUL v17.4s, v17.4s, %[bv].4s\n" - "LDP q19, q20, [%[outptr5]]\n" - "FMUL v18.4s, v18.4s, %[bv].4s\n" - "LDR q21, [%[outptr5], #32]\n" - ASM_PREFETCH("[%[outptr1], #80]") - "FMUL v19.4s, v19.4s, %[bv].4s\n" - "LDP q0, q1, [%[inptr], #192]\n" - "FMUL v20.4s, v20.4s, %[bv].4s\n" - "LDP q2, q3, [%[inptr], #224]\n" - "FMUL v21.4s, v21.4s, %[bv].4s\n" - "LDP q4, q5, [%[inptr], #256]\n" - "FMLA v16.4s, v0.4s, %[av].4s\n" - ASM_PREFETCH("[%[outptr2], #80]") - "FMLA v17.4s, v1.4s, %[av].4s\n" - "STP q16, q17, [%[outptr4]], #32\n" - "FMLA v18.4s, v2.4s, %[av].4s\n" - "STR q18, [%[outptr4]], #16\n" - "FMLA v19.4s, v3.4s, %[av].4s\n" - ASM_PREFETCH("[%[outptr3], #80]") - "FMLA v20.4s, v4.4s, %[av].4s\n" - "STP q19, q20, [%[outptr5]], #32\n" - "FMLA v21.4s, v5.4s, %[av].4s\n" - "STR q21, 
[%[outptr5]], #16\n" - - // Rows 6-7 - ASM_PREFETCH("[%[outptr4], #80]") - "LDP q16, q17, [%[outptr6]]\n" - "FMUL v16.4s, v16.4s, %[bv].4s\n" - "LDR q18, [%[outptr6], #32]\n" - "FMUL v17.4s, v17.4s, %[bv].4s\n" - "LDP q19, q20, [%[outptr7]]\n" - "FMUL v18.4s, v18.4s, %[bv].4s\n" - "LDR q21, [%[outptr7], #32]\n" - ASM_PREFETCH("[%[outptr5], #80]") - "FMUL v19.4s, v19.4s, %[bv].4s\n" - "LDP q0, q1, [%[inptr], #288]\n" - "FMUL v20.4s, v20.4s, %[bv].4s\n" - "LDP q2, q3, [%[inptr], #320]\n" - "FMUL v21.4s, v21.4s, %[bv].4s\n" - "LDP q4, q5, [%[inptr], #352]\n" - "FMLA v16.4s, v0.4s, %[av].4s\n" - ASM_PREFETCH("[%[outptr6], #128]") - "FMLA v17.4s, v1.4s, %[av].4s\n" - "STP q16, q17, [%[outptr6]], #32\n" - "FMLA v18.4s, v2.4s, %[av].4s\n" - "STR q18, [%[outptr6]], #16\n" - "FMLA v19.4s, v3.4s, %[av].4s\n" - ASM_PREFETCH("[%[outptr7], #128]") - "FMLA v20.4s, v4.4s, %[av].4s\n" - "STP q19, q20, [%[outptr7]], #32\n" - "FMLA v21.4s, v5.4s, %[av].4s\n" - "STR q21, [%[outptr7]], #16\n" - "ADD %[inptr], %[inptr], #384\n" - : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), - [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), - [inptr] "+r" (inptr) - : [av] "w" (av), [bv] "w" (bv) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21" - ); - } - } - } -} - -#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/merges/list.hpp b/arm_compute/core/NEON/kernels/assembly/merges/list.hpp deleted file mode 100644 index 29b915a75d..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/merges/list.hpp +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "a32_merge_float_8x6.hpp" -#include "a64_merge_float_12x8.hpp" -//#include "a64_merge_float_to_half_12x8.hpp" -//#include "a64_merge_half_24x8.hpp" -//#include "a64_merge_int32_12x8.hpp" diff --git a/arm_compute/core/NEON/kernels/assembly/newgemm_lib.hpp b/arm_compute/core/NEON/kernels/assembly/newgemm_lib.hpp new file mode 100644 index 0000000000..b7cc3d773b --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/newgemm_lib.hpp @@ -0,0 +1,410 @@ +/* + * Copyright (c) 2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +extern int l1_cache_size; +extern int l2_cache_size; +extern int force_cpu; + +#ifdef __ANDROID__ +inline unsigned long stoul( const std::string& str, std::size_t* pos = 0, int base = 10 ) +{ + char *end; + const unsigned long ret = strtoul( str.c_str(), &end, base); + *pos = end - str.c_str(); + return ret; +} +inline int stoi( const std::string& str, std::size_t* pos = 0, int base = 10 ) +{ + return atoi(str.c_str()); +} +#endif + + +#ifndef BARE_METAL +#include + +/* Get HWCAP bits from asm/hwcap.h */ +#include +#endif /* !BARE_METAL */ + +/* Make sure the bits we care about are defined, just in case asm/hwcap.h is + * out of date (or for bare metal mode) */ +#ifndef HWCAP_ASIMDHP +#define HWCAP_ASIMDHP (1 << 10) +#endif + +#ifndef HWCAP_CPUID +#define HWCAP_CPUID (1 << 11) +#endif + +#ifndef HWCAP_ASIMDDP +#define HWCAP_ASIMDDP (1 << 20) +#endif + +#define CPUINFO_HACK + +//unsigned int get_cpu_impl(); + + +/* CPU models - we only need to detect CPUs we have + * microarchitecture-specific code for. + * + * Architecture features are detected via HWCAPs. + */ +enum class CPUModel { + GENERIC = 0x0001, + A53 = 0x0010, + A55r0 = 0x0011, + A55r1 = 0x0012, +}; + +class CPUInfo +{ +private: + struct PerCPUData { + CPUModel model = CPUModel::GENERIC; + uint32_t midr = 0; + bool model_set = false; + }; + + std::vector _percpu={}; + + bool _cpuid = false; + bool _fp16 = false; + bool _dotprod = false; + + unsigned int L1_cache_size = 32768; + unsigned int L2_cache_size = 262144; + + /* Convert an MIDR register value to a CPUModel enum value. */ + CPUModel midr_to_model(const unsigned int midr) const { + CPUModel model; + + // Unpack variant and CPU ID + int variant = (midr >> 20) & 0xF; + int cpunum = (midr >> 4) & 0xFFF; + + /* Only CPUs we have code paths for are detected. All other CPUs + * can be safely classed as "GENERIC" + */ + + switch(cpunum) { + case 0xd03: + model = CPUModel::A53; + break; + + case 0xd05: + if (variant) { + model = CPUModel::A55r1; + } else { + model = CPUModel::A55r0; + } + break; + + default: + model = CPUModel::GENERIC; + break; + } + + return model; + } + + /* If the CPUID capability is present, MIDR information is provided in + /sys. Use that to populate the CPU model table. 
*/ + void populate_models_cpuid() { + for (unsigned long int i=0; i<_percpu.size(); i++) { + std::stringstream str; + str << "/sys/devices/system/cpu/cpu" << i << "/regs/identification/midr_el1"; + std::ifstream file; + + file.open(str.str(), std::ios::in); + + if (file.is_open()) { + std::string line; + + if (bool(getline(file, line))) { + const unsigned long midr = stoul(line, nullptr, 16); + + _percpu[i].midr = (midr & 0xffffffff); + _percpu[i].model = midr_to_model(_percpu[i].midr); + _percpu[i].model_set = true; + } + } + } + } + + /* If "long-form" cpuinfo is present, parse that to populate models. */ + void populate_models_cpuinfo() { + std::regex proc_regex("^processor.*(\\d+)$"); + std::regex imp_regex("^CPU implementer.*0x(..)$"); + std::regex var_regex("^CPU variant.*0x(.)$"); + std::regex part_regex("^CPU part.*0x(...)$"); + std::regex rev_regex("^CPU revision.*(\\d+)$"); + + std::ifstream file; + file.open("/proc/cpuinfo", std::ios::in); + + if (file.is_open()) { + std::string line; + int midr=0; + int curcpu=-1; + + while(bool(getline(file, line))) { + std::smatch match; + + if (std::regex_match(line, match, proc_regex)) { + std::string id = match[1]; + int newcpu=stoi(id, nullptr, 0); + + if (curcpu >= 0 && midr==0) { + // Matched a new CPU ID without any description of the previous one - looks like old format. + return; + } + + if (curcpu >= 0) { + _percpu[curcpu].midr = midr; + _percpu[curcpu].model = midr_to_model(midr); + _percpu[curcpu].model_set = true; + + printf("CPU %d: %x\n",curcpu,midr); + } + + midr=0; + curcpu=newcpu; + + continue; + } + + if (std::regex_match(line, match, imp_regex)) { + int impv = stoi(match[1], nullptr, 16); + midr |= (impv << 24); + continue; + } + + if (std::regex_match(line, match, var_regex)) { + int varv = stoi(match[1], nullptr, 16); + midr |= (varv << 16); + continue; + } + + if (std::regex_match(line, match, part_regex)) { + int partv = stoi(match[1], nullptr, 16); + midr |= (partv << 4); + continue; + } + + if (std::regex_match(line, match, rev_regex)) { + int regv = stoi(match[1], nullptr, 10); + midr |= (regv); + midr |= (0xf << 16); + continue; + } + } + + if (curcpu >= 0) { + _percpu[curcpu].midr = midr; + _percpu[curcpu].model = midr_to_model(midr); + _percpu[curcpu].model_set = true; + + printf("CPU %d: %x\n",curcpu,midr); + } + } + } + + /* Identify the maximum valid CPUID in the system. This reads + * /sys/devices/system/cpu/present to get the information. */ + int get_max_cpus() { + int max_cpus = 1; + +#ifndef BARE_METAL + std::ifstream CPUspresent; + CPUspresent.open("/sys/devices/system/cpu/present", std::ios::in); + bool success = false; + + if (CPUspresent.is_open()) { + std::string line; + + if (bool(getline(CPUspresent, line))) { + /* The content of this file is a list of ranges or single values, e.g. + * 0-5, or 1-3,5,7 or similar. As we are interested in the + * max valid ID, we just need to find the last valid + * delimiter ('-' or ',') and parse the integer immediately after that. 
+ */ + auto startfrom=line.begin(); + + for (auto i=line.begin(); i cpuid) { + _percpu[cpuid].model = model; + _percpu[cpuid].model_set = true; + } + } + + bool has_fp16() const { + return _fp16; + } + + bool has_dotprod() const { + return _dotprod; + } + + CPUModel get_cpu_model(unsigned long cpuid) const { + if (cpuid < _percpu.size()) { + return _percpu[cpuid].model; + } + + return CPUModel::GENERIC; + } + + CPUModel get_cpu_model() const { +#ifdef BARE_METAL + return get_cpu_model(0); +#else + return get_cpu_model(sched_getcpu()); +#endif + } + + unsigned int get_L1_cache_size() const { + return L1_cache_size; + } + + void set_L1_cache_size(unsigned int size) { + L1_cache_size = size; + } + + unsigned int get_L2_cache_size() const { + return L2_cache_size; + } + + void set_L2_cache_size(unsigned int size) { + L2_cache_size = size; + } +}; + +CPUInfo *get_CPUInfo(); diff --git a/arm_compute/core/NEON/kernels/assembly/profiler.hpp b/arm_compute/core/NEON/kernels/assembly/profiler.hpp deleted file mode 100644 index f7a1d1c70c..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/profiler.hpp +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
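Before moving on to the deleted profiler, a worked example of the MIDR unpacking performed by midr_to_model() further up may help. The sample value below is an assumption (a commonly quoted Cortex-A53 r0p4 MIDR) used purely for illustration; the shifts and masks are the ones the code above applies.

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t midr = 0x410FD034;  // assumed sample value, e.g. Cortex-A53 r0p4

        const unsigned implementer = (midr >> 24) & 0xFF;  // 0x41 (ARM), the field assembled from /proc/cpuinfo above
        const unsigned variant     = (midr >> 20) & 0xF;   // 0, only consulted for the 0xD05 (A55 r0 vs r1) case
        const unsigned part        = (midr >> 4)  & 0xFFF; // 0xD03, mapped to CPUModel::A53 by midr_to_model()
        const unsigned revision    = midr & 0xF;           // 4

        std::printf("implementer=0x%X variant=%u part=0x%X revision=%u\n",
                    implementer, variant, part, revision);
        return 0;
    }

Any part number without a dedicated code path falls back to CPUModel::GENERIC, which is why only A53 and the two A55 revisions appear in the switch.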
- */ -#pragma once - -#ifdef CYCLE_PROFILING - -#include "../perf.h" - -class profiler { -private: - static const int maxevents = 10000; - unsigned long times[maxevents]; - unsigned long units[maxevents]; - int events[maxevents]; - int currentevent; - int countfd; - -public: - profiler() { - currentevent=0; - countfd=open_cycle_counter(); - } - - ~profiler() { - close(countfd); - int tots[5]; - unsigned long counts[5]; - unsigned long tunits[5]; - const char * descs[] = { "Prepare A", "Prepare B", "Kernel", "Merge" }; - - for (int i=1; i<5; i++) { - tots[i] = 0; - counts[i] = 0; - tunits[i] = 0; - } - - printf("Profiled events:\n"); - for (int i=0; i - void operator() (int i, unsigned long u, T func) { - if (currentevent==maxevents) { - func(); - } else { - events[currentevent] = i; - units[currentevent] = u; - start_counter(countfd); - func(); - long long cycs = stop_counter(countfd); - times[currentevent++] = cycs; - } - } -}; - -#else - -class profiler { -public: - template - void operator() (int i, unsigned long u, T func) { - func(); - } -}; - -#endif - -#define PROFILE_PREPA 1 -#define PROFILE_PREPB 2 -#define PROFILE_KERNEL 3 -#define PROFILE_MERGE 4 - - diff --git a/arm_compute/core/NEON/kernels/assembly/transform.hpp b/arm_compute/core/NEON/kernels/assembly/transform.hpp deleted file mode 100644 index 717506f54c..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/transform.hpp +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -/* - * Generic transform. - * - * Assuming the untransposed case, this works by first reading - * consecutive values from the first input row. This same number of values - * are then read from the next rows. Now return to the first - * input row and repeat. - * - * Need to cope with the work requested in either dimension not actually - * being a multiple of the block sizes. - */ -template -struct TransformImpl { - template - static void Transform(TOut* out, const TIn* const in, const int stride, - const int y0, const int ymax, const int x0, const int xmax) { - const int n_whole_y_blocks = (ymax - y0) / IntBy; - const int y_remainders = (ymax - y0) % IntBy; - const int n_y_blocks = n_whole_y_blocks + (y_remainders ? 1 : 0); - - const int n_whole_x_blocks = (xmax - x0) / BlockBy; - const int x_remainders = (xmax - x0) % BlockBy; - const int n_x_blocks = n_whole_x_blocks + (x_remainders ? 
1 : 0); - - // "Y" loop: advance down the rows of the source IntBy rows at a time. - // Set up fill_rows to show the number rows to copy from, and blank_rows - // for the number of blank rows to add. - for (int y_block=0 ; y_block < n_y_blocks; y_block++) { - int fill_rows = (y_block < n_whole_y_blocks) ? IntBy : y_remainders; - int blank_rows = IntBy - fill_rows; - - int y_base = y0 + (y_block * IntBy); - - // So now advance along this block of rows, BlockBy columns at a time. - for (int x_block=0 ; x_block < n_x_blocks; x_block++) { - int fill_cols = (x_block < n_whole_x_blocks) ? BlockBy : x_remainders; - int blank_cols = BlockBy - fill_cols; - - int x_base = x0 + (x_block * BlockBy); - - for (int row = 0; row < fill_rows; row++) { - for (int col = 0; col < fill_cols; col++) { - // In-range copy. If it's transposed, we reverse the sense of rows and columns here. - if (Transposed) { - *out++ = static_cast(in[(x_base + col) * stride + y_base + row]); - } else { - *out++ = static_cast(in[(y_base + row) * stride + x_base + col]); - } - } - // "col" tail - row is in range but column is out of range. - for (int col=0; col < blank_cols; col++) { - *out++ = static_cast(0); - } - } - // "row" tail - row is out of range so fill with zeros always. - for (int row = 0; row < blank_rows; row++) { - for (int col=0; col < (fill_cols + blank_cols); col++) { - *out++ = static_cast(0); - } - } - } - } - } - - template - static inline void Transform(T* out, const T* const in, const int stride, - const int k0, const int kmax, const int x0, const int xmax) { - Transform(out, in, stride, k0, kmax, x0, xmax); - } -}; - -/*****************************************************************************/ -template -void Transform( - TOut* out, const TIn* const in, const int stride, - const int k0, const int kmax, const int x0, const int xmax -) { - // Redirect to a specialised implementation predicated on argument size. - TransformImpl::Transform( - out, in, stride, k0, kmax, x0, xmax - ); -} -/*****************************************************************************/ - -#include "transforms/list.hpp" diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a32_interleave_6way_32bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a32_interleave_6way_32bit.hpp deleted file mode 100644 index 4a1b5d2bf2..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/transforms/a32_interleave_6way_32bit.hpp +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
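Looking back at the generic TransformImpl above, before the per-architecture specialisations that follow: the zero-padded blocking it implements can be restated as a small standalone routine. This is a hypothetical simplification with made-up names, not the library template; interleave_by and block_by stand for the IntBy and BlockBy parameters, and only the untransposed case is shown.

    // Hypothetical, simplified restatement of the untransposed generic
    // transform: walk the source in interleave_by-row x block_by-column tiles,
    // copy what is in range, and zero-pad the ragged edges so every tile is
    // emitted at full size.
    #include <vector>

    template <typename TOut, typename TIn>
    void interleave_pad(std::vector<TOut> &out, const TIn *in, int stride,
                        int interleave_by, int block_by,
                        int y0, int ymax, int x0, int xmax) {
        for (int ybase = y0; ybase < ymax; ybase += interleave_by) {
            for (int xbase = x0; xbase < xmax; xbase += block_by) {
                for (int row = 0; row < interleave_by; row++) {
                    for (int col = 0; col < block_by; col++) {
                        const int y = ybase + row;
                        const int x = xbase + col;
                        // In-range copy, zero padding otherwise.
                        out.push_back((y < ymax && x < xmax)
                                          ? static_cast<TOut>(in[y * stride + x])
                                          : static_cast<TOut>(0));
                    }
                }
            }
        }
    }

The specialised NEON transforms that follow produce the same output ordering; they differ only in doing the copy with vector loads, zips and prefetches instead of the element-by-element loop.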
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __arm__ - -#include -#include "asmlib.hpp" - -template<> -template -inline void TransformImpl<6, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) { - uint32_t *outptr = reinterpret_cast(out); - const uint32_t *inptr = reinterpret_cast(in); - - uint32_t zerobuff[8]; - - for (int y=y0; y7;x-=8) { - /* Cope with ragged cases by copying from a buffer of zeroes instead */ - if ((y + 5) >= ymax) { - switch ((y + 5) - ymax) { - /* Everything falls through in here */ - case 4: - inptr1 = zerobuff; - case 3: - inptr2 = zerobuff; - case 2: - inptr3 = zerobuff; - case 1: - inptr4 = zerobuff; - case 0: - inptr5 = zerobuff; - default: - break; - } - } - - - __asm __volatile ( - // Load up 8 elements (2 vectors) from each of 8 sources. - "VLD1.32 {d0-d3}, [%[inptr0]]!\n" // q0=A0A1A2A3 - "VLD1.32 {d4-d7}, [%[inptr1]]!\n" // q2=B0B1B2B3 - "VLD1.32 {d8-d11}, [%[inptr2]]!\n" // q4=C0C1C2C3 - "VZIP.32 q0, q4\n" // q0=A0C0A1C1, q4 = A2C2A3C3 - "VLD1.32 {d12-d15}, [%[inptr3]]!\n" // q6=D0D1D2D3 - "VZIP.32 q2, q6\n" // q2=B0D0B1D1, q6 = B2D2B3D3 - "VLD1.32 {d16-d19}, [%[inptr4]]!\n" - "VLD1.32 {d20-d23}, [%[inptr5]]!\n" - "VZIP.32 q8, q10\n" // q8=E0F0E1F1, q10 = E2F2E3F3 - ASM_PREFETCH("[%[inptr0], #128]") - "VZIP.32 q0, q2\n" // q0 = A0B0C0D0, q2 = A1B1C1D1 - - // Store first elements - "VST1.32 {d0-d1}, [%[outptr]]!\n" - "VST1.32 {d16}, [%[outptr]]!\n" - - "VZIP.32 q4, q6\n" // q4 = A2B2C2D2, q6 = A3B3C3D3 - - // Store second elements - "VST1.32 {d4-d5}, [%[outptr]]!\n" - "VZIP.32 q1, q5\n" - ASM_PREFETCH("[%[inptr1], #128]") - "VST1.32 {d17}, [%[outptr]]!\n" - "VZIP.32 q3, q7\n" - - // Store third elements - "VZIP.32 q9, q11\n" - "VST1.32 {d8-d9}, [%[outptr]]!\n" - "VZIP.32 q1, q3\n" - ASM_PREFETCH("[%[inptr2], #128]") - "VST1.32 {d20}, [%[outptr]]!\n" - - // Store fourth elements - "VZIP.32 q5, q7\n" - "VST1.32 {d12-d13}, [%[outptr]]!\n" - ASM_PREFETCH("[%[inptr3], #128]") - "VST1.32 {d21}, [%[outptr]]!\n" - - // Fifth - "VST1.32 {d2-d3}, [%[outptr]]!\n" - ASM_PREFETCH("[%[inptr4], #128]") - "VST1.32 {d18}, [%[outptr]]!\n" - - // Sixth - "VST1.32 {d6-d7}, [%[outptr]]!\n" - ASM_PREFETCH("[%[inptr5], #128]") - "VST1.32 {d19}, [%[outptr]]!\n" - - // Seventh - "VST1.32 {d10-d11}, [%[outptr]]!\n" - "VST1.32 {d22}, [%[outptr]]!\n" - - // Eigth - "VST1.32 {d14-d15}, [%[outptr]]!\n" - "VST1.32 {d23}, [%[outptr]]!\n" - - : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), - [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [outptr] "+r" (outptr) - : - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12" - ); - } - - for (;x>0;x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - } - } -} - -#endif // __arm__ diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a32_transpose_interleave_8way_32bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a32_transpose_interleave_8way_32bit.hpp deleted file mode 100644 index a7e17fa074..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/transforms/a32_transpose_interleave_8way_32bit.hpp +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright 
(c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __arm__ - -#include "transpose_interleave_common.hpp" - -// Generic unblocked transposed 8x32-bit sized specialisation -template <> -template -inline void TransformImpl<8, 1, true, 4, 4>::Transform( - T* out, const T* const in, const int stride, - const int x0, const int xmax, const int k0, const int kmax -) { - // Redirect to a 16x uint16_t specialisation - TransformImpl<16, 1, true, 2, 2>::Transform( - reinterpret_cast(out), - reinterpret_cast(in), - stride*2, x0*2, xmax*2, k0, kmax - ); -} - -// Generic 12x16-bit sized specialisation -template <> -template -inline void TransformImpl<16, 1, true, 2, 2>::Transform( - T* out, const T* const in, const int stride, - const int x0, const int xmax, const int k0, const int kmax -) { - // Redirect to a uint16_t specialisation - Transform( - reinterpret_cast(out), - reinterpret_cast(in), - stride, x0, xmax, k0, kmax - ); -} - -// Specialised 16 x uint16_t version -template <> -inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) { - __asm volatile ( - "VLD1.32 {d0-d3}, [%[in0]]!\n" - "VST1.32 {d0-d3}, [%[out]]\n" - ASM_PREFETCH("[%[in0], #192]") - : [in0] "+r" (in0), - [out] "+r" (out) - : - : "q0", "q1", "memory" - ); -} - -template <> -inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out) { - __asm volatile ( - "VLD1.32 {d0-d3}, [%[in0]]!\n" - "VST1.32 {d0-d3}, [%[out]]!\n" - ASM_PREFETCH("[%[in0], #192]") - "VLD1.32 {d0-d3}, [%[in1]]!\n" - "VST1.32 {d0-d3}, [%[out]]\n" - ASM_PREFETCH("[%[in1], #192]") - "SUB %[out], %[out], #32\n" - : [in0] "+r" (in0), - [in1] "+r" (in1), - [out] "+r" (out) - : - : "q0", "q1", "memory" - ); -} - -template <> -inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) { - __asm __volatile ( - "VLD1.32 {d0-d3}, [%[in0]]!\n" - "VST1.32 {d0-d3}, [%[out]]!\n" - ASM_PREFETCH("[%[in0], #192]") - "VLD1.32 {d0-d3}, [%[in1]]!\n" - "VST1.32 {d0-d3}, [%[out]]!\n" - ASM_PREFETCH("[%[in1], #192]") - "VLD1.32 {d0-d3}, [%[in2]]!\n" - "VST1.32 {d0-d3}, [%[out]]!\n" - ASM_PREFETCH("[%[in2], #192]") - "VLD1.32 {d0-d3}, [%[in3]]!\n" - "VST1.32 {d0-d3}, [%[out]]\n" - ASM_PREFETCH("[%[in3], #192]") - "SUB 
%[out], %[out], #96\n" - : [in0] "+r" (in0), - [in1] "+r" (in1), - [in2] "+r" (in2), - [in3] "+r" (in3), - [out] "+r" (out) - : - : "q0", "q1", "memory" - ); -} - -template <> -template <> -inline void TransformImpl<16, 1, true, 2, 2>::Transform( - uint16_t* out, const uint16_t* const in, const int stride, - const int x0, const int xmax, const int k0, const int kmax -) { - TransposeInterleaveCommon<16, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax); -} - -#endif // __arm__ diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_block16_interleave4_8bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_block16_interleave4_8bit.hpp deleted file mode 100644 index ac84567b54..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/transforms/a64_block16_interleave4_8bit.hpp +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include -#include "asmlib.hpp" - -template<> -template -inline void TransformImpl<4, 16, false, 1, 1>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) { - uint8_t *outptr = (uint8_t *)out; - const uint8_t *inptr = (uint8_t *)in; - - uint8_t zerobuff[16]; - - for (int y=y0; y15;x-=16) { - /* Cope with ragged cases by copying from a buffer of zeroes instead */ - if ((y + 3) >= ymax) { - switch ((y + 3) - ymax) { - /* Everything falls through in here */ - case 2: - inptr1 = zerobuff; - case 1: - inptr2 = zerobuff; - case 0: - inptr3 = zerobuff; - default: - break; - } - } - - __asm __volatile ( - "LDR q0, [%[inptr0]], #16\n" - ASM_PREFETCH("[%[inptr0], #176]") - "LDR q1, [%[inptr1]], #16\n" - ASM_PREFETCH("[%[inptr1], #176]") - "STP q0, q1, [%[outptr]], #32\n" - "LDR q0, [%[inptr2]], #16\n" - ASM_PREFETCH("[%[inptr2], #176]") - "LDR q1, [%[inptr3]], #16\n" - ASM_PREFETCH("[%[inptr3], #176]") - "STP q0, q1, [%[outptr]], #32\n" - : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), - [outptr] "+r" (outptr) - : - : "v0", "v1" - ); - } - - if (x>0) { - /* Need to duplicate this here, in case we didn't run the main loop. 
*/ - if ((y + 3) >= ymax) { - switch ((y + 3) - ymax) { - /* Everything falls through in here */ - case 2: - inptr1 = zerobuff; - case 1: - inptr2 = zerobuff; - case 0: - inptr3 = zerobuff; - default: - break; - } - } - - /* We have to write out 16 values, copy as many legal values as there are and pad with 0 */ - auto f = [&outptr, x](const uint8_t *&p) { - for (int i=0; i<16; i++) { - if (i < x) { - *outptr++ = *p++; - } else { - *outptr++ = 0; - } - } - }; - - f(inptr0); - f(inptr1); - f(inptr2); - f(inptr3); - } - } -} - -#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_16bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_16bit.hpp deleted file mode 100644 index bdc05473b4..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_16bit.hpp +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include -#include "asmlib.hpp" - -template<> -template -void TransformImpl<8, 1, false, 2, 2>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) { - uint16_t *outptr = (uint16_t *)out; - const uint16_t *inptr = (const uint16_t *)in; - - uint16_t zerobuff[24]; - - for (int y=y0; y7;x-=8) { - /* Cope with ragged cases by copying from a buffer of zeroes instead */ - if ((y + 7) >= ymax) { - switch ((y + 7) - ymax) { - /* Everything falls through in here */ - case 6: - inptr1 = zerobuff; - case 5: - inptr2 = zerobuff; - case 4: - inptr3 = zerobuff; - case 3: - inptr4 = zerobuff; - case 2: - inptr5 = zerobuff; - case 1: - inptr6 = zerobuff; - case 0: - inptr7 = zerobuff; - } - } - - int skippf = (x & 31); - __asm __volatile ( - // Load up 8 elements (1 vector) from each of 8 sources. - "CBNZ %w[skippf], 1f\n" - ASM_PREFETCH("[%[inptr0], #128]") - ASM_PREFETCH("[%[inptr1], #128]") - ASM_PREFETCH("[%[inptr2], #128]") - ASM_PREFETCH("[%[inptr3], #128]") - "1:\n" - - "LDR q0, [%[inptr0]], #16\n" // q0=A0A1A2A3A4A5A6A7 - "LDR q4, [%[inptr4]], #16\n" // q8=E0E1E2E3E4E5E6E7 - "LDR q2, [%[inptr2]], #16\n" // q4=C0C1C2C3... 
- "LDR q6, [%[inptr6]], #16\n" - "ZIP1 v8.8h, v0.8h, v4.8h\n" // q8=A0E0A1E1A2E2A3E3 - "ZIP2 v16.8h, v0.8h, v4.8h\n" // q16=A4E4A5E5A6E6A7E7 - "ZIP1 v9.8h, v2.8h, v6.8h\n" // q9=C0G0C1G1C2G2C3G3 - "ZIP2 v17.8h, v2.8h, v6.8h\n" // q17=C4G4C5G5C6G6C7G7 - "LDR q1, [%[inptr1]], #16\n" // q1=B0B1B2B3B4B5B6B7 - "LDR q5, [%[inptr5]], #16\n" - "LDR q3, [%[inptr3]], #16\n" // q3=D0D1D2D3.... - "LDR q7, [%[inptr7]], #16\n" - "ZIP1 v10.8h, v1.8h, v5.8h\n" // q18=B0F0B1F1B2F2B3F3 - "ZIP2 v18.8h, v1.8h, v5.8h\n" // q18=B4F4B5F5B6F6B7F7 - "ZIP1 v11.8h, v3.8h, v7.8h\n" // q19=D0H0D1H1D2H2D3H3 - "ZIP2 v19.8h, v3.8h, v7.8h\n" // q19=D4H4D5H5D6H6D7H7 - - "ZIP1 v12.8h, v8.8h, v9.8h\n" // q20=A0C0E0G0A1C1E1G1 - "ZIP2 v20.8h, v8.8h, v9.8h\n" - "ZIP1 v13.8h, v10.8h, v11.8h\n" // q21=B0D0F0H0B1I1F1H1 - "ZIP2 v21.8h, v10.8h, v11.8h\n" - - "CBNZ %w[skippf], 2f\n" - ASM_PREFETCH("[%[inptr4], #112]") - ASM_PREFETCH("[%[inptr5], #112]") - ASM_PREFETCH("[%[inptr6], #112]") - ASM_PREFETCH("[%[inptr7], #112]") - "2:\n" - - "ZIP1 v22.8h, v16.8h, v17.8h\n" - "ZIP2 v30.8h, v16.8h, v17.8h\n" - "ZIP1 v23.8h, v18.8h, v19.8h\n" - "ZIP2 v31.8h, v18.8h, v19.8h\n" - - "ZIP1 v14.8h, v12.8h, v13.8h\n" // q22=A0B0C0D0E0F0G0H0 - "ZIP2 v15.8h, v12.8h, v13.8h\n" // q23=A1B1C1D1E1F1G1H1 - "STP q14, q15, [%[outptr]], #32\n" // Write back first two elements - - "ZIP1 v0.8h, v20.8h, v21.8h\n" - "ZIP2 v1.8h, v20.8h, v21.8h\n" - "STP q0, q1, [%[outptr]], #32\n" // Write back next two elements - - "ZIP1 v2.8h, v22.8h, v23.8h\n" - "ZIP2 v3.8h, v22.8h, v23.8h\n" - "STP q2, q3, [%[outptr]], #32\n" // Write back next two elements - - "ZIP1 v4.8h, v30.8h, v31.8h\n" - "ZIP2 v5.8h, v30.8h, v31.8h\n" - "STP q4, q5, [%[outptr]], #32\n" // Write back last two elements - : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), - [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr) - : [skippf] "r" (skippf) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", - "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", - "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - } - - for (;x>0;x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - *outptr++ = *inptr6++; - *outptr++ = *inptr7++; - } - } -} - -#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp deleted file mode 100644 index bd5125afab..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include -#include "asmlib.hpp" - -template<> -template -inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) { - uint32_t *outptr = (uint32_t *)out; - const uint32_t *inptr = (uint32_t *)in; - - uint32_t zerobuff[8]; - - for (int y=y0; y7;x-=8) { - /* Cope with ragged cases by copying from a buffer of zeroes instead */ - if ((y + 7) >= ymax) { - switch ((y + 7) - ymax) { - /* Everything falls through in here */ - case 6: - inptr1 = zerobuff; - case 5: - inptr2 = zerobuff; - case 4: - inptr3 = zerobuff; - case 3: - inptr4 = zerobuff; - case 2: - inptr5 = zerobuff; - case 1: - inptr6 = zerobuff; - case 0: - inptr7 = zerobuff; - default: - break; - } - } - - __asm __volatile ( - // Load up 8 elements (2 vectors) from each of 8 sources. - "LDP q0, q1, [%[inptr0]], #32\n" // q0=A0A1A2A3 - "LDP q2, q3, [%[inptr1]], #32\n" // q2=B0B1B2B3 - "LDP q4, q5, [%[inptr2]], #32\n" // q4=C0C1C2C3 - "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1 - ASM_PREFETCH("[%[inptr0], #128]") - "LDP q6, q7, [%[inptr3]], #32\n" // q6=D0D1D2D3 - "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1 - "LDP q8, q9, [%[inptr4]], #32\n" - "LDP q10, q11, [%[inptr5]], #32\n" - "LDP q12, q13, [%[inptr6]], #32\n" - "ZIP1 v18.4s, v8.4s, v12.4s\n" - ASM_PREFETCH("[%[inptr1], #128]") - "LDP q14, q15, [%[inptr7]], #32\n" - "ZIP1 v19.4s, v10.4s, v14.4s\n" - - "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0 - ASM_PREFETCH("[%[inptr2], #128]") - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP2 v16.4s, v0.4s, v4.4s\n" - ASM_PREFETCH("[%[inptr3], #128]") - "ZIP2 v17.4s, v2.4s, v6.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source - - "ZIP2 v18.4s, v8.4s, v12.4s\n" - "ZIP2 v19.4s, v10.4s, v14.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - ASM_PREFETCH("[%[inptr4], #128]") - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP1 v16.4s, v1.4s, v5.4s\n" - ASM_PREFETCH("[%[inptr5], #128]") - "ZIP1 v17.4s, v3.4s, v7.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Third element - - "ZIP1 v18.4s, v9.4s, v13.4s\n" - "ZIP1 v19.4s, v11.4s, v15.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Fourth element - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - ASM_PREFETCH("[%[inptr6], #128]") - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP2 v16.4s, v1.4s, v5.4s\n" - "ZIP2 v17.4s, v3.4s, v7.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Fifth element - - "ZIP2 v18.4s, v9.4s, v13.4s\n" - ASM_PREFETCH("[%[inptr7], #128]") - "ZIP2 v19.4s, v11.4s, v15.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Sixth element - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Seventh element - - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, 
v19.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Eighth element - : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), - [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", - "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - ); - } - - for (;x>0;x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - *outptr++ = *inptr6++; - *outptr++ = *inptr7++; - } - } -} - -#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_half_to_float.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_half_to_float.hpp deleted file mode 100644 index 3c9e05223d..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_half_to_float.hpp +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#if defined( __aarch64__) && defined( __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - -#include -#include "asmlib.hpp" - -template<> -template<> -inline void TransformImpl<8, 1, false, 4, 2>::Transform(float *out, const __fp16 *in, int ldin, int y0, int ymax, int k0, int kmax) { - float *outptr = out; - const __fp16 *inptr = in; - - __fp16 zerobuff[8]; - - for (int y=y0; y7;x-=8) { - /* Cope with ragged cases by copying from a buffer of zeroes instead */ - if ((y + 7) >= ymax) { - switch ((y + 7) - ymax) { - /* Everything falls through in here */ - case 6: - inptr1 = zerobuff; - case 5: - inptr2 = zerobuff; - case 4: - inptr3 = zerobuff; - case 3: - inptr4 = zerobuff; - case 2: - inptr5 = zerobuff; - case 1: - inptr6 = zerobuff; - case 0: - inptr7 = zerobuff; - default: - break; - } - } - - __asm __volatile ( - // Load up 8 elements (2 vectors) from each of 8 sources. 
- "LDR q0, [%[inptr0]], #16\n" - "LDR q2, [%[inptr1]], #16\n" - "FCVTL2 v1.4s, v0.8h\n" - "FCVTL v0.4s, v0.4h\n" - "LDR q4, [%[inptr2]], #16\n" // q4=C0C1C2C3 - "FCVTL2 v3.4s, v2.8h\n" - "FCVTL v2.4s, v2.4h\n" - "FCVTL2 v5.4s, v4.8h\n" - "FCVTL v4.4s, v4.4h\n" - "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1 - ASM_PREFETCH("[%[inptr0], #128]") - "LDR q6, [%[inptr3]], #16\n" // q6=D0D1D2D3 - "FCVTL2 v7.4s, v6.8h\n" - "FCVTL v6.4s, v6.4h\n" - "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1 - "LDR q8, [%[inptr4]], #16\n" - "LDR q10, [%[inptr5]], #16\n" - "FCVTL2 v9.4s, v8.8h\n" - "FCVTL v8.4s, v8.4h\n" - ASM_PREFETCH("[%[inptr1], #128]") - "LDR q12, [%[inptr6]], #16\n" - "FCVTL2 v11.4s, v10.8h\n" - "FCVTL v10.4s, v10.4h\n" - "FCVTL2 v13.4s, v12.8h\n" - "FCVTL v12.4s, v12.4h\n" - "ZIP1 v18.4s, v8.4s, v12.4s\n" - "LDR q14, [%[inptr7]], #16\n" - "FCVTL2 v15.4s, v14.8h\n" - "FCVTL v14.4s, v14.4h\n" - "ZIP1 v19.4s, v10.4s, v14.4s\n" - - ASM_PREFETCH("[%[inptr2], #128]") - "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0 - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - ASM_PREFETCH("[%[inptr3], #128]") - - "ZIP2 v16.4s, v0.4s, v4.4s\n" - "ZIP2 v17.4s, v2.4s, v6.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source - - "ZIP2 v18.4s, v8.4s, v12.4s\n" - ASM_PREFETCH("[%[inptr4], #128]") - "ZIP2 v19.4s, v10.4s, v14.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - ASM_PREFETCH("[%[inptr5], #128]") - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP1 v16.4s, v1.4s, v5.4s\n" - "ZIP1 v17.4s, v3.4s, v7.4s\n" - ASM_PREFETCH("[%[inptr6], #128]") - "STP q20, q21, [%[outptr]], #32\n" // Third element - - "ZIP1 v18.4s, v9.4s, v13.4s\n" - "ZIP1 v19.4s, v11.4s, v15.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Fourth element - ASM_PREFETCH("[%[inptr7], #128]") - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP2 v16.4s, v1.4s, v5.4s\n" - "ZIP2 v17.4s, v3.4s, v7.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Fifth element - - "ZIP2 v18.4s, v9.4s, v13.4s\n" - "ZIP2 v19.4s, v11.4s, v15.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Sixth element - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Seventh element - - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Eighth element - : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), - [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", - "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - ); - } - - for (;x>0;x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - *outptr++ = *inptr6++; - *outptr++ = *inptr7++; - } - } -} - -#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_16bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_16bit.hpp deleted file mode 100644 index 6e07064a0c..0000000000 --- 
a/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_16bit.hpp +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include "transpose_interleave_common.hpp" - -// Generic unblocked transposed 6x32-bit sized specialisation -template <> -template -inline void TransformImpl<6, 1, true, 4, 4>::Transform( - T* out, const T* const in, const int stride, - const int x0, const int xmax, const int k0, const int kmax -) { - // Redirect to a 12 x uint16_t specialisation - TransformImpl<12, 1, true, 2, 2>::Transform( - reinterpret_cast(out), - reinterpret_cast(in), - stride*2, x0*2, xmax*2, k0, kmax - ); -} - -// Generic 12x16-bit sized specialisation -template <> -template -inline void TransformImpl<12, 1, true, 2, 2>::Transform( - T* out, const T* const in, const int stride, - const int x0, const int xmax, const int k0, const int kmax -) { - // Redirect to a uint16_t specialisation - Transform( - reinterpret_cast(out), - reinterpret_cast(in), - stride, x0, xmax, k0, kmax - ); -} - -// Specialised 12 x uint16_t version -template <> -inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) { - __asm volatile ( - "LDR q0, [%[in0]]\n" - "STR q0, [%[out]]\n" - "LDR d1, [%[in0], #0x10]\n" - "STR d1, [%[out], #0x10]\n" - "ADD %x[in0], %x[in0], #0x18\n" - ASM_PREFETCH("[%[in0], #192]") - : [in0] "+r" (in0), - [out] "+r" (out) - : - : "v0", "v1", "memory" - ); -} - -template <> -inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out) { - __asm volatile ( - "LDR q0, [%[in0]]\n" - "LDR d1, [%[in0], #0x10]\n" - "ADD %x[in0], %x[in0], #0x18\n" - ASM_PREFETCH("[%[in0], #192]") - - "LDR x21, [%[in1]]\n" - "LDR q2, [%[in1], #0x08]\n" - "INS v1.d[1], x21\n" - "ADD %x[in1], %x[in1], #0x18\n" - "STP q0, q1, [%[out]]\n" - "STR q2, [%x[out], #0x20]\n" - ASM_PREFETCH("[%[in1], #192]") - : [in0] "+r" (in0), - [in1] "+r" (in1), - [out] "+r" (out) - : - : "x21", "v0", "v1", "v2", "memory" - ); -} - -template <> -inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) { - __asm __volatile ( - "LDR q0, [%x[in0]], #0x10\n" - "STR q0, [%x[out]]\n" - "LDR d1, 
[%x[in0]], #0x08\n" - ASM_PREFETCH("[%[in0], #192]") - "STR d1, [%x[out], #0x10]\n" - - "LDR q0, [%x[in1]], #0x10\n" - "STR q0, [%x[out], #0x18]\n" - "LDR d1, [%x[in1]], #0x08\n" - ASM_PREFETCH("[%[in1], #192]") - "STR d1, [%x[out], #0x28]\n" - - "LDR q0, [%x[in2]], #0x10\n" - "STR q0, [%x[out], #0x30]\n" - "LDR d1, [%x[in2]], #0x08\n" - ASM_PREFETCH("[%[in2], #192]") - "STR d1, [%x[out], #0x40]\n" - - "LDR q0, [%x[in3]], #0x10\n" - "STR q0, [%x[out], #0x48]\n" - "LDR d1, [%x[in3]], #0x08\n" - ASM_PREFETCH("[%[in3], #192]") - "STR d1, [%x[out], #0x58]\n" - : [in0] "+r" (in0), - [in1] "+r" (in1), - [in2] "+r" (in2), - [in3] "+r" (in3), - [out] "+r" (out) - : - : "v0", "v1", "memory" - ); -} - -template <> -template <> -inline void TransformImpl<12, 1, true, 2, 2>::Transform( - uint16_t* out, const uint16_t* const in, const int stride, - const int x0, const int xmax, const int k0, const int kmax -) { - TransposeInterleaveCommon<12, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax); -} - -#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_half_to_float.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_half_to_float.hpp deleted file mode 100644 index 835e4d87aa..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_half_to_float.hpp +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#if defined( __aarch64__) && defined( __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - -#include "transpose_interleave_common.hpp" - -template <> -inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x1(const __fp16 *&in0, float *out) { - __asm __volatile ( - "LDR q0, [%[in0]], #16\n" - "FCVTL2 v1.4s, v0.8h\n" - "FCVTL v0.4s, v0.4h\n" - "STP q0, q1, [%[out]]\n" - ASM_PREFETCH("[%[in0], #192]") - "LDR d2, [%[in0]], #8\n" - "FCVTL v2.4s, v2.4h\n" - "STR q2, [%[out], #32]\n" - : [in0] "+r" (in0), [out] "+r" (out) - : - : "v0", "v1", "v2", "memory" - ); -} - -template <> -inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x2(const __fp16 *&in0, const __fp16 *&in1, float *out) { - __asm __volatile ( - "LDR q0, [%[in0]], #16\n" - "FCVTL2 v1.4s, v0.8h\n" - "FCVTL v0.4s, v0.4h\n" - "STP q0, q1, [%[out]]\n" - ASM_PREFETCH("[%[in0], #192]") - "LDR d2, [%[in0]], #8\n" - "FCVTL v2.4s, v2.4h\n" - "LDR q3, [%[in1]], #16\n" - "FCVTL2 v4.4s, v3.8h\n" - "FCVTL v3.4s, v3.4h\n" - "STP q2, q3, [%[out], #32]\n" - ASM_PREFETCH("[%[in1], #192]") - "LDR d5, [%[in1]], #16\n" - "FCVTL v5.4s, v5.4h\n" - "STP q4, q5, [%[out], #64]\n" - : [in0] "+r" (in0), [in1] "+r" (in1), [out] "+r" (out) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "memory" - ); -} - -template <> -inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x4(const __fp16 *&in0, const __fp16 *&in1, const __fp16 *&in2, const __fp16 *&in3, float *out) { - __asm __volatile ( - "LDR q0, [%[in0]], #16\n" - "FCVTL2 v1.4s, v0.8h\n" - "FCVTL v0.4s, v0.4h\n" - "STP q0, q1, [%[out]]\n" - "LDR d2, [%[in0]], #8\n" - ASM_PREFETCH("[%[in0], #192]") - "FCVTL v2.4s, v2.4h\n" - "LDR q3, [%[in1]], #16\n" - "FCVTL2 v4.4s, v3.8h\n" - "FCVTL v3.4s, v3.4h\n" - "STP q2, q3, [%[out], #32]\n" - "LDR d5, [%[in1]], #8\n" - "FCVTL v5.4s, v5.4h\n" - ASM_PREFETCH("[%[in1], #192]") - "STP q4, q5, [%[out], #64]\n" - "LDR q6, [%[in2]], #16\n" - "FCVTL2 v7.4s, v6.8h\n" - "FCVTL v6.4s, v6.4h\n" - "STP q6, q7, [%[out], #96]\n" - "LDR d8, [%[in2]], #8\n" - "FCVTL v8.4s, v8.4h\n" - ASM_PREFETCH("[%[in2], #192]") - "LDR q9, [%[in3]], #16\n" - "FCVTL2 v10.4s, v9.8h\n" - "FCVTL v9.4s, v9.4h\n" - "STP q8, q9, [%[out], #128]\n" - "LDR d11, [%[in3]], #8\n" - "FCVTL v11.4s, v11.4h\n" - "STP q10, q11, [%[out], #160]\n" - ASM_PREFETCH("[%[in3], #192]") - - : [in0] "+r" (in0), [in1] "+r" (in1), [in2] "+r" (in2), [in3] "+r" (in3), [out] "+r" (out) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory" - ); -} - -template <> -template <> -inline void TransformImpl<12, 1, true, 4, 2>::Transform( - float* out, const __fp16* const in, const int stride, - const int x0, const int xmax, const int k0, const int kmax -) { - TransposeInterleaveCommon<12, __fp16, float>::Transform(out, in, stride, x0, xmax, k0, kmax); -} - -#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_24way_16bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_24way_16bit.hpp deleted file mode 100644 index b6565baa23..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_24way_16bit.hpp +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include "transpose_interleave_common.hpp" - -// Generic unblocked transposed 12x32-bit sized specialisation -template <> -template -inline void TransformImpl<12, 1, true, 4, 4>::Transform( - T* out, const T* const in, const int stride, - const int x0, const int xmax, const int k0, const int kmax -) { - // Redirect to a 24 x uint16_t specialisation - TransformImpl<24, 1, true, 2, 2>::Transform( - reinterpret_cast(out), - reinterpret_cast(in), - stride*2, x0*2, xmax*2, k0, kmax - ); -} - -// Generic 24x16-bit sized specialisation -template <> -template -inline void TransformImpl<24, 1, true, 2, 2>::Transform( - T* out, const T* const in, const int stride, - const int x0, const int xmax, const int k0, const int kmax -) { - // Redirect to a uint16_t specialisation - Transform( - reinterpret_cast(out), - reinterpret_cast(in), - stride, x0, xmax, k0, kmax - ); -} - -// Specialised 24 x uint16_t version -template <> -inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) { - __asm __volatile ( - "LDP q0, q1, [%[in0]], #32\n" - "STP q0, q1, [%[out]]\n" - ASM_PREFETCH("[%[in0], #192]") - "LDR q2, [%[in0]], #16\n" - "STR q2, [%[out], #32]\n" - : [in0] "+r" (in0), [out] "+r" (out) - : - : "v0", "v1", "v2", "memory" - ); -} - -template <> -inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1,uint16_t *out) { - __asm __volatile ( - "LDP q0, q1, [%[in0]], #32\n" - "STP q0, q1, [%[out]]\n" - ASM_PREFETCH("[%[in0], #192]") - "LDR q2, [%[in0]], #16\n" - "LDP q3, q4, [%[in1]], #32\n" - "STP q2, q3, [%[out], #32]\n" - ASM_PREFETCH("[%[in1], #192]") - "LDR q5, [%[in1]], #16\n" - "STP q4, q5, [%[out], #64]\n" - : [in0] "+r" (in0), [in1] "+r" (in1), [out] "+r" (out) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "memory" - ); -} - -template <> -inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) { - __asm __volatile ( - "LDP q0, q1, [%[in0]], #32\n" - "STP q0, q1, [%[out]]\n" - "LDR q2, [%[in0]], #16\n" - ASM_PREFETCH("[%[in0], #192]") - "LDP q3, q4, [%[in1]], #32\n" - "STP q2, q3, [%[out], #32]\n" - "LDR q5, [%[in1]], #16\n" - ASM_PREFETCH("[%[in1], #192]") - "STP q4, q5, [%[out], #64]\n" - "LDP q6, q7, 
[%[in2]], #32\n" - "STP q6, q7, [%[out], #96]\n" - "LDR q8, [%[in2]], #16\n" - ASM_PREFETCH("[%[in2], #192]") - "LDP q9, q10, [%[in3]], #32\n" - "STP q8, q9, [%[out], #128]\n" - "LDR q11, [%[in3]], #16\n" - "STP q10, q11, [%[out], #160]\n" - ASM_PREFETCH("[%[in3], #192]") - - : [in0] "+r" (in0), [in1] "+r" (in1), [in2] "+r" (in2), [in3] "+r" (in3), [out] "+r" (out) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory" - ); -} - -template <> -template <> -inline void TransformImpl<24, 1, true, 2, 2>::Transform( - uint16_t* out, const uint16_t* const in, const int stride, - const int x0, const int xmax, const int k0, const int kmax -) { - TransposeInterleaveCommon<24, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax); -} - -#endif // __arch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp deleted file mode 100644 index 8ad5b857fb..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "a32_interleave_6way_32bit.hpp" -#include "a32_transpose_interleave_8way_32bit.hpp" -#include "a64_block16_interleave4_8bit.hpp" -#include "a64_interleave_8way_16bit.hpp" -#include "a64_interleave_8way_32bit.hpp" -#include "a64_interleave_8way_half_to_float.hpp" -#include "a64_transpose_interleave_12way_16bit.hpp" -#include "a64_transpose_interleave_12way_half_to_float.hpp" -#include "a64_transpose_interleave_24way_16bit.hpp" -#include "transpose_interleave_common.hpp" diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp deleted file mode 100644 index 231b3f181e..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -template -struct TransposeInterleaveCommon { - // Override the moveblock_1xY methods to improve performance - static inline void moveblock_1x1(const TIn *&in0, TOut *out) { - for (unsigned int i = 0; i < IntBy; i++) { - *out++ = static_cast(*in0++); - } - } - - static inline void moveblock_1x2(const TIn *&in0, const TIn *&in1, TOut *out) { - for (unsigned int i = 0; i < IntBy; i++) { - *out++ = static_cast(*in0++); - } - for (unsigned int i = 0; i < IntBy; i++) { - *out++ = static_cast(*in1++); - } - } - - static inline void moveblock_1x4(const TIn *&in0, const TIn *&in1, const TIn *&in2, const TIn *&in3, TOut *out) { - for (unsigned int i = 0; i < IntBy; i++) { - *out++ = static_cast(*in0++); - } - for (unsigned int i = 0; i < IntBy; i++) { - *out++ = static_cast(*in1++); - } - for (unsigned int i = 0; i < IntBy; i++) { - *out++ = static_cast(*in2++); - } - for (unsigned int i = 0; i < IntBy; i++) { - *out++ = static_cast(*in3++); - } - } - - static inline void Transform(TOut *out, const TIn *in, const int stride, const int x0, const int xmax, const int k0, const int kmax) { - const auto ldin = stride; - - TOut *outarray = out; - const TIn *inarray = in; - TOut *outptr_base = outarray; - const TIn *inptr_base = inarray + x0 + (k0 * ldin); - int ldout = (kmax - k0) * IntBy; - - int k=(kmax-k0); - for ( ; k>3; k-=4) { - TOut *outptr = outptr_base; - const TIn *inptr = inptr_base; - const TIn *inptr1 = inptr + ldin; - const TIn *inptr2 = inptr1 + ldin; - const TIn *inptr3 = inptr2 + ldin; - - prefetch_3x(inptr); - prefetch_3x(inptr1); - prefetch_3x(inptr2); - prefetch_3x(inptr3); - - outptr_base += IntBy * 4; - inptr_base += ldin * 4; - - for (int x = (xmax-x0) / IntBy; x > 0 ; x--) { - moveblock_1x4(inptr, inptr1, inptr2, inptr3, outptr); - outptr += ldout; - } - } - - if (k) { - TOut *outptr = outptr_base; - const TIn *inptr = inptr_base; - const TIn *inptr1 = inptr + ldin; - const TIn *inptr2 = inptr1 + ldin; - - prefetch_3x(inptr); - prefetch_3x(inptr1); - prefetch_3x(inptr2); - - for (int x = (xmax-x0) / IntBy; x > 0 ; x--) { - switch(k) { - case 3: - moveblock_1x2(inptr, inptr1, outptr); - moveblock_1x1(inptr2, outptr + IntBy * 2); - break; - - case 2: - moveblock_1x2(inptr, inptr1, outptr); - break; - - case 1: - moveblock_1x1(inptr, outptr); - break; - default: - break; - } - - outptr += ldout; - } - } - - // Cope with ragged X cases - const unsigned int 
overflow = (xmax - x0) % IntBy; - if (overflow) { - const TIn *inptr_base = inarray + (xmax - overflow) + (k0 * ldin); - TOut *outptr = outarray + ((xmax - x0) / IntBy) * ldout; - - for (int k=(kmax-k0); k>0; k--) { - const TIn *inptr = inptr_base; - inptr_base += ldin; - - for (unsigned int x=0; x < IntBy; x++) { - TOut val = (x < overflow) ? static_cast(*inptr++) : static_cast(0); - *outptr++ = val; - } - } - } -} -}; diff --git a/arm_compute/runtime/NEON/AssemblyHelper.h b/arm_compute/runtime/NEON/AssemblyHelper.h new file mode 100644 index 0000000000..2b304b8022 --- /dev/null +++ b/arm_compute/runtime/NEON/AssemblyHelper.h @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_ASSEMBLY_HELPER_H__ +#define __ARM_ASSEMBLY_HELPER_H__ + +#include "arm_compute/core/ITensor.h" +#include "support/ToolchainSupport.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/Log.h" +#include "arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapper.h" +#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +namespace arm_compute +{ +template +class AssemblyKernelGlue final +{ +public: + using TypeOperator = TypeInput; + using TypeResult = TypeOutput; + AssemblyKernelGlue() + : _gemm_kernel_asm(nullptr), _optimised_kernel(nullptr), _a(nullptr), _b(nullptr), _d(nullptr) + { + } + using AssemblyGemm = arm_gemm::GemmCommon; + + const AssemblyKernelGlue &operator=(const AssemblyKernelGlue &) = delete; + AssemblyKernelGlue(const AssemblyKernelGlue &) = delete; + + std::unique_ptr _gemm_kernel_asm; + std::unique_ptr _optimised_kernel; + const ITensor *_a; + const ITensor *_b; + ITensor *_d; + + /** Configures the arrays pointers and strides in the assembly kernel and executes the assembly kernel. 
+ * The call to set_arrays is needed to deal with the input sizes containing batches (dims > 2) + */ + inline void run() + { + const int lda = _a->info()->strides_in_bytes().y() / sizeof(TypeInput); + const int ldb = _b->info()->strides_in_bytes().y() / sizeof(TypeInput); + const int ldd = _d->info()->strides_in_bytes().y() / sizeof(TypeOutput); + + // Configure kernel window + Window window = calculate_max_window(*_d->info()); + const auto in1_ptr = reinterpret_cast(_b->buffer()); + + // Only iterate over batches + Window win(window); + win.set(0, Window::Dimension(0, 1, 1)); + win.set(1, Window::Dimension(0, 1, 1)); + Iterator in0(_a, window); + Iterator out(_d, window); + execute_window_loop(win, [&](const Coordinates &) + { + const auto in0_ptr = reinterpret_cast(in0.ptr()); + auto out_ptr = reinterpret_cast(out.ptr()); + _gemm_kernel_asm->set_arrays(in0_ptr, lda, in1_ptr, ldb, out_ptr, ldd); + NEScheduler::get().schedule(_optimised_kernel.get(), Window::DimX); + }, + in0, out); + } +}; + +using AssemblyKernelGlueF32 = AssemblyKernelGlue; +using AssemblyKernelGlueU8U32 = AssemblyKernelGlue; +using AssemblyKernelGlueS8S32 = AssemblyKernelGlue; + +inline void allocate_workspace(size_t workspace_size, Tensor &workspace, MemoryGroup &memory_group, size_t alignment, unsigned int num_threads) +{ + ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "size cannot be 0"); + workspace.allocator()->init(TensorInfo(TensorShape{ (workspace_size + alignment - 1) * num_threads }, 1, DataType::S8)); + workspace.allocator()->allocate(); +} + +template +std::unique_ptr> create_wrapper_kernel(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta) +{ + // rework this function, why are we checking data type and other things here ? should we create another function can_run_optimised_kernel() ? 
+#if defined(__arm__) + if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f)) + { + return support::cpp14::make_unique>(); + } +#elif defined(__aarch64__) + if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f)) + { + return support::cpp14::make_unique>(); + } + else if(a->info()->data_type() == DataType::F16 && (c == nullptr || beta == 0.f)) + { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + return support::cpp14::make_unique>(); +#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16."); +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + } +#endif /* defined(__arm__) || defined(__aarch64__) */ + return nullptr; +} + +template +inline bool setup_assembly_kernel(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, + Tensor &workspace, MemoryGroup &memory_group, T &asm_glue) +{ + const ::CPUInfo *ci = get_CPUInfo(); + const int M = d->info()->tensor_shape().y(); + const int N = d->info()->tensor_shape().x(); + const int K = a->info()->tensor_shape().x(); + unsigned int num_threads = NEScheduler::get().num_threads(); + // unique_ptr to a Gemm object + std::unique_ptr asm_gemm(arm_gemm::gemm(*ci, M, N, K, false, false, alpha, beta, num_threads, + false)); + + // arm_compute wrapper for the Gemm object (see above) + std::unique_ptr> acl_gemm_wrapper = create_wrapper_kernel(a, b, c, d, alpha, beta); + if(acl_gemm_wrapper != nullptr && asm_gemm != nullptr) + { + acl_gemm_wrapper->configure(asm_gemm.get()); + const size_t workspace_size = asm_gemm->get_working_size(); + if(workspace_size) + { + // Allocate workspace + allocate_workspace(workspace_size, workspace, memory_group, 4096, num_threads); + asm_gemm->set_working_space(reinterpret_cast(workspace.buffer())); + } + const unsigned int window_size = asm_gemm->get_window_size(); + if(window_size < num_threads) + { + num_threads = window_size; + asm_gemm->set_nthreads(num_threads); + } + asm_glue._gemm_kernel_asm = std::move(asm_gemm); + asm_glue._optimised_kernel = std::move(acl_gemm_wrapper); + // We need to setup the ptrs in the run() method + asm_glue._a = a; + asm_glue._b = b; + asm_glue._d = d; + return true; + } + return false; +} +} +#endif /* __ARM_ASSEMBLY_HELPER_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h index f2b6ef77bd..5279995be4 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMM.h +++ b/arm_compute/runtime/NEON/functions/NEGEMM.h @@ -25,7 +25,6 @@ #define __ARM_COMPUTE_NEGEMM_H__ #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" @@ -35,6 +34,8 @@ #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" +#include "arm_compute/runtime/NEON/AssemblyHelper.h" + #include namespace arm_compute @@ -73,19 +74,19 @@ public: void run() override; private: - MemoryGroup _memory_group; - NEGEMMInterleave4x4Kernel _interleave_kernel; - NEGEMMTranspose1xWKernel _transpose_kernel; - NEGEMMMatrixMultiplyKernel _mm_kernel; - std::unique_ptr _mm_optimised_kernel; - 
NEGEMMMatrixAdditionKernel _ma_kernel; - Tensor _tmp_a; - Tensor _tmp_b; - Tensor _workspace; - bool _run_vector_matrix_multiplication; - bool _run_addition; - bool _is_first_run; - bool _reshape_b_only_on_first_run; + MemoryGroup _memory_group; + NEGEMMInterleave4x4Kernel _interleave_kernel; + NEGEMMTranspose1xWKernel _transpose_kernel; + NEGEMMMatrixMultiplyKernel _mm_kernel; + AssemblyKernelGlueF32 _asm_glue; + NEGEMMMatrixAdditionKernel _ma_kernel; + Tensor _tmp_a; + Tensor _tmp_b; + Tensor _workspace; + bool _run_vector_matrix_multiplication; + bool _run_addition; + bool _is_first_run; + bool _reshape_b_only_on_first_run; }; } #endif /*__ARM_COMPUTE_NEGEMM_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h index ac5f4caa78..4ae8ee1fb3 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h @@ -36,6 +36,7 @@ #include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/AssemblyHelper.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" #include "arm_compute/runtime/Tensor.h" @@ -149,22 +150,14 @@ private: * @param[in] reshape_info (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped */ void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output, bool is_interleaved, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo()); - /** Prepare the appropriate assembly optimized kernel - * - * @param[in] ci CPU information - * @param[in] M M parameter of matrix multiplication - * @param[in] N N parameter of matrix multiplication - * @param[in] K K parameter of matrix multiplication - */ - void configure_asm_mm(const struct CPUInfo &ci, int M, int N, int K); private: + AssemblyKernelGlueF32 _asm_glue; MemoryGroup _memory_group; NEIm2ColKernel _input_im2col_kernel; NEGEMMInterleave4x4Kernel _input_interleave_kernel; NEConvolutionLayerReshapeWeights _reshape_weights; NEGEMMMatrixMultiplyKernel _mm_kernel; - std::unique_ptr _mm_optimised_kernel; NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp; NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint _gemmlowp_output_stage; NECol2ImKernel _output_col2im_kernel; diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h index 3d213a7668..f09c94e726 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h @@ -1,6 +1,6 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -29,6 +29,7 @@ #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/AssemblyHelper.h" #include "arm_compute/runtime/Tensor.h" #include @@ -58,6 +59,8 @@ public: private: MemoryGroup _memory_group; + AssemblyKernelGlueU8U32 _asm_glue_unsigned; + AssemblyKernelGlueS8S32 _asm_glue_signed; std::unique_ptr _mm_kernel; std::unique_ptr _mtx_a_reshape_kernel; std::unique_ptr _mtx_b_reshape_kernel; diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h index eddb3a26b7..95776f829a 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -30,6 +30,7 @@ #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/AssemblyHelper.h" #include "arm_compute/runtime/Tensor.h" #include @@ -48,7 +49,6 @@ class ITensor; * otherwise if the DOT product instruction is available: * * -# @ref NEGEMMInterleaveBlockedKernel - * -# @ref NEGEMMLowpAArch64V8P4Kernel * -# @ref NEGEMMLowpOffsetContributionKernel * */ @@ -90,6 +90,8 @@ public: private: MemoryGroup _memory_group; + AssemblyKernelGlueU8U32 _asm_glue_unsigned; + AssemblyKernelGlueS8S32 _asm_glue_signed; std::unique_ptr _mm_kernel; std::unique_ptr _mtx_a_reshape_kernel; std::unique_ptr _mtx_b_reshape_kernel; diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox index eb6130bda5..555cec5c35 100644 --- a/docs/00_introduction.dox +++ b/docs/00_introduction.dox @@ -195,6 +195,12 @@ If there is more than one release in a month then an extra sequential number is @subsection S2_2_changelog Changelog +v18.05 Public maintenance release + - Major redesign in the interface for the neon kernels implemented in assembly. + - Removed arm_compute::NEGEMMLowpAArch64A53Kernel / arm_compute::NEGEMMLowpAArch64Kernel / arm_compute::NEGEMMLowpAArch64V8P4Kernel / arm_compute::NEGEMMInterleavedBlockedKernel / arm_compute::NEGEMMLowpAssemblyMatrixMultiplyCore / arm_compute::NEHGEMMAArch64FP16Kernel + - Added NEGEMMAssemblyWrapper and AssemblyKernelGlue which are used to execute assembly kernels in neon functions. + - Minor changes to the CPUInfo type to make it compatible with the new assembly gemm interface. + v18.03 Public maintenance release - Various bug fixes. 
- Fixed bug in @ref NEActivationLayer @@ -301,8 +307,8 @@ v17.12 Public major release - @ref GCTransposeKernel / @ref GCTranspose - New NEON kernels / functions - - @ref NEGEMMLowpAArch64A53Kernel / @ref NEGEMMLowpAArch64Kernel / @ref NEGEMMLowpAArch64V8P4Kernel / NEGEMMInterleavedBlockedKernel / @ref NEGEMMLowpAssemblyMatrixMultiplyCore - - @ref NEHGEMMAArch64FP16Kernel + - arm_compute::NEGEMMLowpAArch64A53Kernel / arm_compute::NEGEMMLowpAArch64Kernel / arm_compute::NEGEMMLowpAArch64V8P4Kernel / arm_compute::NEGEMMInterleavedBlockedKernel / arm_compute::NEGEMMLowpAssemblyMatrixMultiplyCore + - arm_compute::NEHGEMMAArch64FP16Kernel - @ref NEDepthwiseConvolutionLayer3x3Kernel / @ref NEDepthwiseIm2ColKernel / @ref NEGEMMMatrixVectorMultiplyKernel / @ref NEDepthwiseVectorToTensorKernel / @ref NEDepthwiseConvolutionLayer - @ref NEGEMMLowpOffsetContributionKernel / @ref NEGEMMLowpMatrixAReductionKernel / @ref NEGEMMLowpMatrixBReductionKernel / @ref NEGEMMLowpMatrixMultiplyCore - @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel / @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint @@ -340,7 +346,7 @@ v17.09 Public major release - New validation and benchmark frameworks (Boost and Google frameworks replaced by homemade framework). - Most machine learning functions support both fixed point 8 and 16 bit (QS8, QS16) for both NEON and OpenCL. - New NEON kernels / functions: - - @ref NEGEMMAssemblyBaseKernel @ref NEGEMMAArch64Kernel + - arm_compute::NEGEMMAssemblyBaseKernel arm_compute::NEGEMMAArch64Kernel - @ref NEDequantizationLayerKernel / @ref NEDequantizationLayer - @ref NEFloorKernel / @ref NEFloor - @ref NEL2NormalizeLayerKernel / @ref NEL2NormalizeLayer diff --git a/examples/graph_inception_v4.cpp b/examples/graph_inception_v4.cpp index d9f6156fb2..6f76b5e2e7 100644 --- a/examples/graph_inception_v4.cpp +++ b/examples/graph_inception_v4.cpp @@ -21,6 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#include "arm_compute/core/Error.h" #include "arm_compute/graph2.h" #include "support/ToolchainSupport.h" #include "utils/GraphUtils.h" @@ -43,6 +44,9 @@ class InceptionV4Example final : public Example public: void do_setup(int argc, char **argv) override { + // Disabled the test for now because the process gets killed on Linux Firefly 32 bit even when using ConvolutionMethodHint::DIRECT. + // Needs to review/rework to run the code below. 
+#if __aarch64__ std::string data_path; /* Path to the trainable data */ std::string image; /* Image data */ std::string label; /* Label data */ @@ -90,7 +94,6 @@ public: graph << target_hint << InputLayer(TensorDescriptor(TensorShape(299U, 299U, 3U, 1U), DataType::F32), get_input_accessor(image, std::move(preprocessor), false)) - // Conv2d_1a_3x3 << ConvolutionLayer(3U, 3U, 32U, get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_1a_3x3_weights.npy"), @@ -157,11 +160,18 @@ public: // Finalize graph graph.finalize(target_hint, enable_tuning, enable_memory_management); +#else /* __aarch64__ */ + using namespace arm_compute; + ARM_COMPUTE_UNUSED(argc); + ARM_COMPUTE_UNUSED(argv); +#endif /* __aarch64__ */ } void do_run() override { +#if __aarch64__ graph.run(); +#endif /* __aarch64__ */ } private: diff --git a/scripts/check_bad_style.sh b/scripts/check_bad_style.sh index e5dc15c218..292cf518cd 100755 --- a/scripts/check_bad_style.sh +++ b/scripts/check_bad_style.sh @@ -4,7 +4,7 @@ set -e DIRECTORIES="./arm_compute ./src ./examples ./tests ./utils ./support" -grep -HrnP --exclude-dir=assembly --exclude-dir=convolution "/\*\*$" $DIRECTORIES | tee bad_style.log +grep -HrnP --exclude-dir=assembly --exclude-dir=convolution --exclude-dir=arm_gemm "/\*\*$" $DIRECTORIES | tee bad_style.log if (( `cat bad_style.log | wc -l` > 0 )) then echo "" @@ -12,7 +12,7 @@ then exit -1 fi -grep -Hnr --exclude-dir=assembly --exclude-dir=convolution --exclude=Doxyfile "@brief" $DIRECTORIES | tee bad_style.log +grep -Hnr --exclude-dir=assembly --exclude-dir=convolution --exclude-dir=arm_gemm --exclude=Doxyfile "@brief" $DIRECTORIES | tee bad_style.log if (( `cat bad_style.log | wc -l` > 0 )) then echo "" @@ -20,7 +20,7 @@ then exit -1 fi -grep -HnRE --exclude-dir=assembly --exclude-dir=convolution "\buint " --exclude-dir=cl_kernels --exclude-dir=cs_shaders $DIRECTORIES | tee bad_style.log +grep -HnRE --exclude-dir=assembly --exclude-dir=convolution --exclude-dir=arm_gemm "\buint " --exclude-dir=cl_kernels --exclude-dir=cs_shaders $DIRECTORIES | tee bad_style.log if [[ $(cat bad_style.log | wc -l) > 0 ]] then echo "" @@ -28,7 +28,7 @@ then exit -1 fi -grep -HnR --exclude-dir=assembly --exclude-dir=convolution "float32_t" $DIRECTORIES | tee bad_style.log +grep -HnR --exclude-dir=assembly --exclude-dir=convolution --exclude-dir=arm_gemm "float32_t" $DIRECTORIES | tee bad_style.log if [[ $(cat bad_style.log | wc -l) > 0 ]] then echo "" @@ -36,7 +36,7 @@ then exit -1 fi -grep -Hnir --exclude-dir=assembly --exclude-dir=convolution "arm[_ ]\?cv" $DIRECTORIES | tee bad_style.log +grep -Hnir --exclude-dir=assembly --exclude-dir=convolution --exclude-dir=arm_gemm "arm[_ ]\?cv" $DIRECTORIES | tee bad_style.log if [[ $(cat bad_style.log | wc -l) > 0 ]] then echo "" @@ -44,7 +44,7 @@ then exit -1 fi -grep -Hnir --exclude-dir=assembly --exclude-dir=convolution "#.*if.*defined[^(]" $DIRECTORIES | tee bad_style.log +grep -Hnir --exclude-dir=assembly --exclude-dir=convolution --exclude-dir=arm_gemm "#.*if.*defined[^(]" $DIRECTORIES | tee bad_style.log if [[ $(cat bad_style.log | wc -l) > 0 ]] then echo "" @@ -52,7 +52,7 @@ then exit -1 fi -grep -Hnir --exclude-dir=assembly --exclude-dir=convolution "#else$\|#endif$" $DIRECTORIES | tee bad_style.log +grep -Hnir --exclude-dir=assembly --exclude-dir=convolution --exclude-dir=arm_gemm "#else$\|#endif$" $DIRECTORIES | tee bad_style.log if [[ $(cat bad_style.log | wc -l) > 0 ]] then echo "" @@ -60,7 +60,7 @@ then exit -1 fi -grep -Hnir --exclude-dir=assembly 
--exclude-dir=convolution "ARM_COMPUTE_AARCH64_V8_2" ./tests/validation/CL | tee bad_style.log +grep -Hnir --exclude-dir=assembly --exclude-dir=convolution --exclude-dir=arm_gemm "ARM_COMPUTE_AARCH64_V8_2" ./tests/validation/CL | tee bad_style.log if [[ $(cat bad_style.log | wc -l) > 0 ]] then echo "" diff --git a/scripts/clang_tidy_rules.py b/scripts/clang_tidy_rules.py index d6deee9b68..e5793e5061 100755 --- a/scripts/clang_tidy_rules.py +++ b/scripts/clang_tidy_rules.py @@ -40,6 +40,9 @@ def filter_clang_tidy_lines( lines ): if "/assembly/" in line: continue + if "/arm_gemm/" in line: + continue + if "/convolution/" in line: continue @@ -90,6 +93,8 @@ def filter_clang_tidy_lines( lines ): ("parameter 'memory_manager' is copied for each invocation but only used as a const reference" in line) or ("DeconvolutionLayer.cpp" in line and "casting (double + 0.5) to integer leads to incorrect rounding; consider using lround" in line) or ("NEWinogradLayerKernel.cpp" in line and "use '= default' to define a trivial destructor" in line) or + ("NEGEMMLowpMatrixMultiplyCore.cpp" in line and "constructor does not initialize these fields" in line) or + ("NEGEMMLowpAssemblyMatrixMultiplyCore" in line and "constructor does not initialize these fields" in line) or "3rdparty" in line): print_context=False continue diff --git a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp index d4ce3888fd..5d1464ace4 100644 --- a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp +++ b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp @@ -152,9 +152,9 @@ GCKernel::GCKernel(std::string name, GLuint program) ARM_COMPUTE_GL_CHECK(glGenBuffers(1, &_shader_params_ubo_name)); _shader_params_index = ARM_COMPUTE_GL_CHECK(glGetUniformBlockIndex(_program, _shader_params_name)); - ARM_COMPUTE_ERROR_ON_MSG((_shader_params_index == GL_INVALID_INDEX), "Failed to get index of %s", _shader_params_name); + ARM_COMPUTE_ERROR_ON_MSG(_shader_params_index == GL_INVALID_INDEX, "Failed to get index of %s", _shader_params_name); ARM_COMPUTE_GL_CHECK(glGetActiveUniformBlockiv(_program, _shader_params_index, GL_UNIFORM_BLOCK_DATA_SIZE, &_shader_params_size)); - ARM_COMPUTE_ERROR_ON_MSG((_shader_params_size == 0), "Failed to get size of %s", _shader_params_name); + ARM_COMPUTE_ERROR_ON_MSG(_shader_params_size == 0, "Failed to get size of %s", _shader_params_name); } void GCKernel::cleanup() diff --git a/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp b/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp deleted file mode 100644 index bffcbbf436..0000000000 --- a/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/AccessWindowTranspose.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "support/ToolchainSupport.h" - -namespace arm_compute -{ -#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp" -#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp" -} // namespace arm_compute - -#include -#include -#include -#include - -namespace arm_compute -{ -void NEGEMMAArch32Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output); - - _input0 = input0; - _input1 = input1; - _output = output; - _workspace = workspace; - _alpha = alpha; - _beta = beta; - _is_transposed_0 = is_transposed_0; - _is_transposed_1 = is_transposed_1; - - // Configure kernel window - Window win = calculate_max_window(*output->info()); - - AccessWindowRectangle output_access(output->info(), 0, 0, 8, 6); - - const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 6); - const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 8); - - update_window_and_padding(win, - AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()), - AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()), - output_access); - - INEKernel::configure(win); -} - -void NEGEMMAArch32Kernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - - const int lda = _input0->info()->strides_in_bytes().y() / sizeof(float); - const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(float); - const int ldc = _output->info()->strides_in_bytes().y() / sizeof(float); - - const auto in1_ptr = reinterpret_cast(_input1->buffer()); - - const int M = std::min(_output->info()->tensor_shape().y(), static_cast(window.y().end())) - window.y().start(); - const int N = _output->info()->tensor_shape().x(); - const int K = _input0->info()->tensor_shape().x(); - - // Only iterate over batches - Window win(window); - win.set(0, Window::Dimension(0, 1, 1)); - win.set(1, Window::Dimension(0, 1, 1)); - - Iterator in0(_input0, window); - Iterator out(_output, window); - - GemmInterleaved gemm(&info.cpu_info, M, N, K, _is_transposed_0, _is_transposed_1); - constexpr size_t alignment = 4096; - const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id; - void 
*workspace = _workspace->buffer() + offset; - size_t workspace_size = _workspace->info()->total_size(); - - if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr) - { - ARM_COMPUTE_ERROR("Not enough space to align buffer!"); - } - - execute_window_loop(win, [&](const Coordinates & id) - { - gemm.execute(reinterpret_cast(in0.ptr()), lda, - reinterpret_cast(in1_ptr), ldb, - reinterpret_cast(out.ptr()), ldc, - _alpha, _beta, workspace); - }, - in0, out); -} -} // namespace arm_compute diff --git a/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp deleted file mode 100644 index 0eaa9aa39b..0000000000 --- a/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/AccessWindowTranspose.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "support/ToolchainSupport.h" - -namespace arm_compute -{ -#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp" -#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp" -} // namespace arm_compute - -#include -#include -#include -#include - -namespace arm_compute -{ -void NEGEMMAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output); - - _input0 = input0; - _input1 = input1; - _output = output; - _workspace = workspace; - _alpha = alpha; - _beta = beta; - _is_transposed_0 = is_transposed_0; - _is_transposed_1 = is_transposed_1; - - // Configure kernel window - Window win = calculate_max_window(*output->info()); - - AccessWindowRectangle output_access(output->info(), 0, 0, 12, 8); - - const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 8); - const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 12); - - update_window_and_padding(win, - AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()), - AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()), - output_access); - - INEKernel::configure(win); -} - -void NEGEMMAArch64Kernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - - const int lda = _input0->info()->strides_in_bytes().y() / sizeof(float); - const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(float); - const int ldc = _output->info()->strides_in_bytes().y() / sizeof(float); - - const auto in1_ptr = reinterpret_cast(_input1->buffer()); - - const int M = std::min(_output->info()->tensor_shape().y(), static_cast(window.y().end())) - window.y().start(); - const int N = _output->info()->tensor_shape().x(); - const int K = _input0->info()->tensor_shape().x(); - - // Only iterate over batches - Window win(window); - win.set(0, Window::Dimension(0, 1, 1)); - win.set(1, Window::Dimension(0, 1, 1)); - - Iterator in0(_input0, window); - Iterator out(_output, window); - - GemmInterleaved gemm(&info.cpu_info, M, N, K, _is_transposed_0, _is_transposed_1); - constexpr size_t alignment = 4096; - const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id; - void *workspace = _workspace->buffer() + offset; - size_t workspace_size = _workspace->info()->total_size(); - - if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr) - { - ARM_COMPUTE_ERROR("Not enough space to align buffer!"); - } - - 
execute_window_loop(win, [&](const Coordinates & id) - { - gemm.execute(reinterpret_cast(in0.ptr()), lda, - reinterpret_cast(in1_ptr), ldb, - reinterpret_cast(out.ptr()), ldc, - _alpha, _beta, workspace); - }, - in0, out); -} -} // namespace arm_compute diff --git a/src/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.cpp deleted file mode 100644 index 0b3212bf55..0000000000 --- a/src/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.cpp +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/AccessWindowTranspose.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "support/ToolchainSupport.h" - -namespace arm_compute -{ -#include "arm_compute/core/NEON/kernels/convolution/winograd/gemm.hpp" -} // namespace arm_compute - -#include -#include -#include -#include - -namespace arm_compute -{ -void NEGEMMAArch64NativeKernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, - bool is_transposed_1) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output); - - _input0 = input0; - _input1 = input1; - _output = output; - _workspace = workspace; - _alpha = alpha; - _beta = beta; - _is_transposed_0 = is_transposed_0; - _is_transposed_1 = is_transposed_1; - - // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps(16U, 4U)); - - const int input0_access_end_x = ceil_to_multiple(input0->info()->tensor_shape().x(), 4); - const int input0_access_end_y = ceil_to_multiple(input0->info()->tensor_shape().y(), 4); - const int input1_access_end_x = ceil_to_multiple(input1->info()->tensor_shape().x(), 16); - - 
AccessWindowStatic input0_access(input0->info(), 0, 0, input0_access_end_x, input0_access_end_y); - AccessWindowStatic input1_access(input1->info(), 0, 0, input1_access_end_x, input1->info()->tensor_shape().y()); - AccessWindowRectangle output_access(output->info(), 0, 0, 16, 4); - update_window_and_padding(win, input0_access, input1_access, output_access); - - INEKernel::configure(win); -} - -void NEGEMMAArch64NativeKernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - ARM_COMPUTE_UNUSED(info); - - const auto in1_ptr = reinterpret_cast(_input1->buffer()); - - // Calculate row strides for each matrix - const int lda = _input0->info()->strides_in_bytes().y() / sizeof(float); - const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(float); - const int ldc = _output->info()->strides_in_bytes().y() / sizeof(float); - - // Calculate matrix sizes - const int M = std::min(_input0->info()->tensor_shape().y(), static_cast(window.y().end())) - window.y().start(); - const int K = _input0->info()->tensor_shape().x(); - const int N = _input1->info()->tensor_shape().x(); - - // Create window (Only iterate over batches) - Window win(window); - win.set(0, Window::Dimension(0, 1, 1)); - win.set(1, Window::Dimension(0, 1, 1)); - - // Create Iterators - Iterator in0(_input0, window); - Iterator out(_output, window); - - // Execute GEMM - execute_window_loop(win, [&](const Coordinates & id) - { - BlockedGemm<4, 16, float, float>(reinterpret_cast(in0.ptr()), - reinterpret_cast(in1_ptr), - reinterpret_cast(out.ptr()), - M, K, N, - lda, ldb, ldc); - }, - in0, out); -} -} // namespace arm_compute diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp deleted file mode 100644 index 80606dcc07..0000000000 --- a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "support/ToolchainSupport.h" - -namespace arm_compute -{ -#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp" -#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp" -#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp" -} // namespace arm_compute - -#include -#include -#include - -// Enable only if compiled for AArch64-V8A targets -#ifdef ARM_COMPUTE_AARCH64_V8A - -namespace arm_compute -{ -NEGEMMLowpAArch64A53Kernel::NEGEMMLowpAArch64A53Kernel() - : _func(nullptr) -{ -} - -void gemm_interleaved_s16_12x8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1, - const Window &window, - const ThreadInfo &info) -{ - const int lda = input0->info()->strides_in_bytes().y(); - const int ldb = input1->info()->strides_in_bytes().y(); - const int ldc = output->info()->strides_in_bytes().y() / sizeof(int32_t); - - const auto in1_ptr = reinterpret_cast(input1->buffer()); - - const int M = std::min(output->info()->tensor_shape().y(), static_cast(window.y().end())) - window.y().start(); - const int N = output->info()->tensor_shape().x(); - const int K = input0->info()->tensor_shape().x(); - - // Only iterate over batches - Window win(window); - win.set(0, Window::Dimension(0, 1, 1)); - win.set(1, Window::Dimension(0, 1, 1)); - - Iterator in0(input0, window); - Iterator out(output, window); - - GemmInterleaved gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1); - - constexpr size_t alignment = 4096; - const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id; - void *_workspace = workspace->buffer() + offset; - size_t workspace_size = workspace->info()->total_size(); - - if(support::cpp11::align(alignment, gemm.get_working_size(), _workspace, workspace_size) == nullptr) - { - ARM_COMPUTE_ERROR("Not enough space to align buffer!"); - } - - execute_window_loop(win, [&](const Coordinates & id) - { - gemm.execute(reinterpret_cast(in0.ptr()), lda, - reinterpret_cast(in1_ptr), ldb, - reinterpret_cast(out.ptr()), ldc, - alpha, beta, _workspace); - }, - in0, out); -} - -void gemm_interleaved_u16_12x8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1, - const Window &window, - const ThreadInfo &info) -{ - const int lda = input0->info()->strides_in_bytes().y(); - const int ldb = input1->info()->strides_in_bytes().y(); - const int ldc = output->info()->strides_in_bytes().y() / sizeof(int32_t); - - const auto in1_ptr = reinterpret_cast(input1->buffer()); - - const int M = std::min(output->info()->tensor_shape().y(), static_cast(window.y().end())) - window.y().start(); - const int N = output->info()->tensor_shape().x(); - const int K = input0->info()->tensor_shape().x(); - - // Only iterate over batches - Window win(window); - win.set(0, Window::Dimension(0, 1, 1)); - win.set(1, Window::Dimension(0, 1, 1)); - - Iterator 
in0(input0, window); - Iterator out(output, window); - - GemmInterleaved gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1); - - constexpr size_t alignment = 4096; - const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id; - void *_workspace = workspace->buffer() + offset; - size_t workspace_size = workspace->info()->total_size(); - - if(support::cpp11::align(alignment, gemm.get_working_size(), _workspace, workspace_size) == nullptr) - { - ARM_COMPUTE_ERROR("Not enough space to align buffer!"); - } - - execute_window_loop(win, [&](const Coordinates & id) - { - gemm.execute(reinterpret_cast(in0.ptr()), lda, - reinterpret_cast(in1_ptr), ldb, - reinterpret_cast(out.ptr()), ldc, - alpha, beta, _workspace); - }, - in0, out); -} - -void NEGEMMLowpAArch64A53Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, - bool is_transposed_1) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::S8, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); - - _input0 = input0; - _input1 = input1; - _output = output; - _workspace = workspace; - _alpha = alpha; - _beta = beta; - _is_transposed_0 = is_transposed_0; - _is_transposed_1 = is_transposed_1; - - switch(input0->info()->data_type()) - { - case DataType::S8: - _func = &gemm_interleaved_s16_12x8; - break; - case DataType::U8: - _func = &gemm_interleaved_u16_12x8; - break; - default: - ARM_COMPUTE_ERROR("Element size not supported"); - break; - } - - // Configure kernel window - Window win = calculate_max_window(*output->info()); - - AccessWindowRectangle output_access(output->info(), 0, 0, 12, 8); - - const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 12); - const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 12); - - update_window_and_padding(win, - AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()), - AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()), - output_access); - - INEKernel::configure(win); -} - -void NEGEMMLowpAArch64A53Kernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - - (*_func)(_input0, _input1, _output, _workspace, _alpha, _beta, _is_transposed_0, _is_transposed_1, window, info); -} -} // namespace arm_compute -#endif /* ARM_COMPUTE_AARCH64_V8A */ diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp deleted file mode 100644 index 38f82f0407..0000000000 --- a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "support/ToolchainSupport.h" - -namespace arm_compute -{ -#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp" -#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp" -#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp" -} // namespace arm_compute - -#include -#include -#include - -// Enable only if compiled for AArch64-V8A targets -#ifdef ARM_COMPUTE_AARCH64_V8A - -namespace arm_compute -{ -NEGEMMLowpAArch64Kernel::NEGEMMLowpAArch64Kernel() - : _func(nullptr) -{ -} - -void gemm_interleaved_s8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1, const Window &window, - const ThreadInfo &info) -{ - const int lda = input0->info()->strides_in_bytes().y(); - const int ldb = input1->info()->strides_in_bytes().y(); - const int ldc = output->info()->strides_in_bytes().y() / sizeof(int32_t); - - const auto in1_ptr = reinterpret_cast(input1->buffer()); - - const int M = std::min(output->info()->tensor_shape().y(), static_cast(window.y().end())) - window.y().start(); - const int N = output->info()->tensor_shape().x(); - const int K = input0->info()->tensor_shape().x(); - - // Only iterate over batches - Window win(window); - win.set(0, Window::Dimension(0, 1, 1)); - win.set(1, Window::Dimension(0, 1, 1)); - - Iterator in0(input0, window); - Iterator out(output, window); - - GemmInterleaved gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1); - - constexpr size_t alignment = 4096; - const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id; - void *_workspace = workspace->buffer() + offset; - size_t workspace_size = workspace->info()->total_size(); - - if(support::cpp11::align(alignment, gemm.get_working_size(), _workspace, workspace_size) == nullptr) - { - ARM_COMPUTE_ERROR("Not enough space to align 
buffer!"); - } - - execute_window_loop(win, [&](const Coordinates & id) - { - gemm.execute(reinterpret_cast(in0.ptr()), lda, - reinterpret_cast(in1_ptr), ldb, - reinterpret_cast(out.ptr()), ldc, - alpha, beta, _workspace); - }, - in0, out); -} - -void gemm_interleaved_u8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1, const Window &window, - const ThreadInfo &info) -{ - const int lda = input0->info()->strides_in_bytes().y(); - const int ldb = input1->info()->strides_in_bytes().y(); - const int ldc = output->info()->strides_in_bytes().y() / sizeof(uint32_t); - - const auto in1_ptr = reinterpret_cast(input1->buffer()); - - const int M = std::min(output->info()->tensor_shape().y(), static_cast(window.y().end())) - window.y().start(); - const int N = output->info()->tensor_shape().x(); - const int K = input0->info()->tensor_shape().x(); - - // Only iterate over batches - Window win(window); - win.set(0, Window::Dimension(0, 1, 1)); - win.set(1, Window::Dimension(0, 1, 1)); - - Iterator in0(input0, window); - Iterator out(output, window); - - GemmInterleaved gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1); - - constexpr size_t alignment = 4096; - const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id; - void *_workspace = workspace->buffer() + offset; - size_t workspace_size = workspace->info()->total_size(); - - if(support::cpp11::align(alignment, gemm.get_working_size(), _workspace, workspace_size) == nullptr) - { - ARM_COMPUTE_ERROR("Not enough space to align buffer!"); - } - - execute_window_loop(win, [&](const Coordinates & id) - { - gemm.execute(reinterpret_cast(in0.ptr()), lda, - reinterpret_cast(in1_ptr), ldb, - reinterpret_cast(out.ptr()), ldc, - alpha, beta, _workspace); - }, - in0, out); -} - -void NEGEMMLowpAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, - bool is_transposed_1) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::S8, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::U32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); - - _input0 = input0; - _input1 = input1; - _output = output; - _workspace = workspace; - _alpha = alpha; - _beta = beta; - _is_transposed_0 = is_transposed_0; - _is_transposed_1 = is_transposed_1; - - switch(input0->info()->data_type()) - { - case DataType::S8: - _func = &gemm_interleaved_s8; - break; - case DataType::U8: - _func = &gemm_interleaved_u8; - break; - default: - ARM_COMPUTE_ERROR("Element size not supported"); - break; - } - - // Configure kernel window - Window win = calculate_max_window(*output->info()); - - AccessWindowRectangle output_access(output->info(), 0, 0, 4, 4); - - const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 4); - const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 4); - - update_window_and_padding(win, - AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()), - AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()), - output_access); - - INEKernel::configure(win); -} - -void NEGEMMLowpAArch64Kernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - 
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - - (*_func)(_input0, _input1, _output, _workspace, _alpha, _beta, _is_transposed_0, _is_transposed_1, window, info); -} -} // namespace arm_compute -#endif /* ARM_COMPUTE_AARCH64_V8A */ diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp deleted file mode 100644 index d4fcf5e3cb..0000000000 --- a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "support/ToolchainSupport.h" - -namespace arm_compute -{ -#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp" -#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp" -#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp" -} // namespace arm_compute - -#include -#include -#include - -// Enable only if compiled for AArch64-V8.2-A targets -#ifdef ARM_COMPUTE_AARCH64_V8_2 - -namespace -{ -using namespace arm_compute; - -Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::U8, DataType::S8); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output) -{ - // Configure kernel window - Window win = calculate_max_window(*output); - - AccessWindowRectangle output_access(output, 0, 0, 12, 8); - - const int input0_access_end = ceil_to_multiple(input0->tensor_shape().x(), 8); - const int input1_access_end = 
ceil_to_multiple(input1->tensor_shape().x(), 12); - - bool window_changed = update_window_and_padding(win, - AccessWindowStatic(input0, 0, 0, input0_access_end, input0->tensor_shape().y()), - AccessWindowStatic(input1, 0, 0, input1_access_end, input1->tensor_shape().y()), - output_access); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} - -template -void *align_workspace(GemmInterleaved &gemm, const ThreadInfo &info, ITensor *ws) -{ - constexpr size_t alignment = 4096; - const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id; - void *workspace = ws->buffer() + offset; - size_t workspace_size = ws->info()->total_size(); - - if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr) - { - ARM_COMPUTE_ERROR("Not enough space to align buffer!"); - } - return workspace; -} - -template -void execute_gemm(const Window &win, Iterator &in0, Iterator &in1, Iterator &out, - const ThreadInfo &info, ITensor *ws, int M, int N, int K, bool is_transposed_0, bool is_transposed_1, - int lda, int ldb, int ldc, float alpha, float beta) -{ - ARM_COMPUTE_UNUSED(M); - ARM_COMPUTE_UNUSED(N); - ARM_COMPUTE_UNUSED(K); - ARM_COMPUTE_UNUSED(is_transposed_0); - ARM_COMPUTE_UNUSED(is_transposed_1); - GemmInterleaved gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1); - void *workspace = align_workspace(gemm, info, ws); - execute_window_loop(win, [&](const Coordinates & id) - { - gemm.execute(reinterpret_cast(in0.ptr()), lda, - reinterpret_cast(in1.ptr()), ldb, - reinterpret_cast(out.ptr()), ldc, - alpha, beta, workspace); - }, - in0, out); -} -} // namespace - -namespace arm_compute -{ -void NEGEMMLowpAArch64V8P4Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, - bool is_transposed_1) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info())); - - _input0 = input0; - _input1 = input1; - _output = output; - _workspace = workspace; - _alpha = alpha; - _beta = beta; - _is_transposed_0 = is_transposed_0; - _is_transposed_1 = is_transposed_1; - - // Configure kernel window - auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - INEKernel::configure(win_config.second); -} - -Status NEGEMMLowpAArch64V8P4Kernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first); - - return Status{}; -} - -void NEGEMMLowpAArch64V8P4Kernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - - const int lda = _input0->info()->strides_in_bytes().y(); - const int ldb = _input1->info()->strides_in_bytes().y(); - const int ldc = _output->info()->strides_in_bytes().y() / sizeof(uint32_t); - - const int M = std::min(_output->info()->tensor_shape().y(), static_cast(window.y().end())) - window.y().start(); - const int N = _output->info()->tensor_shape().x(); - const int K = 
_input0->info()->tensor_shape().x(); - - // Only iterate over batches - Window win(window); - win.set(0, Window::Dimension(0, 1, 1)); - win.set(1, Window::Dimension(0, 1, 1)); - - Iterator in0(_input0, window); - Iterator in1(_input1, window); - Iterator out(_output, window); - - switch(_input0->info()->data_type()) - { - case DataType::QASYMM8: - case DataType::U8: - { - execute_gemm(win, in0, in1, out, info, _workspace, M, N, K, _is_transposed_0, _is_transposed_1, lda, ldb, ldc, _alpha, _beta); - break; - } - case DataType::S8: - { - execute_gemm(win, in0, in1, out, info, _workspace, M, N, K, _is_transposed_0, _is_transposed_1, lda, ldb, ldc, _alpha, _beta); - break; - } - default: - { - ARM_COMPUTE_ERROR("Not supported."); - break; - } - } -} -} // namespace arm_compute -#endif /* ARM_COMPUTE_AARCH64_V8_2 */ diff --git a/src/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.cpp deleted file mode 100644 index 163014b04f..0000000000 --- a/src/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.cpp +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/AccessWindowTranspose.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "support/ToolchainSupport.h" - -namespace arm_compute -{ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wswitch-default" -#pragma GCC diagnostic ignored "-Weffc++" -#include "arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp" -#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp" -#pragma GCC diagnostic pop -} // namespace arm_compute - -#include -#include -#include -#include - -namespace arm_compute -{ -void NEGEMVAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output); - - _input0 = input0; - _input1 = input1; - _output = output; - _workspace = workspace; - _alpha = alpha; - _beta = beta; - _is_transposed_0 = is_transposed_0; - _is_transposed_1 = is_transposed_1; - - // Configure kernel window - Window win = calculate_max_window(*output->info()); - - AccessWindowRectangle output_access(output->info(), 0, 0, 12, 1); - - const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 12); - const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 12); - - update_window_and_padding(win, - AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()), - AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()), - output_access); - - INEKernel::configure(win); -} - -void NEGEMVAArch64Kernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - - const int lda = _input0->info()->strides_in_bytes().y() / sizeof(sgemv_trans::operand_type); - const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(sgemv_trans::operand_type); - const int ldc = _output->info()->strides_in_bytes().y() / sizeof(sgemv_trans::result_type); - - const auto in1_ptr = reinterpret_cast(_input1->buffer()); - - const int N = _output->info()->tensor_shape().x(); - const int K = _input0->info()->tensor_shape().x(); - - // Only iterate over batches - Window win(window); - win.set(0, Window::Dimension(0, 1, 1)); - win.set(1, Window::Dimension(0, 1, 1)); - - Iterator in0(_input0, window); - Iterator out(_output, window); - - GemvTransposed gemm(&info.cpu_info, N, K); - constexpr size_t alignment = 4096; - const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id; - void *workspace = _workspace->buffer() + offset; - size_t workspace_size = _workspace->info()->total_size(); - - if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr) - { - 
ARM_COMPUTE_ERROR("Not enough space to align buffer!"); - } - - execute_window_loop(win, [&](const Coordinates & id) - { - gemm.execute(reinterpret_cast(in0.ptr()), lda, - reinterpret_cast(in1_ptr), ldb, - reinterpret_cast(out.ptr()), ldc, - _alpha, _beta, workspace); - }, - in0, out); -} -} // namespace arm_compute diff --git a/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp b/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp deleted file mode 100644 index e84409cfd2..0000000000 --- a/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/AccessWindowTranspose.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "support/ToolchainSupport.h" - -namespace arm_compute -{ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wswitch-default" -#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp" -#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp" -#pragma GCC diagnostic pop -} // namespace arm_compute - -namespace arm_compute -{ -void NEHGEMMAArch64FP16Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, - bool is_transposed_1) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output); - - _input0 = input0; - _input1 = input1; - _output = output; - _workspace = workspace; - _alpha = alpha; - _beta = beta; - _is_transposed_0 = is_transposed_0; - _is_transposed_1 = is_transposed_1; - - // Configure kernel window - Window win = calculate_max_window(*output->info()); - - AccessWindowRectangle output_access(output->info(), 0, 0, 24, 8); - - const int input0_access_end = 
ceil_to_multiple(input0->info()->tensor_shape().x(), 8); - const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 24); - - update_window_and_padding(win, - AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()), - AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()), - output_access); - - INEKernel::configure(win); -} - -void NEHGEMMAArch64FP16Kernel::run(const Window &window, const ThreadInfo &info) -{ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - - const int lda = _input0->info()->strides_in_bytes().y() / sizeof(hgemm_24x8::operand_type); - const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(hgemm_24x8::operand_type); - const int ldc = _output->info()->strides_in_bytes().y() / sizeof(hgemm_24x8::result_type); - - const auto in1_ptr = reinterpret_cast(_input1->buffer()); - - const int M = std::min(_output->info()->tensor_shape().y(), static_cast(window.y().end())) - window.y().start(); - const int N = _output->info()->tensor_shape().x(); - const int K = _input0->info()->tensor_shape().x(); - - // Only iterate over batches - Window win(window); - win.set(0, Window::Dimension(0, 1, 1)); - win.set(1, Window::Dimension(0, 1, 1)); - - Iterator in0(_input0, window); - Iterator out(_output, window); - - GemmInterleaved gemm(&info.cpu_info, M, N, K, _is_transposed_0, _is_transposed_1); - constexpr size_t alignment = 4096; - const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id; - void *workspace = _workspace->buffer() + offset; - size_t workspace_size = _workspace->info()->total_size(); - - if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr) - { - ARM_COMPUTE_ERROR("Not enough space to align buffer!"); - } - - execute_window_loop(win, [&](const Coordinates & id) - { - gemm.execute(reinterpret_cast(in0.ptr()), lda, - reinterpret_cast(in1_ptr), ldb, - reinterpret_cast(out.ptr()), ldc, - _alpha, 1.f, workspace); - }, - in0, out); -#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - ARM_COMPUTE_UNUSED(window); - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16."); -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ -} -} // namespace arm_compute diff --git a/src/core/NEON/kernels/arm_gemm/asmlib.hpp b/src/core/NEON/kernels/arm_gemm/asmlib.hpp new file mode 100644 index 0000000000..b3fcb33bfb --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/asmlib.hpp @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ +// Macro to use in assembler to get a preload. Needed because of various +// workarounds needed to get working preload behaviour. +// +// Code using these macros needs to clobber x20 and x21 as they might be +// used by the workaround. + +// "Correct" version +#define ASM_PREFETCH(address) "PRFM PLDL1KEEP, " address "\n" +#define ASM_PREFETCHL2(address) "PRFM PLDL2KEEP, " address "\n" +#define ASM_PREFETCHW(address) "PRFM PSTL1KEEP, " address "\n" +#define ASM_PREFETCHWL2(address) "PRFM PSTL2KEEP, " address "\n" + +// Lee's uarchsim hack +//#define ASM_PREFETCH(address) "LDNP x20, x21, " address "\n" + +// No preload at all +//#define ASM_PREFETCH(address) "" +#else + +// "Correct" versions for AArch32 +#define ASM_PREFETCH(address) "PLD " address "\n" +#define ASM_PREFETCHW(address) "PLDW " address "\n" + +#endif + +/* + * Do some prefetches. + */ +template +static inline void prefetch_6x(const T *pfp) +{ + __asm __volatile( + ASM_PREFETCH("[%[pfp]]") + ASM_PREFETCH("[%[pfp], #64]") + ASM_PREFETCH("[%[pfp], #128]") + ASM_PREFETCH("[%[pfp], #192]") + ASM_PREFETCH("[%[pfp], #256]") + ASM_PREFETCH("[%[pfp], #320]") + : + : [pfp] "r"(pfp) + : "memory"); +} + +template +static inline void prefetch_5x(const T *pfp) +{ + __asm __volatile( + ASM_PREFETCH("[%[pfp]]") + ASM_PREFETCH("[%[pfp], #64]") + ASM_PREFETCH("[%[pfp], #128]") + ASM_PREFETCH("[%[pfp], #192]") + ASM_PREFETCH("[%[pfp], #256]") + : + : [pfp] "r"(pfp) + : "memory"); +} + +template +static inline void prefetch_4x(const T *pfp) +{ + __asm __volatile( + ASM_PREFETCH("[%[pfp]]") + ASM_PREFETCH("[%[pfp], #64]") + ASM_PREFETCH("[%[pfp], #128]") + ASM_PREFETCH("[%[pfp], #192]") + : + : [pfp] "r"(pfp) + : "memory"); +} + +template +static inline void prefetch_3x(const T *pfp) +{ + __asm __volatile( + ASM_PREFETCH("[%[pfp]]") + ASM_PREFETCH("[%[pfp], #64]") + ASM_PREFETCH("[%[pfp], #128]") + : + : [pfp] "r"(pfp) + : "memory"); +} + +template +static inline void prefetch_2x(const T *pfp) +{ + __asm __volatile( + ASM_PREFETCH("[%[pfp]]") + ASM_PREFETCH("[%[pfp], #64]") + : + : [pfp] "r"(pfp) + : "memory"); +} + +template +static inline void prefetch_1x(const T *pfp) +{ + __asm __volatile( + ASM_PREFETCH("[%[pfp]]") + : + : [pfp] "r"(pfp) + : "memory"); +} diff --git a/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp b/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp new file mode 100644 index 0000000000..dd74744ebc --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp @@ -0,0 +1,379 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#include +#include + +#ifndef NO_MULTI_THREADING +#include +#include + +#define USE_SEMAPHORE + +#ifdef USE_SEMAPHORE +#include +#endif + +#endif + +namespace arm_gemm +{ +#ifndef NO_MULTI_THREADING +enum class BufferStatus +{ + IDLE, + POPULATING, + BUSY +}; + +class Buffer +{ +private: + const int _maxusers; // Maximum permissible threads. + void *const _storage; // Storage for buffer content. + + int _numusers; // Actual number of threads (might be lower). + + volatile BufferStatus _status = BufferStatus::IDLE; // Status + std::atomic_int _users = {}; // How many users are still using the buffer. + volatile int _index = 0; // Which block of data currently resides in the buffer. + + std::mutex _lock = {}; +#ifdef USE_SEMAPHORE + std::condition_variable _cv = {}; +#endif + + template + void populate_buffer(T func) + { + func(_storage); + + /* Now mark it as ready. */ +#ifdef USE_SEMAPHORE + { + std::unique_lock ul(_lock); + _status = BufferStatus::BUSY; + _cv.notify_all(); + } +#else + _status = BufferStatus::BUSY; +#endif + } + +public: + Buffer(Buffer &) = delete; + Buffer &operator=(Buffer &) = delete; + + Buffer(void *storage, int maxusers) + : _maxusers(maxusers), _storage(storage), _numusers(maxusers) + { + _status = BufferStatus::IDLE; + } + + /* Try and populate the given index. + * Wait if the buffer is busy with previous index, then: + * + * If the buffer is idle, grab it and populate it. + * If it's already being populated by another thread or is ready, return. + */ + template + void try_populate(const int index, T func) + { + for(;;) + { +#ifdef USE_SEMAPHORE + /* If it's busy with a previous index, wait on the semaphore. */ + if((_status == BufferStatus::BUSY) && (_index != index)) + { + std::unique_lock ul(_lock); + + if((_status == BufferStatus::BUSY) && (_index != index)) + { + _cv.wait(ul); + } + } +#endif + /* Return if another thread is populating it already. */ + if((_index == index) && ((_status == BufferStatus::POPULATING) || (_status == BufferStatus::BUSY))) + { + return; + } + + if(_status == BufferStatus::IDLE) + { + std::lock_guard guard(_lock); + + /* If the buffer is still idle, we can grab it and populate it. */ + if(_status == BufferStatus::IDLE) + { + _status = BufferStatus::POPULATING; + _index = index; + _users = _numusers; + break; + } + } + } + + /* If we get here, fill in the buffer. 
*/ + populate_buffer(func); + } + + template + void *get(const int index, T func) + { + // Loop until we achieve something. + for(;;) + { + // If the index is correct and the buffer status is busy then we can + // just return the content. No locking is needed here as the index + // cannot change (and status cannot change from BUSY) until all + // users have finished. + if((_index == index) && (_status == BufferStatus::BUSY)) + { + return _storage; + } +#ifdef USE_SEMAPHORE + if(((_status == BufferStatus::BUSY) && (_index != index)) || (_status == BufferStatus::POPULATING)) + { + std::unique_lock ul(_lock); + + if(((_status == BufferStatus::BUSY) && (_index != index)) || (_status == BufferStatus::POPULATING)) + { + _cv.wait(ul); + } + } +#endif + + // If it's idle, we need to populate it. The IDLE->POPULATING + // transition requires the lock. + if(_status == BufferStatus::IDLE) + { + std::lock_guard guard(_lock); + + /* If it's still idle, grab it. Otherwise drop through and + * we'll do something else next time through the loop. */ + if(_status == BufferStatus::IDLE) + { + _status = BufferStatus::POPULATING; + _index = index; + _users = _numusers; + break; + } + } + } + + /* If we get here we need to populate the buffer. */ + populate_buffer(func); + + return _storage; + } + + /* Threads call this when they have finished processing a buffer. We + * simply (atomically) decrement the user count, and if it's hit zero we + * flag the buffer as idle. + */ + void release(void) + { + if(--_users == 0) + { +#ifdef USE_SEMAPHORE + std::unique_lock ul(_lock); + _status = BufferStatus::IDLE; + /* We notify all waiters as we expect one to do the populating + * and any others to go and process and earlier block. */ + _cv.notify_all(); +#else + _status = BufferStatus::IDLE; +#endif + } + } + + /* This is called to change the number of users. */ + void set_numusers(int numusers) + { + _numusers = std::min(numusers, _maxusers); + } +}; + +class BufferManager +{ +private: + /* This has to be a vector of Buffer *, because a Buffer cannot be moved + * or copied due to atomic members. */ + std::vector _buffers = {}; + const int _maxthreads; + void *const _storage; + +public: + BufferManager(BufferManager &) = delete; + BufferManager &operator=(BufferManager &) = delete; + + // Say how much storage is needed. + static inline size_t get_storage_requirement(const int maxthreads, const size_t buffersize) + { + return buffersize * ((maxthreads == 1) ? 1 : 3); + } + + BufferManager(const int maxthreads, const size_t buffersize, void *storage) + : _maxthreads(maxthreads), _storage(storage) + { + const int numbuffers = (maxthreads == 1) ? 1 : 3; + + /* We don't need any Buffer objects in single thread mode. */ + if(_maxthreads == 1) + { + return; + } + + /* Use intptr_t to avoid performing arithmetic on a void * */ + intptr_t storage_int = reinterpret_cast(_storage); + + for(int i = 0; i < numbuffers; i++) + { + _buffers.push_back(new Buffer(reinterpret_cast(storage_int), _maxthreads)); + storage_int += buffersize; + } + } + + ~BufferManager() + { + while(_buffers.size()) + { + delete _buffers.back(); + _buffers.pop_back(); + } + } + + template + void *get(const int index, T func) + { + /* In single thread mode, we just directly call the populating + * function on the (single) buffer, otherwise forward to the + * relevant Buffer. 
*/ + if(_maxthreads == 1) + { + func(_storage); + return _storage; + } + else + { + return _buffers[index % _buffers.size()]->get(index, func); + } + } + + template + void try_populate(const int index, T func) + { + /* No need for this in single thread mode. */ + if(_maxthreads == 1) + { + return; + } + + _buffers[index % _buffers.size()]->try_populate(index, func); + } + + void release(const int index) + { + /* No need for this in single thread mode. */ + if(_maxthreads == 1) + { + return; + } + + _buffers[index % _buffers.size()]->release(); + } + + void set_nthreads(int threads) + { + if(_maxthreads == 1) + { + return; + } + + for(unsigned int i = 0; i < _buffers.size(); i++) + { + _buffers[i]->set_numusers(threads); + } + } +}; + +#else + +/* Trivial implementation if threading is disabled at compile time. + * + * Here, we only need storage for a single buffer. The 'get' method needs + * to call the supplied function to populate the buffer and then return it. + * All the other methods do nothing. + */ + +class BufferManager +{ +private: + void *const _storage; + +public: + BufferManager(BufferManager &) = delete; + BufferManager &operator=(BufferManager &) = delete; + + BufferManager(const int maxthreads, const size_t buffersize, void *storage) + : _storage(storage) + { + } + + ~BufferManager() + { + } + + // Say how much storage is needed. + static inline size_t get_storage_requirement(const int maxthreads, const size_t buffersize) + { + return buffersize; + } + + template + void try_populate(const int index, T func) + { + } + + void release(const int index) + { + } + + template + void *get(const int index, T func) + { + func(_storage); + return _storage; + } + + void set_nthreads(int) + { + } +}; + +#endif + +} // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp new file mode 100644 index 0000000000..b9729d4c5c --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC + +#include "arm_gemm.hpp" + +#include "gemm_common.hpp" +#include "gemm_interleaved.hpp" + +#include "kernels/a32_sgemm_8x6.hpp" +#include "kernels/a64_hgemm_24x8.hpp" +#include "kernels/a64_sgemm_12x8.hpp" + +namespace arm_gemm +{ +template <> +UniqueGemmCommon<__fp16, __fp16> gemm(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K, + const bool trA, const bool trB, const __fp16 alpha, const __fp16 beta, + const int maxthreads, const bool pretransposed_hint) +{ +#ifdef __aarch64__ + /* If FP16 is supported, use it */ + if(ci.has_fp16()) + { + return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint)); + } + + /* Fallback to using the blocked SGEMM kernel. */ + return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint)); +#else + /* For AArch32, only support the SGEMM route. */ + return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint)); +#endif +} + +// Instantiate static class members +#ifdef __aarch64__ +const int hgemm_24x8::out_width; +const int hgemm_24x8::out_height; +#endif + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_FP16_SCALAR_ARITHMETIC diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp new file mode 100644 index 0000000000..1baa21fd1b --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_gemm.hpp" +#include "gemm_common.hpp" +#include "gemm_interleaved.hpp" +#include "gemm_native.hpp" +#include "gemv_native_transposed.hpp" +#include "gemv_pretransposed.hpp" + +#include "kernels/a32_sgemm_8x6.hpp" +#include "kernels/a64_sgemm_12x8.hpp" +#include "kernels/a64_sgemm_native_16x4.hpp" +#include "kernels/a64_sgemv_pretransposed.hpp" +#include "kernels/a64_sgemv_trans.hpp" + +namespace arm_gemm +{ +template <> +UniqueGemmCommon gemm(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K, + const bool trA, const bool trB, const float alpha, const float beta, + const int maxthreads, const bool pretransposed_hint) +{ +#ifdef __aarch64__ + /* Cases in priority order */ + /* GemvPretransposed: requires M=1, alpha=1, and transposed hint set */ + if(M == 1 && alpha == 1.0f && pretransposed_hint) + { + return UniqueGemmCommon(new GemvPretransposed(&ci, N, K, trB, beta)); + } + + /* GemvNativeTransposed: requires M=1, no trA or trB, doesn't handle beta */ + if(M == 1 && beta == 1.0f && !trA && !trB) + { + return UniqueGemmCommon(new GemvNativeTransposed(&ci, N, K, alpha)); + } + + /* Native GEMM: requires M to be a multiple of 4, K a multiple of 4, N a + * multiple of 16, doesn't handle alpha and only makes sense for small + * sizes. */ + if(N <= 128 && K <= 128 && ((M % 4) == 0) && ((K % 4) == 0) && ((N % 16) == 0) && alpha == 1.0f) + { + return UniqueGemmCommon(new GemmNative(&ci, M, N, K, beta)); + } + + /* Blocked GEMM, handles all cases. */ + return UniqueGemmCommon(new GemmInterleaved(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint)); +#else + return UniqueGemmCommon(new GemmInterleaved(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint)); +#endif +} + +// Instantiate static class variables. +#ifdef __aarch64__ +const int sgemm_12x8::out_width; +const int sgemm_12x8::out_height; + +const int sgemm_native_16x4::out_width; +const int sgemm_native_16x4::out_height; +#else +const int sgemm_8x6::out_width; +const int sgemm_8x6::out_height; +#endif + +} // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp new file mode 100644 index 0000000000..344bfed12b --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "gemm_common.hpp" +#include "gemm_interleaved.hpp" + +#include "kernels/a64_gemm_s16_12x8.hpp" + +namespace arm_gemm +{ +template <> +UniqueGemmCommon gemm(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K, + const bool trA, const bool trB, const int32_t alpha, const int32_t beta, + const int maxthreads, const bool pretransposed_hint) +{ + return UniqueGemmCommon(new GemmInterleaved(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint)); +} + +// Instantiate static class members +const int gemm_s16_12x8::out_width; +const int gemm_s16_12x8::out_height; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp new file mode 100644 index 0000000000..856d407cfa --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "gemm_common.hpp" +#include "gemm_interleaved.hpp" + +#include "kernels/a64_gemm_s16_12x8.hpp" +#include "kernels/a64_gemm_s8_12x8.hpp" +#include "kernels/a64_gemm_s8_4x4.hpp" + +namespace arm_gemm +{ +template <> +UniqueGemmCommon gemm(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K, + const bool trA, const bool trB, const int32_t alpha, const int32_t beta, + const int maxthreads, const bool pretransposed_hint) +{ + if(ci.has_dotprod()) + { + // Dot product supporting CPUs. This family has a special version for A55r1. + return UniqueGemmCommon(new GemmInterleaved(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint)); + } + + return UniqueGemmCommon(new GemmInterleaved(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint)); + + // TODO: There's a better approach for A53, but it doesn't work + // well on heterogeneous systems as the required data formats + // are different. 
Figure out how to enable this: + // gemm = new GemmInterleaved(ci, M, N, K, trA, trB); +} + +// Instantiate static class members +const int gemm_s8_12x8::out_width; +const int gemm_s8_12x8::out_height; +const int gemm_s8_4x4::out_width; +const int gemm_s8_4x4::out_height; + +} // namespace arm_gemm + +#endif // aarch64 diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp new file mode 100644 index 0000000000..27e4e8d411 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp @@ -0,0 +1,535 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#include +#include + +#include + +#include "arm_gemm.hpp" +#include "utils.hpp" + +#include "buffer_manager.hpp" +#include "mergeresults.hpp" +#include "profiler.hpp" +#include "transform.hpp" + +// Some macros used to decide how much working space to allocate. +// Round allocations up to the next cache line. +#define ALLOC_ROUND 64 +#define ROUND_UP(x) ((((x) + ALLOC_ROUND - 1) / ALLOC_ROUND) * ALLOC_ROUND) + +// Implementation of the GemmCommon abstract class. +// +// This implementation interleaves the source matrices in blocks - good for +// larger matrices. +namespace arm_gemm +{ +template +class GemmInterleaved : public GemmCommon +{ + typedef typename strategy::operand_type Toi; + typedef typename strategy::result_type Tri; + + /* const properties set by constructor */ + const CPUInfo *const _ci; + + const unsigned int _Msize; + const unsigned int _Nsize; + const unsigned int _Ksize; + + const bool _trA; + const bool _trB; + + const Tr _alpha; + const Tr _beta; + + const unsigned int _maxthreads; + const bool _pretransposed; + + /* Blocking info */ + unsigned int _k_block = 0; + unsigned int _x_block = 0; + unsigned int _Mround = 0; + + /* Working space, pretransposed buffer, buffer manager */ + const Toi *_B_transposed = nullptr; + BufferManager *_bm = nullptr; + void *_working_space = nullptr; + + /* We will need to walk through the blocks of B in a few contexts, so + * factor that out. */ + class blockwalker + { + private: + /* Loop parameters, we only block up N and K so don't worry about M. */ + const unsigned int _Nsize, _Ksize, _x_block, _k_block; + + /* K and X parameters for current iteration. 
*/ + unsigned int _k0 = 0, _x0 = 0; + + unsigned int _index = 0; + bool _done = false; + bool _newkblock = true; + + public: + blockwalker(const unsigned int K, const unsigned int k_block, const unsigned int N, const unsigned int x_block) + : _Nsize(N), _Ksize(K), _x_block(x_block), _k_block(k_block) + { + } + + unsigned int xmax() + { + return std::min(_x0 + _x_block, _Nsize); + } + + unsigned int kmax() + { + return std::min(_k0 + _k_block, _Ksize); + } + + /* Advance to the next block, return false at the end. */ + bool advance(void) + { + if(_done) + { + return false; + } + + _newkblock = false; + _x0 += _x_block; + if(_x0 >= _Nsize) + { + _x0 = 0; + _k0 += _k_block; + if(_k0 >= _Ksize) + { + _done = true; + return false; + } + _newkblock = true; + } + _index++; + + return true; + } + + unsigned int k0(void) + { + return _k0; + } + unsigned int x0(void) + { + return _x0; + } + unsigned int index(void) + { + return _index; + } + bool done(void) + { + return _done; + } + bool newkblock(void) + { + return _newkblock; + } + }; + + // A working size: One of these needed, regardless of thread count. Divided according to window. + size_t get_a_working_size() const + { + return ROUND_UP(sizeof(Toi) * _k_block * _Mround); + } + + // B working size: 0, 1 or 3 of these needed depending on pretransposed and threading settings. + size_t get_b_working_size() const + { + return ROUND_UP(sizeof(Toi) * _x_block * _k_block); + } + + // C working size: One needed per thread. + size_t get_c_working_size() const + { + return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height); + } + + // Internal execute function. + // This supports both the "pretransposed" and "standard" interfaces via the template parameter. + template + void execute_internal(unsigned int start, unsigned int end, int threadid) + { + profiler prof; + strategy strat(_ci); + + blockwalker current(_Ksize, _k_block, _Nsize, _x_block); + blockwalker next = current; + + /* Compute the M values to operate on */ + unsigned int m_0 = start * strategy::out_height; + unsigned int m_max = std::min(end * strategy::out_height, _Msize); + + /* Make sure we've been set up correctly. */ + if(pretransposed) + { + assert(_B_transposed); + } + else + { + assert(_bm); + } + + assert(_working_space); + int8_t *working_space_bytes = reinterpret_cast(_working_space); + + // Private buffers. Treat working_space as an array of C buffers (one per thread) first, followed by the (window-divided) A buffer. + Toi *const a_panel = reinterpret_cast(working_space_bytes + (_maxthreads * get_c_working_size()) + (m_0 * _k_block * sizeof(Toi))); + Tri *const c_panel = reinterpret_cast(working_space_bytes + (threadid * get_c_working_size())); + + // Shared buffers - these come either from BufferManager or _B_transposed. + const Toi *b_panel; + + if(pretransposed) + { + b_panel = _B_transposed; + } + + //printf("Starting GEMM loop, x_block=%d, k_block=%d\n", _x_block, _k_block); + + // newkblock() is always true on the first iteration, so this will be set properly on the first loop. 
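+ // [Editor's note: illustrative comment, not part of the original patch.]
+ // kern_k is the K extent the kernel is actually asked to process: the
+ // current K block rounded up to a whole multiple of strategy::k_unroll.
+ // For example, with a hypothetical K block of 70 and k_unroll == 4:
+ //
+ //   kern_k = iceildiv(70, 4) * 4;   // 18 * 4 == 72
+ //
+ // The interleaved panels prepared by the Transform calls are laid out for
+ // this rounded-up length, so the kernel can step through them in whole
+ // k_unroll units.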
+ int kern_k = 0; + + for(; !current.done(); current.advance()) + { + if(current.newkblock()) + { + prof(PROFILE_PREPA, ((m_max - m_0) * (current.kmax() - current.k0()) * sizeof(Toi)), [&](void) + { + if(_trA ^ strategy::A_transpose) + { + Transform(a_panel, this->_Aptr, this->_lda, m_0, m_max, current.k0(), current.kmax()); + } + else + { + Transform(a_panel, this->_Aptr, this->_lda, m_0, m_max, current.k0(), current.kmax()); + } + }); + + // Figure out how many "K" the kernel will actually process. + kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll); + kern_k *= strat.k_unroll; + } + + int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width); + + if(!pretransposed) + { + /* Look ahead to the next block and populate it if necessary. + * This avoids the populate operation becoming a bottleneck, and + * helps keep the threads synchronized (the first thread to get + * here will populate while the rest will advance). + * + * If we are running single threaded, bm->try_populate() will do + * nothing. + */ + if(next.advance()) + { + _bm->try_populate(next.index(), [&](void *buffer) + { + prof(PROFILE_PREPB, (next.xmax() - next.x0()) * (next.kmax() - next.k0()) * sizeof(Toi), [&](void) + { + Toi *b_panel = reinterpret_cast(buffer); + if(_trB ^ strategy::B_transpose) + { + Transform(b_panel, this->_Bptr, this->_ldb, next.x0(), next.xmax(), next.k0(), next.kmax()); + } + else + { + Transform(b_panel, this->_Bptr, this->_ldb, next.x0(), next.xmax(), next.k0(), next.kmax()); + } + }); + }); + } + + /* Get the buffer for this iteration from the BufferManager. */ + b_panel = reinterpret_cast(_bm->get(current.index(), [&](void *bpv) + { + prof(PROFILE_PREPB, (current.xmax() - current.x0()) * (current.kmax() - current.k0()) * sizeof(Toi), [&](void) + { + Toi *b_panel = reinterpret_cast(bpv); + if(_trB ^ strategy::B_transpose) + { + Transform(b_panel, this->_Bptr, this->_ldb, current.x0(), current.xmax(), current.k0(), current.kmax()); + } + else + { + Transform(b_panel, this->_Bptr, this->_ldb, current.x0(), current.xmax(), current.k0(), current.kmax()); + } + }); + })); + } + + /* Do the actual work. */ + for(unsigned int y = m_0; y < m_max; y += strategy::out_height) + { + unsigned int ymax = std::min(_Msize, y + strategy::out_height); + + prof(PROFILE_KERNEL, (strategy::out_height * bblocks * strategy::out_width * kern_k), [&](void) + { + strat.kernel(a_panel + ((y - m_0) * kern_k), b_panel, c_panel, 1, bblocks, kern_k); + }); + prof(PROFILE_MERGE, (strategy::out_height * bblocks * strategy::out_width * sizeof(Tr)), [&](void) + { + MergeResults(this->_Cptr, c_panel, this->_ldc, y, ymax, + current.x0(), current.xmax(), _alpha, (current.k0() == 0 ? 
_beta : static_cast(1))); + }); + } + + if(pretransposed) + { + b_panel += (bblocks * strat.out_width * kern_k); + } + else + { + _bm->release(current.index()); + } + } + } + +public: + GemmInterleaved(GemmInterleaved &) = delete; + GemmInterleaved &operator=(GemmInterleaved &) = delete; + + /* Constructor */ + GemmInterleaved(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K, + const bool trA, const bool trB, const Tr alpha, const Tr beta, const int maxthreads, + const bool pretransposed) + : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _trA(trA), _trB(trB), _alpha(alpha), _beta(beta), _maxthreads(maxthreads), _pretransposed(pretransposed) + { + const unsigned int L1_size = ci->get_L1_cache_size(); + const unsigned int L2_size = ci->get_L2_cache_size(); + + assert(maxthreads > 0); + + // Work out blocking parameters + + // k_block: Find out how much of the larger array can be loaded into half the cache. + // This should account for associative caches. + _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width, strategy::out_height))); + + // Needs to be (at least a single) multiple of the K unroll level. + _k_block /= strategy::k_unroll; + _k_block = std::max(_k_block, 1U) * strategy::k_unroll; + + // Now tune to presented problem size; this is how many blocks we need. + int num_k_blocks = iceildiv(K, _k_block); + + // So divide the space equally into that many blocks. + _k_block = iceildiv(K, num_k_blocks); + + // And round UP to the K unroll level required. + _k_block = iceildiv(_k_block, strategy::k_unroll); + _k_block *= strategy::k_unroll; + + // x_block: Work out how many rows (of length k_block) will fit in the L2 + // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents. + _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width + strategy::out_height))) / (sizeof(Toi) * _k_block); + + // Needs to be (at least a single) multiple of the kernel output width. + _x_block /= strategy::out_width; + _x_block = std::max(_x_block, 1U) * strategy::out_width; + + // And tune to the presented problem size. + int num_x_blocks = iceildiv(N, _x_block); + _x_block = iceildiv(N, num_x_blocks); + + _x_block = iceildiv(_x_block, strategy::out_width); + _x_block *= strategy::out_width; + + // Work out the rounded size of M - needed for some buffers. + _Mround = iceildiv(M, strategy::out_height); + _Mround *= strategy::out_height; + } + + // Interface implementation - Compulsory functions + + // Window size: Only the last thread should do a ragged block, so dole out work in units of out_height */ + unsigned int get_window_size() const override + { + // _Mround is a multiple of out_height by definition. + return _Mround / strategy::out_height; + } + + // set_nthreads: pass on to buffer manager to avoid it waiting for non-existant threads. + void set_nthreads(int nthreads) override + { + if(_bm) + { + _bm->set_nthreads(nthreads); + } + } + + // Execute + void execute(unsigned int start, unsigned int end, int threadid) override + { + if(_pretransposed) + { + execute_internal(start, end, threadid); + } + else + { + execute_internal(start, end, threadid); + } + } + + // Interface implementation - working space + size_t get_working_size() const override + { + // In all cases, we need one A buffer plus a C buffer per thread. + size_t size = get_a_working_size() + (get_c_working_size() * _maxthreads); + + // For pretransposed case, there is no working space needed for B. 
+ // Otherwise, we need a BufferManager. + if(!_pretransposed) + { + size += BufferManager::get_storage_requirement(_maxthreads, get_b_working_size()); + } + + size += 64; // Add on a cache line extra for alignment. + + return size; + } + + void set_working_space(void *working_space) override + { + // Make sure everything ends up cache line aligned + int8_t *working_space_bytes = reinterpret_cast(working_space); + intptr_t working_space_int = reinterpret_cast(working_space); + + size_t diff = 0; + + if(working_space_int & 0x3F) + { + diff = 0x40 - (working_space_int & 0x3F); + } + + working_space_bytes += diff; + + if(_pretransposed) + { + // Pretransposed case: just set internal pointer to parameter value. + _working_space = reinterpret_cast(working_space_bytes); + } + else + { + // Otherwise, use the first part of the working space for the buffer manager. + // It's legal to call this again so don't leak a buffer manager if it already existed. + delete _bm; + + _bm = new BufferManager(_maxthreads, get_b_working_size(), reinterpret_cast(working_space_bytes)); + + working_space_bytes += BufferManager::get_storage_requirement(_maxthreads, get_b_working_size()); + + _working_space = reinterpret_cast(working_space_bytes); + } + } + + // Interface implementation - pretransposed + bool B_is_pretransposed() const override + { + return _pretransposed; + } + + bool B_pretranspose_required() const override + { + return _pretransposed && (_B_transposed == nullptr); + } + + // TODO: this could almost certainly be considerably simpler. + size_t get_B_pretransposed_array_size() const override + { + size_t total = 0; + blockwalker current(_Ksize, _k_block, _Nsize, _x_block); + + do + { + /* Figure out the size of each block. */ + size_t x_size = (current.xmax() - current.x0()); + size_t k_size = (current.kmax() - current.k0()); + + /* Round sizes up as needed. */ + x_size = iceildiv(x_size, strategy::out_width); + x_size *= strategy::out_width; + + k_size = iceildiv(k_size, strategy::k_unroll); + k_size *= strategy::k_unroll; + + total += x_size * k_size * sizeof(Toi); + } + while(current.advance()); + + return total; + } + + void pretranspose_B_array(void *in_buffer, const To *B, const int ldb) override + { + blockwalker current(_Ksize, _k_block, _Nsize, _x_block); + Toi *buffer = reinterpret_cast(in_buffer); + _B_transposed = buffer; + + do + { + /* Figure out the size of each block. */ + size_t x_size = (current.xmax() - current.x0()); + size_t k_size = (current.kmax() - current.k0()); + + /* Round sizes up as needed. */ + x_size = iceildiv(x_size, strategy::out_width); + x_size *= strategy::out_width; + + k_size = iceildiv(k_size, strategy::k_unroll); + k_size *= strategy::k_unroll; + + if(_trB ^ strategy::B_transpose) + { + Transform(buffer, B, ldb, current.x0(), current.xmax(), current.k0(), current.kmax()); + } + else + { + Transform(buffer, B, ldb, current.x0(), current.xmax(), current.k0(), current.kmax()); + } + + buffer += (x_size * k_size); + } + while(current.advance()); + } + + ~GemmInterleaved() override + { + delete _bm; + } +}; + +} // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp new file mode 100644 index 0000000000..b0192793b9 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#include + +#include "arm_gemm.hpp" + +#include "mergeresults.hpp" +#include "profiler.hpp" +#include "transform.hpp" + +namespace arm_gemm +{ +// Implementation of the GemmCommon abstract class. +// +// This is implementation is for native GEMM with no transposition. +// +// By default the source data is used in-place, but if type conversion is +// needed we need to allocate working space (CURRENTLY NOT IMPLEMENTED). + +template +class GemmNative : public GemmCommon +{ + typedef typename strategy::operand_type Toi; + typedef typename strategy::result_type Tri; + + const unsigned int _Msize; + const unsigned int _Nsize; + const unsigned int _Ksize; + + Tr _beta; + + const CPUInfo *const _ci; + + unsigned int k_block = 0; + unsigned int n_block = 0; + +public: + GemmNative(GemmNative &) = delete; + GemmNative &operator=(GemmNative &) = delete; + + GemmNative(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K, const Tr beta) + : _Msize(M), _Nsize(N), _Ksize(K), _beta(beta), _ci(ci) + { + /* For now don't do any blocking. TODO: figure out if we should. */ + k_block = K; + n_block = N; + } + + // Window is number of out_height blocks + unsigned int get_window_size() const override + { + return iceildiv(_Msize, strategy::out_height); + } + + // Actually execute the GEMM. 
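+ // [Editor's note: illustrative comment, not part of the original patch.]
+ // Each unit of the window corresponds to one block of strategy::out_height
+ // output rows. A hypothetical caller could therefore split the window over
+ // T worker threads as follows (T and the split scheme are assumptions, not
+ // part of this interface):
+ //
+ //   unsigned int W = gemm.get_window_size();
+ //   for(int t = 0; t < T; t++)
+ //       gemm.execute((t * W) / T, ((t + 1) * W) / T, t);
+ //
+ // Each call then covers output rows [start * out_height, min(end * out_height, M)).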
+ void execute(unsigned int start, unsigned int end, int) override + { + profiler prof; + strategy strat(_ci); + + unsigned int M_start = start * strategy::out_height; + unsigned int M_end = std::min(end * strategy::out_height, _Msize); + + static_assert(std::is_same::value, "gemm_native: Operand types must be the same."); + static_assert(std::is_same::value, "gemm_native: Result types must be the same."); + + for(unsigned int y0 = M_start; y0 < M_end; y0 += strategy::out_height) + { + unsigned int ymax = std::min(y0 + strategy::out_height, M_end); + + prof(PROFILE_KERNEL, (ymax - y0) * _Nsize * _Ksize, [&](void) + { + strat.kernel(this->_Aptr + (y0 * this->_lda), this->_lda, this->_Bptr, this->_ldb, this->_Cptr + (y0 * this->_ldc), this->_ldc, _beta, (ymax - y0), _Nsize, _Ksize); + }); + } + } +}; + +} // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp new file mode 100644 index 0000000000..3e790e1b2a --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "gemm_common.hpp" +#include "gemm_interleaved.hpp" + +#include "kernels/a64_gemm_u16_12x8.hpp" + +namespace arm_gemm +{ +template <> +UniqueGemmCommon gemm(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K, + const bool trA, const bool trB, uint32_t alpha, uint32_t beta, + const int maxthreads, const bool pretransposed_hint) +{ + return UniqueGemmCommon(new GemmInterleaved(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint)); +} + +// Instantiate static class members +const int gemm_u16_12x8::out_width; +const int gemm_u16_12x8::out_height; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp new file mode 100644 index 0000000000..9ec479ca7c --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "gemm_common.hpp" +#include "gemm_interleaved.hpp" + +#include "kernels/a64_gemm_u8_12x8.hpp" +#include "kernels/a64_gemm_u8_4x4.hpp" + +namespace arm_gemm +{ +template <> +UniqueGemmCommon gemm(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K, + const bool trA, const bool trB, const uint32_t alpha, const uint32_t beta, + const int maxthreads, const bool pretransposed_hint) +{ + if(ci.has_dotprod()) + { + // Dot product supporting CPUs. This family has a special version for A55r1. + return UniqueGemmCommon(new GemmInterleaved(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint)); + } + + // Non dot-product code. + return UniqueGemmCommon(new GemmInterleaved(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint)); + + // TODO: There's a better approach for A53, but it doesn't work + // well on heterogeneous systems as the required data formats + // are different. Figure out how to enable this: + // gemm = new GemmInterleaved(ci, M, N, K, trA, trB); +} + +// Instantiate static class members +const int gemm_u8_12x8::out_width; +const int gemm_u8_12x8::out_height; + +const int gemm_u8_4x4::out_width; +const int gemm_u8_4x4::out_height; + +} // namespace arm_gemm + +#endif // aarch64 diff --git a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp new file mode 100644 index 0000000000..c0b886266d --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#include + +#include "arm_gemm.hpp" + +#include "mergeresults.hpp" +#include "profiler.hpp" +#include "transform.hpp" + +namespace arm_gemm +{ +// Implementation of the GemmCommon abstract class. +// +// This is implementation is for a "native" (no-transform) GEMV with a +// transposed matrix. +// +// As a native operation the source data is used in-place, so the internal +// and external operand/result types must match. +template +class GemvNativeTransposed : public GemmCommon +{ + typedef typename strategy::operand_type Toi; + typedef typename strategy::result_type Tri; + + const unsigned int _Nsize; + const unsigned int _Ksize; + + const Tr _alpha; + + const CPUInfo *const _ci; + + unsigned int m_block = 0; + unsigned int n_block = 0; + +public: + GemvNativeTransposed(GemvNativeTransposed &) = delete; + GemvNativeTransposed &operator=(GemvNativeTransposed &) = delete; + + GemvNativeTransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K, const Tr alpha) + : _Nsize(N), _Ksize(K), _alpha(alpha), _ci(ci) + { + /* For now don't do any blocking. TODO: figure out if we should. */ + m_block = K; + n_block = N; + } + + // Window is number of out_width blocks. + unsigned int get_window_size() const override + { + return iceildiv(_Nsize, strategy::out_width); + } + + // Actually execute the GEMV. + void execute(unsigned int start, unsigned int end, int) override + { + profiler prof; + strategy strat(_ci); + + unsigned int N_start = start * strategy::out_width; + unsigned int N_end = std::min(end * strategy::out_width, _Nsize); + + static_assert(std::is_same::value, "gemv_transposed: Operand types must be the same."); + static_assert(std::is_same::value, "gemv_transposed: Result types must be the same."); + + for(unsigned int m0 = 0; m0 < _Ksize; m0 += m_block) + { + unsigned int mmax = std::min(m0 + m_block, _Ksize); + + for(unsigned int n0 = N_start; n0 < N_end; n0 += n_block) + { + unsigned int nmax = std::min(n0 + n_block, N_end); + + prof(PROFILE_KERNEL, ((mmax - m0) * (nmax - n0)), [&](void) + { + strat.kernel(this->_Bptr + (m0 * this->_ldb) + n0, this->_Aptr + m0, this->_Cptr + n0, + _alpha, this->_ldb, (mmax - m0), (nmax - n0)); + }); + } + } + } +}; + +} // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp new file mode 100644 index 0000000000..0df331acb4 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#include + +#include "arm_gemm.hpp" + +#include "mergeresults.hpp" +#include "profiler.hpp" +#include "transform.hpp" + +namespace arm_gemm +{ +// Implementation of the GemmCommon abstract class. +// +// This is implementation is for GEMV with a transposed matrix. +// +// By default the source data is used in-place, but if type conversion is +// needed we need to allocate working space (CURRENTLY NOT IMPLEMENTED). + +template +class GemvPretransposed : public GemmCommon +{ + typedef typename strategy::operand_type Toi; + typedef typename strategy::result_type Tri; + + const unsigned int _Nsize; + const unsigned int _Ksize; + + const bool _trB; + + const Tr _beta; + + const CPUInfo *const _ci; + + unsigned int m_block = 0; + unsigned int n_block = 0; + + const Toi *_A_pretransposed = nullptr; + +public: + GemvPretransposed(GemvPretransposed &) = delete; + GemvPretransposed &operator=(GemvPretransposed &) = delete; + + GemvPretransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K, const bool trB, const Tr beta) + : _Nsize(N), _Ksize(K), _trB(trB), _beta(beta), _ci(ci) + { + /* For now don't do any blocking. TODO: figure out if we should. */ + m_block = K; + n_block = N; + } + + // Window is number of out_width blocks. + unsigned int get_window_size() const override + { + return iceildiv(_Nsize, strategy::out_width); + } + + // Actually execute the GEMV. 
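+ // [Editor's note: illustrative comment, not part of the original patch.]
+ // The pretransposed operand appears to be stored as panels of
+ // strategy::A_interleave columns, each panel occupying
+ // _Ksize * A_interleave contiguous elements. On this reading, the kernel
+ // call below selects its data with
+ //
+ //   _A_pretransposed + (n0 * _Ksize) + (m0 * strategy::A_interleave)
+ //
+ // i.e. skip to the panel containing column n0 (n0 advances in multiples of
+ // out_width), then skip m0 K-steps of A_interleave entries each, while
+ // (_Ksize * strategy::A_interleave) is the stride from one panel to the next.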
+ void execute(unsigned int start, unsigned int end, int) override + { + profiler prof; + strategy strat(_ci); + + unsigned int N_start = start * strategy::out_width; + unsigned int N_end = std::min(end * strategy::out_width, _Nsize); + + static_assert(std::is_same::value, "GemvPretransposed: Result types must be the same."); + + for(unsigned int m0 = 0; m0 < _Ksize; m0 += m_block) + { + unsigned int mmax = std::min(m0 + m_block, _Ksize); + + for(unsigned int n0 = N_start; n0 < N_end; n0 += n_block) + { + unsigned int nmax = std::min(n0 + n_block, N_end); + + prof(PROFILE_KERNEL, ((mmax - m0) * (nmax - n0)), [&](void) + { + /* This assumes that the underlying call was a GEMM with M=1; for the N=1 case we would have to pick up this->_Bptr below instead */ + strat.kernel(_A_pretransposed + (n0 * _Ksize) + (m0 * strategy::A_interleave), (_Ksize * strategy::A_interleave), this->_Aptr + m0, this->_Cptr + n0, _beta, (mmax - m0), (nmax - n0)); + }); + } + } + } + + /* Pretransposed interface implementation */ + bool B_is_pretransposed() const override + { + return true; + } + + bool B_pretranspose_required() const override + { + /* Transpose is required if _A_pretransposed is still nullptr */ + return (_A_pretransposed == nullptr); + } + + size_t get_B_pretransposed_array_size() const override + { + return _Ksize * iceildiv(_Nsize, strategy::A_interleave) * strategy::A_interleave * sizeof(float); + } + + void pretranspose_B_array(void *buffer, const To *B, const int ldb) override + { + Toi *A_buffer = reinterpret_cast(buffer); + + /* Reverse sense here as we are dealing with B rather than A. So if + * strategy::A_transpose is false and _trB is false, we still + * transpose. */ + if(_trB ^ strategy::A_transpose) + { + Transform(A_buffer, B, ldb, 0, _Nsize, 0, _Ksize); + } + else + { + Transform(A_buffer, B, ldb, 0, _Nsize, 0, _Ksize); + } + + _A_pretransposed = A_buffer; + } +}; + +} // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp new file mode 100644 index 0000000000..de11dc582c --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#pragma once + +#ifdef __arm__ + +namespace arm_gemm +{ +// Actual kernel implementations +void a32_sgemm_8x6(const float *, const float *, float *, int, int, int); +void a32_sgemm_8x6_a53(const float *, const float *, float *, int, int, int); +void a32_sgemm_8x6_a55r1(const float *, const float *, float *, int, int, int); + +// 8x6 SGEMM "strategy" class. +// +// This describes the characteristics of a family of kernels, in terms of +// the required interleave properties and the output block size. +// +// All kernels in the family must share these characteristics. The actual +// kernel to be used can be chosen at runtime, based on the CPU_type +// structure. +class sgemm_8x6 +{ +public: + typedef float operand_type; + typedef float result_type; + + typedef void (*kern_type)(const float *, const float *, float *, int, int, int); + + /* Describes the data layout for A input */ + static const int A_interleave = 6; + static const int A_block = 1; + static const int A_transpose = 0; + + /* Same for B input */ + static const int B_interleave = 8; + static const int B_block = 1; + static const int B_transpose = 1; + + /* Kernel blocking parameters */ + static const int out_width = 8; + static const int out_height = 6; + static const int k_unroll = 1; + + kern_type kernel = a32_sgemm_8x6; + + sgemm_8x6(const CPUInfo *ci) + { + switch(ci->get_cpu_model()) + { + case CPUModel::A53: + kernel = a32_sgemm_8x6_a53; + break; + + case CPUModel::A55r1: + kernel = a32_sgemm_8x6_a55r1; + break; + + default: + kernel = a32_sgemm_8x6; + break; + } + } +}; + +} // namespace arm_gemm +#endif // __arm__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp new file mode 100644 index 0000000000..428498f79e --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp @@ -0,0 +1,400 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __arm__ + +#include + +#include "../../asmlib.hpp" + +// Kernel implementation. +// +// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order. +// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order. +// Assume that "Cpanel" points to a chunk of C output blocks (each size +// 8x6), the chunks being arranged in a row major fashion. 
+// +// Note that the intent of this is that either ablocks or bblocks will be 1 +// - this construction allows the output loop to proceed in either order. + +namespace arm_gemm +{ +void a32_sgemm_8x6_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) +{ + const float *a_ptr = Apanel; + float *c_ptr = Cpanel; + + for(int yb = 0; yb < ablocks; yb++) + { + const float *a_ptr0 = a_ptr; + const float *b_ptr = Bpanel; + + for(int xb = 0; xb < bblocks; xb++) + { + a_ptr = a_ptr0; + int tails = (K & 3); + if(tails == 0) + { + tails = 4; + } + int k = ((K + 3) / 4) - 1; + + __asm __volatile( + "vmov.i32 q4, #0\n" + "vld1.32 {d0-d1}, [%[a_ptr] :64]\n" + "vmov.i32 q5, #0\n" + "vld1.32 {d4-d5}, [%[b_ptr] :128]\n" + "vmov.i32 q6, #0\n" + "ldr r0, [%[a_ptr], #0x10]\n" + "vmov.i32 q7, #0\n" + "ldr r1, [%[a_ptr], #0x14]\n" + "vmov.i32 q8, #0\n" ASM_PREFETCH("[%[a_ptr], #0x40]") "vmov.i32 q9, #0\n" ASM_PREFETCH("[%[b_ptr], #0x40]") "vmov.i32 q10, #0\n" ASM_PREFETCH("[%[a_ptr], #0x80]") "vmov.i32 q11, #0\n" + ASM_PREFETCH("[%[b_ptr], #0x80]") + "vmov.i32 q12, #0\n" + "vmov.i32 q13, #0\n" ASM_PREFETCH("[%[a_ptr], #0xC0]") "vmov.i32 q14, #0\n" ASM_PREFETCH("[%[b_ptr], #0XC0]") + "vmov.i32 q15, #0\n" + "cmp %[k], #0\n" + "beq 6f\n" + + "1:\n" + // Unroll 0 + "vldr d6, [%[b_ptr], #0x10]\n" + "vmov d2, r0, r1\n" + "vmla.f32 q4, q2, d0[0]\n" + "ldr r0, [%[b_ptr], #0x18]\n" + "vmla.f32 q5, q2, d0[1]\n" + "ldr r1, [%[b_ptr], #0x1C]\n" + "vmla.f32 q6, q2, d1[0]\n" + + "vldr d3, [%[a_ptr], #0x18]\n" + "vmov d7, r0, r1\n" + "vmla.f32 q7, q2, d1[1]\n" ASM_PREFETCH("[%[a_ptr], #0x100]") + "vmla.f32 q8, q2, d2[0]\n" + "vmla.f32 q9, q2, d2[1]\n" + + "vldr d4, [%[b_ptr], #0x20]\n" + "vmla.f32 q10, q3, d0[0]\n" + "ldr r0, [%[b_ptr], #0x28]\n" + "vmla.f32 q11, q3, d0[1]\n" + "ldr r1, [%[b_ptr], #0x2C]\n" + "vmla.f32 q12, q3, d1[0]\n" + + "vldr d0, [%[a_ptr], #0x20]\n" + "vmov d5, r0, r1\n" + "vmla.f32 q13, q3, d1[1]\n" + "ldr r0, [%[a_ptr], #0x28]\n" + "vmla.f32 q14, q3, d2[0]\n" + "ldr r1, [%[a_ptr], #0x2C]\n" + "vmla.f32 q15, q3, d2[1]\n" + + // Unroll 1 + "vldr d6, [%[b_ptr], #0x30]\n" + "vmov d1, r0, r1\n" + "vmla.f32 q4, q2, d3[0]\n" + "ldr r0, [%[b_ptr], #0x38]\n" + "vmla.f32 q5, q2, d3[1]\n" + "ldr r1, [%[b_ptr], #0x3C]\n" + "vmla.f32 q6, q2, d0[0]\n" + + "vldr d2, [%[a_ptr], #0x30]\n" + "vmov d7, r0, r1\n" + "vmla.f32 q7, q2, d0[1]\n" ASM_PREFETCH("[%[b_ptr], #0x100]") + "vmla.f32 q8, q2, d1[0]\n" + "vmla.f32 q9, q2, d1[1]\n" + + "vldr d4, [%[b_ptr], #0x40]\n" + "vmla.f32 q10, q3, d3[0]\n" + "ldr r0, [%[b_ptr], #0x48]\n" + "vmla.f32 q11, q3, d3[1]\n" + "ldr r1, [%[b_ptr], #0x4C]\n" + "vmla.f32 q12, q3, d0[0]\n" + + "vldr d3, [%[a_ptr], #0x38]\n" + "vmov d5, r0, r1\n" + "vmla.f32 q13, q3, d0[1]\n" + "ldr r0, [%[a_ptr], #0x40]\n" + "vmla.f32 q14, q3, d1[0]\n" + "ldr r1, [%[a_ptr], #0x44]\n" + "vmla.f32 q15, q3, d1[1]\n" + + // Unroll 2 + "vldr d6, [%[b_ptr], #0x50]\n" + "vmov d0, r0, r1\n" + "vmla.f32 q4, q2, d2[0]\n" + "ldr r0, [%[b_ptr], #0x58]\n" + "vmla.f32 q5, q2, d2[1]\n" + "ldr r1, [%[b_ptr], #0x5C]\n" + "vmla.f32 q6, q2, d3[0]\n" + + "vldr d1, [%[a_ptr], #0x48]\n" + "vmov d7, r0, r1\n" + "vmla.f32 q7, q2, d3[1]\n" ASM_PREFETCH("[%[a_ptr], #0x140]") + "vmla.f32 q8, q2, d0[0]\n" + "vmla.f32 q9, q2, d0[1]\n" + + "vldr d4, [%[b_ptr], #0x60]\n" + "vmla.f32 q10, q3, d2[0]\n" + "ldr r0, [%[b_ptr], #0x68]\n" + "vmla.f32 q11, q3, d2[1]\n" + "ldr r1, [%[b_ptr], #0x6C]\n" + "vmla.f32 q12, q3, d3[0]\n" + + "vldr d2, [%[a_ptr], #0x50]\n" + "vmov d5, r0, r1\n" + "vmla.f32 q13, q3, 
d3[1]\n" + "ldr r0, [%[a_ptr], #0x58]\n" + "vmla.f32 q14, q3, d0[0]\n" + "ldr r1, [%[a_ptr], #0x5C]\n" + "vmla.f32 q15, q3, d0[1]\n" + "add %[a_ptr], %[a_ptr], #0x60\n" + + // Unroll 3 + "vldr d6, [%[b_ptr], #0x70]\n" + "vmov d3, r0, r1\n" + "vmla.f32 q4, q2, d1[0]\n" + "ldr r0, [%[b_ptr], #0x78]\n" + "vmla.f32 q5, q2, d1[1]\n" + "ldr r1, [%[b_ptr], #0x7C]\n" + "vmla.f32 q6, q2, d2[0]\n" + "add %[b_ptr], %[b_ptr], #0x80\n" + + "vldr d0, [%[a_ptr], #0x00]\n" + "vmov d7, r0, r1\n" + "vmla.f32 q7, q2, d2[1]\n" ASM_PREFETCH("[%[b_ptr], #0xC0]") + "vmla.f32 q8, q2, d3[0]\n" + "vmla.f32 q9, q2, d3[1]\n" + + "vldr d4, [%[b_ptr], #0x00]\n" + "vmla.f32 q10, q3, d1[0]\n" + "ldr r0, [%[b_ptr], #0x08]\n" + "vmla.f32 q11, q3, d1[1]\n" + "ldr r1, [%[b_ptr], #0x0C]\n" + "vmla.f32 q12, q3, d2[0]\n" + "subs %[k], %[k], #1\n" + + "vldr d1, [%[a_ptr], #0x08]\n" + "vmov d5, r0, r1\n" + "vmla.f32 q13, q3, d2[1]\n" + "ldr r0, [%[a_ptr], #0x10]\n" + "vmla.f32 q14, q3, d3[0]\n" + "ldr r1, [%[a_ptr], #0x14]\n" + "vmla.f32 q15, q3, d3[1]\n" + "bne 1b\n" + + // "Tails" shows how many multiply blocks are needed at the + // end, must be 1-4 inclusive. Bail out to alternative tail + // immediately if it's 1. + "6:\n" + "subs %[tails], %[tails], #1\n" + "beq 3f\n" + + // Detached final iteration - for now adapt the generic + // tails rather than reimplementing for A53. + + // Unroll 0 + "vmov d2, r0, r1\n" + "add %[a_ptr], %[a_ptr], #0x18\n" + "vmla.f32 q4, q2, d0[0]\n" + "vld1.32 {d3}, [%[a_ptr] :64]!\n" + "vmla.f32 q5, q2, d0[1]\n" + "add %[b_ptr], %[b_ptr], #0x10\n" + "vmla.f32 q6, q2, d1[0]\n" + "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" + "vmla.f32 q7, q2, d1[1]\n" + "vmla.f32 q8, q2, d2[0]\n" + "subs %[tails], %[tails], #1\n" + "vmla.f32 q9, q2, d2[1]\n" + "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" + + "vmla.f32 q10, q3, d0[0]\n" + "vmla.f32 q11, q3, d0[1]\n" + "vmla.f32 q12, q3, d1[0]\n" + "vmla.f32 q13, q3, d1[1]\n" + "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n" + "vmla.f32 q14, q3, d2[0]\n" + "vmla.f32 q15, q3, d2[1]\n" + "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" + "beq 4f\n" + + // Unroll 1 + "vmla.f32 q4, q2, d3[0]\n" + "vmla.f32 q5, q2, d3[1]\n" + "subs %[tails], %[tails], #1\n" + "vmla.f32 q6, q2, d0[0]\n" + "vmla.f32 q7, q2, d0[1]\n" + "vmla.f32 q8, q2, d1[0]\n" + "vmla.f32 q9, q2, d1[1]\n" + "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" + + "vmla.f32 q10, q3, d3[0]\n" + "vmla.f32 q11, q3, d3[1]\n" + "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n" + "vmla.f32 q12, q3, d0[0]\n" + "vmla.f32 q13, q3, d0[1]\n" + "vmla.f32 q14, q3, d1[0]\n" + "vmla.f32 q15, q3, d1[1]\n" + "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" + "beq 5f\n" + + // Unroll 2 + "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n" + "vmla.f32 q4, q2, d2[0]\n" + "vmla.f32 q5, q2, d2[1]\n" + "vmla.f32 q6, q2, d3[0]\n" + "vmla.f32 q7, q2, d3[1]\n" + "vmla.f32 q8, q2, d0[0]\n" + "vmla.f32 q9, q2, d0[1]\n" + "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" + + "vmla.f32 q10, q3, d2[0]\n" + "vmla.f32 q11, q3, d2[1]\n" + "vmla.f32 q12, q3, d3[0]\n" + "vmla.f32 q13, q3, d3[1]\n" + "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n" + "vmla.f32 q14, q3, d0[0]\n" + "vmla.f32 q15, q3, d0[1]\n" + "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" + + // Unroll 3 + "vmla.f32 q4, q2, d1[0]\n" + "vmla.f32 q10, q3, d1[0]\n" + "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" + "vmla.f32 q5, q2, d1[1]\n" + "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" + "vmla.f32 q11, q3, d1[1]\n" + "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" + "vmla.f32 q6, q2, d2[0]\n" + "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" + "vmla.f32 q12, q3, d2[0]\n" + "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" + "vmla.f32 q7, 
q2, d2[1]\n" + "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" + "vmla.f32 q13, q3, d2[1]\n" + "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" + "vmla.f32 q8, q2, d3[0]\n" + "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" + "vmla.f32 q14, q3, d3[0]\n" + "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" + "vmla.f32 q9, q2, d3[1]\n" + "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" + "vmla.f32 q15, q3, d3[1]\n" + "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" + "b 2f\n" + + // tails==1 final tail + "3:\n" + "vmov d2, r0, r1\n" + "add %[b_ptr], %[b_ptr], #0x10\n" + "vmla.f32 q4, q2, d0[0]\n" + "add %[a_ptr], %[a_ptr], #0x18\n" + "vmla.f32 q5, q2, d0[1]\n" + "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" + "vmla.f32 q6, q2, d1[0]\n" + "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" + "vmla.f32 q10, q3, d0[0]\n" + "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" + "vmla.f32 q11, q3, d0[1]\n" + "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" + "vmla.f32 q12, q3, d1[0]\n" + "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" + "vmla.f32 q7, q2, d1[1]\n" + "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" + "vmla.f32 q13, q3, d1[1]\n" + "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" + "vmla.f32 q8, q2, d2[0]\n" + "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" + "vmla.f32 q14, q3, d2[0]\n" + "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" + "vmla.f32 q9, q2, d2[1]\n" + "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" + "vmla.f32 q15, q3, d2[1]\n" + "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" + "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" + "b 2f\n" + + // tails==2 final tail + "4:\n" + "vmla.f32 q4, q2, d3[0]\n" + "vmla.f32 q10, q3, d3[0]\n" + "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" + "vmla.f32 q5, q2, d3[1]\n" + "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" + "vmla.f32 q11, q3, d3[1]\n" + "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" + "vmla.f32 q6, q2, d0[0]\n" + "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" + "vmla.f32 q12, q3, d0[0]\n" + "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" + "vmla.f32 q7, q2, d0[1]\n" + "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" + "vmla.f32 q13, q3, d0[1]\n" + "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" + "vmla.f32 q8, q2, d1[0]\n" + "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" + "vmla.f32 q14, q3, d1[0]\n" + "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" + "vmla.f32 q9, q2, d1[1]\n" + "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" + "vmla.f32 q15, q3, d1[1]\n" + "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" + "b 2f\n" + + // tails==3 final tail + "5:\n" + "vmla.f32 q4, q2, d2[0]\n" + "vld1.32 {d0}, [%[a_ptr] :64]!\n" + "vmla.f32 q5, q2, d2[1]\n" + "vmla.f32 q6, q2, d3[0]\n" + "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" + "vmla.f32 q10, q3, d2[0]\n" + "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" + "vmla.f32 q11, q3, d2[1]\n" + "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" + "vmla.f32 q12, q3, d3[0]\n" + "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" + "vmla.f32 q7, q2, d3[1]\n" + "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" + "vmla.f32 q13, q3, d3[1]\n" + "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" + "vmla.f32 q8, q2, d0[0]\n" + "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" + "vmla.f32 q14, q3, d0[0]\n" + "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" + "vmla.f32 q9, q2, d0[1]\n" + "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" + "vmla.f32 q15, q3, d0[1]\n" + "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" + "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" + + "2:\n" + "vst1.32 {d30-d31}, [%[c_ptr] :128]!\n" + : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k), [tails] "+r"(tails) + : + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0", "r1"); + } + } +} + +} // namespace arm_gemm + +#endif diff --git 
a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp new file mode 100644 index 0000000000..4cfb72a455 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp @@ -0,0 +1,398 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __arm__ + +#include + +#include "../../asmlib.hpp" + +// Kernel implementation. +// +// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order. +// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order. +// Assume that "Cpanel" points to a chunk of C output blocks (each size +// 8x6), the chunks being arranged in a row major fashion. +// +// Note that the intent of this is that either ablocks or bblocks will be 1 +// - this construction allows the output loop to proceed in either order. + +namespace arm_gemm +{ +void a32_sgemm_8x6_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) +{ + const float *a_ptr = Apanel; + float *c_ptr = Cpanel; + + /* Work out starting values for "k" and "tails" in the inner loop. 
*/ + int tails_initial = (K & 3); + if(tails_initial == 0) + { + tails_initial = 4; + } + + int k_initial = ((K + 3) / 4) - 1; + + for(int yb = 0; yb < ablocks; yb++) + { + const float *a_ptr0 = a_ptr; + const float *b_ptr = Bpanel; + + for(int xb = 0; xb < bblocks; xb++) + { + int tails = tails_initial; + int k = k_initial; + + a_ptr = a_ptr0; + + __asm __volatile( + "vldr d0, [%[a_ptr]]\n" + "vmov.i32 q4, #0\n" + "vldr d1, [%[a_ptr], #0x08]\n" + "vmov.i32 q5, #0\n" + "vldr d4, [%[b_ptr]]\n" + "vmov.i32 q6, #0\n" + "vldr d5, [%[b_ptr], #0x08]\n" + "vmov.i32 q7, #0\n" + "vldr d2, [%[a_ptr], #0x10]\n" + "vmov.i32 q8, #0\n" ASM_PREFETCH("[%[b_ptr], #0x40]") "vmov.i32 q9, #0\n" ASM_PREFETCH("[%[a_ptr], #0x40]") "vmov.i32 q10, #0\n" ASM_PREFETCH("[%[b_ptr], #0x80]") "vmov.i32 q11, #0\n" + ASM_PREFETCH("[%[a_ptr], #0x80]") "vmov.i32 q12, #0\n" ASM_PREFETCH("[%[b_ptr], #0XC0]") "vmov.i32 q13, #0\n" ASM_PREFETCH("[%[a_ptr], #0xC0]") "vmov.i32 q14, #0\n" + ASM_PREFETCH("[%[b_ptr], #0x100]") "vmov.i32 q15, #0\n" ASM_PREFETCH("[%[a_ptr], #0x100]") "cmp %[k], #0\n" ASM_PREFETCH("[%[b_ptr], #0x140]") "beq 6f\n" + ASM_PREFETCH("[%[b_ptr], #0x180]") + + "1:\n" + // Unroll 0 + "vmla.f32 q4, q2, d0[0]\n" + "vldr d6, [%[b_ptr], #0x10]\n" + "vmla.f32 q5, q2, d0[1]\n" + "vldr d7, [%[b_ptr], #0x18]\n" + "vmla.f32 q6, q2, d1[0]\n" + "vldr d3, [%[a_ptr], #0x18]\n" + "vmla.f32 q7, q2, d1[1]\n" ASM_PREFETCH("[%[a_ptr], #0x140]") + "vmla.f32 q8, q2, d2[0]\n" + "subs %[k], %[k], #1\n" + "vmla.f32 q9, q2, d2[1]\n" + "vldr d4, [%[b_ptr], #0x20]\n" + "vmla.f32 q10, q3, d0[0]\n" + "vldr d5, [%[b_ptr], #0x28]\n" + "vmla.f32 q11, q3, d0[1]\n" + "vldr d0, [%[a_ptr], #0x20]\n" + "vmla.f32 q12, q3, d1[0]\n" + + "vmla.f32 q13, q3, d1[1]\n" + "vldr d1, [%[a_ptr], #0x28]\n" + "vmla.f32 q14, q3, d2[0]\n" + + "vmla.f32 q15, q3, d2[1]\n" + "vldr d6, [%[b_ptr], #0x30]\n" + + // Unroll 1 + "vmla.f32 q4, q2, d3[0]\n" + "vldr d7, [%[b_ptr], #0x38]\n" + "vmla.f32 q5, q2, d3[1]\n" + "vldr d2, [%[a_ptr], #0x30]\n" + "vmla.f32 q6, q2, d0[0]\n" + + "vmla.f32 q7, q2, d0[1]\n" ASM_PREFETCH("[%[b_ptr], #0x1C0]") + "vmla.f32 q8, q2, d1[0]\n" + + "vmla.f32 q9, q2, d1[1]\n" + "vldr d4, [%[b_ptr], #0x40]\n" + "vmla.f32 q10, q3, d3[0]\n" + "vldr d5, [%[b_ptr], #0x48]\n" + "vmla.f32 q11, q3, d3[1]\n" + "vldr d3, [%[a_ptr], #0x38]\n" + "vmla.f32 q12, q3, d0[0]\n" + + "vmla.f32 q13, q3, d0[1]\n" + "vldr d0, [%[a_ptr], #0x40]\n" + "vmla.f32 q14, q3, d1[0]\n" + + "vmla.f32 q15, q3, d1[1]\n" + "vldr d6, [%[b_ptr], #0x50]\n" + + // Unroll 2 + "vmla.f32 q4, q2, d2[0]\n" + "vldr d7, [%[b_ptr], #0x58]\n" + "vmla.f32 q5, q2, d2[1]\n" + "vldr d1, [%[a_ptr], #0x48]\n" + "vmla.f32 q6, q2, d3[0]\n" + + "vmla.f32 q7, q2, d3[1]\n" ASM_PREFETCH("[%[a_ptr], #0x180]") + "vmla.f32 q8, q2, d0[0]\n" + + "vmla.f32 q9, q2, d0[1]\n" + "vldr d4, [%[b_ptr], #0x60]\n" + "vmla.f32 q10, q3, d2[0]\n" + "vldr d5, [%[b_ptr], #0x68]\n" + "vmla.f32 q11, q3, d2[1]\n" + "vldr d2, [%[a_ptr], #0x50]\n" + "vmla.f32 q12, q3, d3[0]\n" + + "vmla.f32 q13, q3, d3[1]\n" + "vldr d3, [%[a_ptr], #0x58]\n" + "vmla.f32 q14, q3, d0[0]\n" + "add %[a_ptr], %[a_ptr], #0x60\n" + "vmla.f32 q15, q3, d0[1]\n" + "vldr d6, [%[b_ptr], #0x70]\n" + + // Unroll 3 + "vmla.f32 q4, q2, d1[0]\n" + "vldr d7, [%[b_ptr], #0x78]\n" + "vmla.f32 q5, q2, d1[1]\n" + "add %[b_ptr], %[b_ptr], #0x80\n" + "vmla.f32 q6, q2, d2[0]\n" + "vldr d0, [%[a_ptr], #0x00]\n" + "vmla.f32 q7, q2, d2[1]\n" ASM_PREFETCH("[%[b_ptr], #0x180]") + "vmla.f32 q8, q2, d3[0]\n" + + "vmla.f32 q9, q2, d3[1]\n" + "vldr d4, [%[b_ptr], #0x00]\n" + 
"vmla.f32 q10, q3, d1[0]\n" + "vldr d5, [%[b_ptr], #0x08]\n" + "vmla.f32 q11, q3, d1[1]\n" + "vldr d1, [%[a_ptr], #0x08]\n" + "vmla.f32 q12, q3, d2[0]\n" + + "vmla.f32 q13, q3, d2[1]\n" + "vldr d2, [%[a_ptr], #0x10]\n" + "vmla.f32 q14, q3, d3[0]\n" + + "vmla.f32 q15, q3, d3[1]\n" + "bne 1b\n" + + // "Tails" shows how many multiply blocks are needed at the + // end, must be 1-4 inclusive. Bail out to alternative tail + // immediately if it's 1. + "6:\n" + "subs %[tails], %[tails], #1\n" + "beq 3f\n" + + // Detached final iteration + + // Unroll 0 + "vmla.f32 q4, q2, d0[0]\n" + "vldr d6, [%[b_ptr], #0x10]\n" + "vmla.f32 q5, q2, d0[1]\n" + "vldr d7, [%[b_ptr], #0x18]\n" + "vmla.f32 q6, q2, d1[0]\n" + "vldr d3, [%[a_ptr], #0x18]\n" + "vmla.f32 q7, q2, d1[1]\n" + "subs %[tails], %[tails], #1\n" + "vmla.f32 q8, q2, d2[0]\n" + "vmla.f32 q9, q2, d2[1]\n" + "vldr d4, [%[b_ptr], #0x20]\n" + + "vmla.f32 q10, q3, d0[0]\n" + "vldr d5, [%[b_ptr], #0x28]\n" + "vmla.f32 q11, q3, d0[1]\n" + "vldr d0, [%[a_ptr], #0x20]\n" + "vmla.f32 q12, q3, d1[0]\n" + "add %[b_ptr], %[b_ptr], #0x30\n" + "vmla.f32 q13, q3, d1[1]\n" + "vldr d1, [%[a_ptr], #0x28]\n" + "vmla.f32 q14, q3, d2[0]\n" + "vmla.f32 q15, q3, d2[1]\n" + "beq 4f\n" + + // Unroll 1 + "vmla.f32 q4, q2, d3[0]\n" + "vldr d6, [%[b_ptr], #0x30]\n" + "vmla.f32 q5, q2, d3[1]\n" + "vldr d7, [%[b_ptr], #0x38]\n" + "vmla.f32 q6, q2, d0[0]\n" + "vldr d2, [%[a_ptr], #0x30]\n" + "vmla.f32 q7, q2, d0[1]\n" + "subs %[tails], %[tails], #1\n" + "vmla.f32 q8, q2, d1[0]\n" + + "vmla.f32 q9, q2, d1[1]\n" + + "vmla.f32 q10, q3, d3[0]\n" + "vldr d4, [%[b_ptr], #0x40]\n" + "vmla.f32 q11, q3, d3[1]\n" + "vldr d5, [%[b_ptr], #0x48]\n" + "vmla.f32 q12, q3, d0[0]\n" + "vldr d3, [%[a_ptr], #0x38]\n" + "vmla.f32 q13, q3, d0[1]\n" + "vldr d0, [%[a_ptr], #0x40]\n" + "vmla.f32 q14, q3, d1[0]\n" + "vmla.f32 q15, q3, d1[1]\n" + "beq 5f\n" + + // Unroll 2 + "vmla.f32 q4, q2, d2[0]\n" + "vldr d6, [%[b_ptr], #0x50]\n" + "vmla.f32 q5, q2, d2[1]\n" + "vldr d7, [%[b_ptr], #0x58]\n" + "vmla.f32 q6, q2, d3[0]\n" + "vldr d1, [%[a_ptr], #0x48]\n" + "vmla.f32 q7, q2, d3[1]\n" + "vmla.f32 q8, q2, d0[0]\n" + "vmla.f32 q9, q2, d0[1]\n" + + "vmla.f32 q10, q3, d2[0]\n" + "vldr d4, [%[b_ptr], #0x60]\n" + "vmla.f32 q11, q3, d2[1]\n" + "vldr d5, [%[b_ptr], #0x68]\n" + "vmla.f32 q12, q3, d3[0]\n" + "vldr d2, [%[a_ptr], #0x50]\n" + "vmla.f32 q13, q3, d3[1]\n" + "vldr d3, [%[a_ptr], #0x58]\n" + "vmla.f32 q14, q3, d0[0]\n" + "vmla.f32 q15, q3, d0[1]\n" + + // Unroll 3 + "vmla.f32 q4, q2, d1[0]\n" + "vldr d6, [%[b_ptr], #0x70]\n" + "vmla.f32 q5, q2, d1[1]\n" + "vldr d7, [%[b_ptr], #0x78]\n" + "vmla.f32 q10, q3, d1[0]\n" + "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" + "vmla.f32 q11, q3, d1[1]\n" + "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" + "vmla.f32 q6, q2, d2[0]\n" + "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" + "vmla.f32 q12, q3, d2[0]\n" + "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" + "vmla.f32 q7, q2, d2[1]\n" + "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" + "vmla.f32 q13, q3, d2[1]\n" + "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" + "vmla.f32 q8, q2, d3[0]\n" + "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" + "vmla.f32 q14, q3, d3[0]\n" + "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" + "vmla.f32 q9, q2, d3[1]\n" + "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" + "vmla.f32 q15, q3, d3[1]\n" + "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" + "add %[a_ptr], %[a_ptr], #0x60\n" + "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" + "add %[b_ptr], %[b_ptr], #0x80\n" + "b 2f\n" + + // tails==1 final tail + "3:\n" + "vmla.f32 q4, q2, d0[0]\n" + "vldr d6, [%[b_ptr], #0x10]\n" 
+ "vmla.f32 q5, q2, d0[1]\n" + "vldr d7, [%[b_ptr], #0x18]\n" + "vmla.f32 q6, q2, d1[0]\n" + "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" + "vmla.f32 q10, q3, d0[0]\n" + "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" + "vmla.f32 q11, q3, d0[1]\n" + "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" + "vmla.f32 q12, q3, d1[0]\n" + "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" + "vmla.f32 q7, q2, d1[1]\n" + "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" + "vmla.f32 q13, q3, d1[1]\n" + "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" + "vmla.f32 q8, q2, d2[0]\n" + "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" + "vmla.f32 q14, q3, d2[0]\n" + "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" + "vmla.f32 q9, q2, d2[1]\n" + "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" + "vmla.f32 q15, q3, d2[1]\n" + "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" + "add %[a_ptr], %[a_ptr], #0x18\n" + "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" + "add %[b_ptr], %[b_ptr], #0x20\n" + "b 2f\n" + + // tails==2 final tail + "4:\n" + "vmla.f32 q4, q2, d3[0]\n" + "vldr d6, [%[b_ptr], #0x30]\n" + "vmla.f32 q5, q2, d3[1]\n" + "vldr d7, [%[b_ptr], #0x38]\n" + "vmla.f32 q10, q3, d3[0]\n" + "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" + "vmla.f32 q11, q3, d3[1]\n" + "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" + "vmla.f32 q6, q2, d0[0]\n" + "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" + "vmla.f32 q12, q3, d0[0]\n" + "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" + "vmla.f32 q7, q2, d0[1]\n" + "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" + "vmla.f32 q13, q3, d0[1]\n" + "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" + "vmla.f32 q8, q2, d1[0]\n" + "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" + "vmla.f32 q14, q3, d1[0]\n" + "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" + "vmla.f32 q9, q2, d1[1]\n" + "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" + "vmla.f32 q15, q3, d1[1]\n" + "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" + "add %[b_ptr], %[b_ptr], #0x40\n" + "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" + "add %[a_ptr], %[a_ptr], #0x30\n" + "b 2f\n" + + // tails==3 final tail + "5:\n" + "vmla.f32 q4, q2, d2[0]\n" + "vldr d6, [%[b_ptr], #0x50]\n" + "vmla.f32 q5, q2, d2[1]\n" + "vldr d7, [%[b_ptr], #0x58]\n" + "vmla.f32 q6, q2, d3[0]\n" + "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" + "vmla.f32 q10, q3, d2[0]\n" + "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" + "vmla.f32 q11, q3, d2[1]\n" + "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" + "vmla.f32 q12, q3, d3[0]\n" + "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" + "vmla.f32 q7, q2, d3[1]\n" + "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" + "vmla.f32 q13, q3, d3[1]\n" + "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" + "vmla.f32 q8, q2, d0[0]\n" + "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" + "vmla.f32 q14, q3, d0[0]\n" + "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" + "vmla.f32 q9, q2, d0[1]\n" + "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" + "vmla.f32 q15, q3, d0[1]\n" + "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" + "add %[a_ptr], %[a_ptr], #0x48\n" + "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" + "add %[b_ptr], %[b_ptr], #0x60\n" + + "2:\n" + "vst1.32 {d30-d31}, [%[c_ptr] :128]!\n" + : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k), [tails] "+r"(tails) + : + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0", "r1"); + } + } +} + +} // namespace arm_gemm + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp new file mode 100644 index 0000000000..d7d0484610 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __arm__ + +#include + +#include "../../asmlib.hpp" + +// Kernel implementation. +// +// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order. +// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order. +// Assume that "Cpanel" points to a chunk of C output blocks (each size +// 8x6), the chunks being arranged in a row major fashion. +// +// Note that the intent of this is that either ablocks or bblocks will be 1 +// - this construction allows the output loop to proceed in either order. + +namespace arm_gemm +{ +void a32_sgemm_8x6(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) +{ + const float *a_ptr = Apanel; + float *c_ptr = Cpanel; + + for(int yb = 0; yb < ablocks; yb++) + { + const float *a_ptr0 = a_ptr; + const float *b_ptr = Bpanel; + + for(int xb = 0; xb < bblocks; xb++) + { + a_ptr = a_ptr0; + int tails = (K & 3); + if(tails == 0) + { + tails = 4; + } + int k = ((K + 3) / 4) - 1; + + __asm __volatile( + "vmov.i32 q4, #0\n" + "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n" + "vmov.i32 q5, #0\n" + "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" + "vmov.i32 q6, #0\n" ASM_PREFETCH("[%[a_ptr], #48]") "vmov.i32 q7, #0\n" ASM_PREFETCH("[%[b_ptr], #48]") "vmov.i32 q8, #0\n" ASM_PREFETCH("[%[a_ptr], #112]") "vmov.i32 q9, #0\n" + ASM_PREFETCH("[%[b_ptr], #112]") + "vmov.i32 q10, #0\n" + "vmov.i32 q11, #0\n" + "vmov.i32 q12, #0\n" + "vmov.i32 q13, #0\n" ASM_PREFETCH("[%[a_ptr], #176]") "vmov.i32 q14, #0\n" ASM_PREFETCH("[%[b_ptr], #176]") + "vmov.i32 q15, #0\n" + + "cmp %[k], #0\n" + "beq 6f\n" + + "1:\n" + // Unroll 0 + "vmla.f32 q4, q2, d0[0]\n" + "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n" + "vmla.f32 q5, q2, d0[1]\n" + "vmla.f32 q6, q2, d1[0]\n" + "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" + "vmla.f32 q7, q2, d1[1]\n" + "vmla.f32 q8, q2, d2[0]\n" + "vmla.f32 q9, q2, d2[1]\n" + "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" + + "vmla.f32 q10, q3, d0[0]\n" + "vmla.f32 q11, q3, d0[1]\n" + "vmla.f32 q12, q3, d1[0]\n" + "vmla.f32 q13, q3, d1[1]\n" + "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n" + "vmla.f32 q14, q3, d2[0]\n" + "vmla.f32 q15, q3, d2[1]\n" + "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" + + // Unroll 1 + "vmla.f32 q4, q2, d3[0]\n" + "subs %[k], %[k], #1\n" + "vmla.f32 q5, q2, d3[1]\n" ASM_PREFETCH("[%[a_ptr], #208]") + "vmla.f32 q6, q2, d0[0]\n" + "vmla.f32 q7, q2, d0[1]\n" ASM_PREFETCH("[%[b_ptr], #192]") + "vmla.f32 q8, 
q2, d1[0]\n" + "vmla.f32 q9, q2, d1[1]\n" + "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" + + "vmla.f32 q10, q3, d3[0]\n" + "vmla.f32 q11, q3, d3[1]\n" + "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n" + "vmla.f32 q12, q3, d0[0]\n" + "vmla.f32 q13, q3, d0[1]\n" + "vmla.f32 q14, q3, d1[0]\n" + "vmla.f32 q15, q3, d1[1]\n" + "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n" + + // Unroll 2 + "vmla.f32 q4, q2, d2[0]\n" + "vmla.f32 q5, q2, d2[1]\n" + "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" + "vmla.f32 q6, q2, d3[0]\n" + "vmla.f32 q7, q2, d3[1]\n" ASM_PREFETCH("[%[a_ptr], #240]") + "vmla.f32 q8, q2, d0[0]\n" + "vmla.f32 q9, q2, d0[1]\n" + "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" + + "vmla.f32 q10, q3, d2[0]\n" + "vmla.f32 q11, q3, d2[1]\n" ASM_PREFETCH("[%[b_ptr], #208]") + "vmla.f32 q12, q3, d3[0]\n" + "vmla.f32 q13, q3, d3[1]\n" + "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n" + "vmla.f32 q14, q3, d0[0]\n" + "vmla.f32 q15, q3, d0[1]\n" + "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" + + // Unroll 3 + "vmla.f32 q4, q2, d1[0]\n" + "vmla.f32 q5, q2, d1[1]\n" + "vmla.f32 q6, q2, d2[0]\n" + "vmla.f32 q7, q2, d2[1]\n" + "vmla.f32 q8, q2, d3[0]\n" + "vmla.f32 q9, q2, d3[1]\n" + "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" + + "vmla.f32 q10, q3, d1[0]\n" + "vmla.f32 q11, q3, d1[1]\n" + "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n" + "vmla.f32 q12, q3, d2[0]\n" + "vmla.f32 q13, q3, d2[1]\n" + "vmla.f32 q14, q3, d3[0]\n" + "vmla.f32 q15, q3, d3[1]\n" + "bne 1b\n" + + // Branch here if we never execute main loop. + "6:\n" + + // "Tails" shows how many multiply blocks are needed at the + // end, must be 1-4 inclusive. Bail out to alternative tail + // immediately if it's 1. + "subs %[tails], %[tails], #1\n" + "beq 3f\n" + + // Detached final iteration + // Unroll 0 + "vmla.f32 q4, q2, d0[0]\n" + "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n" + "vmla.f32 q5, q2, d0[1]\n" + "vmla.f32 q6, q2, d1[0]\n" + "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" + "vmla.f32 q7, q2, d1[1]\n" + "vmla.f32 q8, q2, d2[0]\n" + "subs %[tails], %[tails], #1\n" + "vmla.f32 q9, q2, d2[1]\n" + "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" + + "vmla.f32 q10, q3, d0[0]\n" + "vmla.f32 q11, q3, d0[1]\n" + "vmla.f32 q12, q3, d1[0]\n" + "vmla.f32 q13, q3, d1[1]\n" + "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n" + "vmla.f32 q14, q3, d2[0]\n" + "vmla.f32 q15, q3, d2[1]\n" + "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" + "beq 4f\n" + + // Unroll 1 + "vmla.f32 q4, q2, d3[0]\n" + "vmla.f32 q5, q2, d3[1]\n" + "subs %[tails], %[tails], #1\n" + "vmla.f32 q6, q2, d0[0]\n" + "vmla.f32 q7, q2, d0[1]\n" + "vmla.f32 q8, q2, d1[0]\n" + "vmla.f32 q9, q2, d1[1]\n" + "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" + + "vmla.f32 q10, q3, d3[0]\n" + "vmla.f32 q11, q3, d3[1]\n" + "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n" + "vmla.f32 q12, q3, d0[0]\n" + "vmla.f32 q13, q3, d0[1]\n" + "vmla.f32 q14, q3, d1[0]\n" + "vmla.f32 q15, q3, d1[1]\n" + "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" + "beq 5f\n" + + // Unroll 2 + "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n" + "vmla.f32 q4, q2, d2[0]\n" + "vmla.f32 q5, q2, d2[1]\n" + "vmla.f32 q6, q2, d3[0]\n" + "vmla.f32 q7, q2, d3[1]\n" + "vmla.f32 q8, q2, d0[0]\n" + "vmla.f32 q9, q2, d0[1]\n" + "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n" + + "vmla.f32 q10, q3, d2[0]\n" + "vmla.f32 q11, q3, d2[1]\n" + "vmla.f32 q12, q3, d3[0]\n" + "vmla.f32 q13, q3, d3[1]\n" + "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n" + "vmla.f32 q14, q3, d0[0]\n" + "vmla.f32 q15, q3, d0[1]\n" + "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" + + // Unroll 3 + "vmla.f32 q4, q2, d1[0]\n" + "vmla.f32 q10, q3, d1[0]\n" + "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" + "vmla.f32 q5, q2, d1[1]\n" + "vst1.32 {d20-d21}, [%[c_ptr] 
:128]!\n" + "vmla.f32 q11, q3, d1[1]\n" + "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" + "vmla.f32 q6, q2, d2[0]\n" + "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" + "vmla.f32 q12, q3, d2[0]\n" + "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" + "vmla.f32 q7, q2, d2[1]\n" + "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" + "vmla.f32 q13, q3, d2[1]\n" + "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" + "vmla.f32 q8, q2, d3[0]\n" + "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" + "vmla.f32 q14, q3, d3[0]\n" + "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" + "vmla.f32 q9, q2, d3[1]\n" + "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" + "vmla.f32 q15, q3, d3[1]\n" + "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" + "b 2f\n" + + // tails==1 final tail + "3:\n" + "vmla.f32 q4, q2, d0[0]\n" + "vld1.32 {d2}, [%[a_ptr] :64]!\n" + "vmla.f32 q5, q2, d0[1]\n" + "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n" + "vmla.f32 q6, q2, d1[0]\n" + "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" + "vmla.f32 q10, q3, d0[0]\n" + "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" + "vmla.f32 q11, q3, d0[1]\n" + "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" + "vmla.f32 q12, q3, d1[0]\n" + "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" + "vmla.f32 q7, q2, d1[1]\n" + "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" + "vmla.f32 q13, q3, d1[1]\n" + "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" + "vmla.f32 q8, q2, d2[0]\n" + "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" + "vmla.f32 q14, q3, d2[0]\n" + "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" + "vmla.f32 q9, q2, d2[1]\n" + "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" + "vmla.f32 q15, q3, d2[1]\n" + "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" + "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" + "b 2f\n" + + // tails==2 final tail + "4:\n" + "vmla.f32 q4, q2, d3[0]\n" + "vmla.f32 q10, q3, d3[0]\n" + "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" + "vmla.f32 q5, q2, d3[1]\n" + "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" + "vmla.f32 q11, q3, d3[1]\n" + "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" + "vmla.f32 q6, q2, d0[0]\n" + "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" + "vmla.f32 q12, q3, d0[0]\n" + "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" + "vmla.f32 q7, q2, d0[1]\n" + "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" + "vmla.f32 q13, q3, d0[1]\n" + "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" + "vmla.f32 q8, q2, d1[0]\n" + "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" + "vmla.f32 q14, q3, d1[0]\n" + "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" + "vmla.f32 q9, q2, d1[1]\n" + "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" + "vmla.f32 q15, q3, d1[1]\n" + "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" + "b 2f\n" + + // tails==3 final tail + "5:\n" + "vmla.f32 q4, q2, d2[0]\n" + "vld1.32 {d0}, [%[a_ptr] :64]!\n" + "vmla.f32 q5, q2, d2[1]\n" + "vmla.f32 q6, q2, d3[0]\n" + "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n" + "vmla.f32 q10, q3, d2[0]\n" + "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n" + "vmla.f32 q11, q3, d2[1]\n" + "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n" + "vmla.f32 q12, q3, d3[0]\n" + "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n" + "vmla.f32 q7, q2, d3[1]\n" + "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n" + "vmla.f32 q13, q3, d3[1]\n" + "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n" + "vmla.f32 q8, q2, d0[0]\n" + "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n" + "vmla.f32 q14, q3, d0[0]\n" + "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n" + "vmla.f32 q9, q2, d0[1]\n" + "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n" + "vmla.f32 q15, q3, d0[1]\n" + "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n" + "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n" + + "2:\n" + "vst1.32 {d30-d31}, [%[c_ptr] :128]!\n" + : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k), [tails] "+r"(tails) + : + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", 
"q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc"); + } + } +} + +} // namespace arm_gemm + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp new file mode 100644 index 0000000000..387f899b20 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +namespace arm_gemm +{ +// Actual kernel implementations +void a64_gemm_s16_asimd_12x8(const int16_t *, const int16_t *, int32_t *, int, int, int); + +// 12x8 SGEMM "strategy" class. +// +// This describes the characteristics of a family of kernels, in terms of +// the required interleave properties and the output block size. +// +// All kernels in the family must share these characteristics. The actual +// kernel to be used can be chosen at runtime, based on the CPU_type +// structure. +class gemm_s16_12x8 +{ +public: + typedef int16_t operand_type; + typedef int32_t result_type; + + typedef void (*kern_type)(const int16_t *, const int16_t *, int32_t *, int, int, int); + + /* Describes the data layout for A input */ + static const int A_interleave = 8; + static const int A_block = 1; + static const int A_transpose = 0; + + /* Same for B input */ + static const int B_interleave = 12; + static const int B_block = 1; + static const int B_transpose = 1; + + /* Kernel blocking parameters */ + static const int out_width = 12; + static const int out_height = 8; + static const int k_unroll = 1; + + kern_type kernel = a64_gemm_s16_asimd_12x8; + + gemm_s16_12x8(const CPUInfo *ci) + { + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp new file mode 100644 index 0000000000..b217dcf2cf --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp @@ -0,0 +1,309 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __aarch64__ + +#include + +#include "../../asmlib.hpp" + +namespace arm_gemm +{ +void a64_gemm_s16_asimd_12x8(const int16_t *Apanel, const int16_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) +{ + const int16_t *a_ptr = Apanel; + int32_t *c_ptr = Cpanel; + + for(int yb = 0; yb < ablocks; yb++) + { + const int16_t *a_ptr0 = a_ptr; + const int16_t *b_ptr = Bpanel; + + for(int xb = 0; xb < bblocks; xb++) + { + a_ptr = a_ptr0; + const bool odd_k = K & 0x1; + int k = (K + 1) / 2 - 1; + + register int16x8_t aa asm("v0"); + register int16x8_t ab asm("v1"); + register int16x8_t b0 asm("v2"); + register int16x8_t b1 asm("v3"); + register int16x8_t b2 asm("v4"); + + __asm __volatile( + "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower + "movi v5.4s, #0\n" + "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper + "movi v6.4s, #0\n" + "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower + "ins %[aa].d[1], x20\n" // Merge A[A].lower and upper + "movi v7.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #64]") + "movi v8.4s, #0\n" + "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper + "movi v9.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #64]") + "movi v10.4s, #0\n" + "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower + "ins %[b0].d[1], x20\n" // Merge B[0].lower and upper + "movi v11.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #96]") + "movi v12.4s, #0\n" + "movi v13.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #96]") + "movi v14.4s, #0\n" + "movi v15.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #128]") + "movi v16.4s, #0\n" + "movi v17.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #128]") + "movi v18.4s, #0\n" + "movi v19.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #160]") + "movi v20.4s, #0\n" + "movi v21.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #160]") + "movi v22.4s, #0\n" + "movi v23.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #192]") + "movi v24.4s, #0\n" + "add %x[a_ptr], %x[a_ptr], #0x10\n" + "movi v25.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #192]") + "movi v26.4s, #0\n" + "add %x[b_ptr], %x[b_ptr], #0x18\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + + "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations. 
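+            // Note: throughout this kernel the 128-bit operands are loaded as two
+            // 64-bit halves (an "ldr %d..." / "ldr x20" pair merged with
+            // "ins ... .d[1], x20") rather than with a single "ldr q". The split
+            // loads can be interleaved between the smlal/smlal2 accumulates,
+            // presumably so that they dual-issue with the multiplies on in-order
+            // cores rather than stalling the pipeline.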
+ + "1:\n" // Main loop + // First unroll + "smlal v5.4s, %[b0].4h, %[aa].h[0]\n" + "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper + "smlal v6.4s, %[b0].4h, %[aa].h[1]\n" + "smlal v7.4s, %[b0].4h, %[aa].h[2]\n" + "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower + "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper + "smlal v8.4s, %[b0].4h, %[aa].h[3]\n" + "smlal v9.4s, %[b0].4h, %[aa].h[4]\n" + "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper + "smlal v10.4s, %[b0].4h, %[aa].h[5]\n" + "smlal v11.4s, %[b0].4h, %[aa].h[6]\n" + "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower + "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper + "smlal v12.4s, %[b0].4h, %[aa].h[7]\n" + "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" + "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper + "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" + "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" + "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" + "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" + "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" + "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" + "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" + "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower + "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper + "smlal v21.4s, %[b1].4h, %[aa].h[0]\n" + "smlal v22.4s, %[b1].4h, %[aa].h[1]\n" + "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper + "smlal v23.4s, %[b1].4h, %[aa].h[2]\n" + "smlal v24.4s, %[b1].4h, %[aa].h[3]\n" + "smlal v25.4s, %[b1].4h, %[aa].h[4]\n" + "smlal v26.4s, %[b1].4h, %[aa].h[5]\n" + "smlal v27.4s, %[b1].4h, %[aa].h[6]\n" + "smlal v28.4s, %[b1].4h, %[aa].h[7]\n" + + // Second unroll + "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n" + "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower + "ins %[b0].d[1], x20\n" // Merge B[0].lower and .upper + "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n" + "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n" + "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper + "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n" + "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n" + "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n" + "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n" + "add %x[a_ptr], %x[a_ptr], #0x20\n" + "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n" + "smlal v13.4s, %[b2].4h, %[ab].h[0]\n" ASM_PREFETCH("[%[b_ptr], #320]") + "smlal v14.4s, %[b2].4h, %[ab].h[1]\n" + "smlal v15.4s, %[b2].4h, %[ab].h[2]\n" ASM_PREFETCH("[%[a_ptr], #320]") + "smlal v16.4s, %[b2].4h, %[ab].h[3]\n" + "smlal v17.4s, %[b2].4h, %[ab].h[4]\n" ASM_PREFETCH("[%[b_ptr], #448]") + "smlal v18.4s, %[b2].4h, %[ab].h[5]\n" + "smlal v19.4s, %[b2].4h, %[ab].h[6]\n" + "smlal v20.4s, %[b2].4h, %[ab].h[7]\n" + "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n" + "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n" + "subs %x[k], %x[k], #0x1\n" + "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n" + "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n" + "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower + "ins %[aa].d[1], x20\n" // Merge A[A].lower and .upper + "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n" + "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n" + "add %x[b_ptr], %x[b_ptr], #0x30\n" + "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n" + "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n" + "bne 1b\n" + + "2:\n" // Even tail + "cbnz %x[odd_k], 3f\n" + + "smlal v5.4s, %[b0].4h, %[aa].h[0]\n" + "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper + "smlal v6.4s, %[b0].4h, %[aa].h[1]\n" + "smlal v7.4s, %[b0].4h, %[aa].h[2]\n" + "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower + "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper + "smlal v8.4s, %[b0].4h, %[aa].h[3]\n" + "smlal v9.4s, %[b0].4h, %[aa].h[4]\n" + "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper + "smlal v10.4s, 
%[b0].4h, %[aa].h[5]\n" + "smlal v11.4s, %[b0].4h, %[aa].h[6]\n" + "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower + "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper + "smlal v12.4s, %[b0].4h, %[aa].h[7]\n" + "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" + "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper + "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" + "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" + "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" + "add %[a_ptr], %[a_ptr], #0x10\n" + "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" + "add %[b_ptr], %[b_ptr], #0x18\n" + "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" + "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" + "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" + "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper + "smlal v21.4s, %[b1].4h, %[aa].h[0]\n" + "smlal v22.4s, %[b1].4h, %[aa].h[1]\n" + "smlal v23.4s, %[b1].4h, %[aa].h[2]\n" + "smlal v24.4s, %[b1].4h, %[aa].h[3]\n" + "smlal v25.4s, %[b1].4h, %[aa].h[4]\n" + "smlal v26.4s, %[b1].4h, %[aa].h[5]\n" + "smlal v27.4s, %[b1].4h, %[aa].h[6]\n" + "smlal v28.4s, %[b1].4h, %[aa].h[7]\n" + + "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n" + "smlal v13.4s, %[b2].4h, %[ab].h[0]\n" + "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n" + "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n" + "smlal v14.4s, %[b2].4h, %[ab].h[1]\n" + "str q5, [%x[c_ptr]]\n" + "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n" + "str q13, [%x[c_ptr], #0x10]\n" + "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n" + "str q21, [%x[c_ptr], #0x20]\n" + "smlal v15.4s, %[b2].4h, %[ab].h[2]\n" + "str q6, [%x[c_ptr], #0x30]\n" + "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n" + "str q14, [%x[c_ptr], #0x40]\n" + "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n" + "str q22, [%x[c_ptr], #0x50]\n" + "smlal v16.4s, %[b2].4h, %[ab].h[3]\n" + "str q7, [%x[c_ptr], #0x60]\n" + "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n" + "str q15, [%x[c_ptr], #0x70]\n" + "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n" + "str q23, [%x[c_ptr], #0x80]\n" + "smlal v17.4s, %[b2].4h, %[ab].h[4]\n" + "str q8, [%x[c_ptr], #0x90]\n" + "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n" + "str q16, [%x[c_ptr], #0xa0]\n" + "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n" + "str q24, [%x[c_ptr], #0xb0]\n" + "smlal v18.4s, %[b2].4h, %[ab].h[5]\n" + "str q9, [%x[c_ptr], #0xc0]\n" + "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n" + "str q17, [%x[c_ptr], #0xd0]\n" + "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n" + "str q25, [%x[c_ptr], #0xe0]\n" + "smlal v19.4s, %[b2].4h, %[ab].h[6]\n" + "str q10, [%x[c_ptr], #0xf0]\n" + "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n" + "str q18, [%x[c_ptr], #0x100]\n" + "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n" + "str q26, [%x[c_ptr], #0x110]\n" + "smlal v20.4s, %[b2].4h, %[ab].h[7]\n" + "str q11, [%x[c_ptr], #0x120]\n" + "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n" + "str q19, [%x[c_ptr], #0x130]\n" + "b 4f\n" // Complete write out + + "3:\n" // Odd tail + "smlal v5.4s, %[b0].4h, %[aa].h[0]\n" + "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" + "smlal v21.4s, %[b1].4h, %[aa].h[0]\n" + "smlal v6.4s, %[b0].4h, %[aa].h[1]\n" + "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" + "smlal v22.4s, %[b1].4h, %[aa].h[1]\n" + "str q5, [%x[c_ptr]]\n" + "smlal v7.4s, %[b0].4h, %[aa].h[2]\n" + "str q13, [%x[c_ptr], #0x10]\n" + "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" + "str q21, [%x[c_ptr], #0x20]\n" + "smlal v23.4s, %[b1].4h, %[aa].h[2]\n" + "str q6, [%x[c_ptr], #0x30]\n" + "smlal v8.4s, %[b0].4h, %[aa].h[3]\n" + "str q14, [%x[c_ptr], #0x40]\n" + "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" + "str q22, [%x[c_ptr], #0x50]\n" + "smlal v24.4s, %[b1].4h, %[aa].h[3]\n" + "str q7, [%x[c_ptr], #0x60]\n" + "smlal v9.4s, %[b0].4h, %[aa].h[4]\n" + 
"str q15, [%x[c_ptr], #0x70]\n" + "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" + "str q23, [%x[c_ptr], #0x80]\n" + "smlal v25.4s, %[b1].4h, %[aa].h[4]\n" + "str q8, [%x[c_ptr], #0x90]\n" + "smlal v10.4s, %[b0].4h, %[aa].h[5]\n" + "str q16, [%x[c_ptr], #0xa0]\n" + "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" + "str q24, [%x[c_ptr], #0xb0]\n" + "smlal v26.4s, %[b1].4h, %[aa].h[5]\n" + "str q9, [%x[c_ptr], #0xc0]\n" + "smlal v11.4s, %[b0].4h, %[aa].h[6]\n" + "str q17, [%x[c_ptr], #0xd0]\n" + "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" + "str q25, [%x[c_ptr], #0xe0]\n" + "smlal v27.4s, %[b1].4h, %[aa].h[6]\n" + "str q10, [%x[c_ptr], #0xf0]\n" + "smlal v12.4s, %[b0].4h, %[aa].h[7]\n" + "str q18, [%x[c_ptr], #0x100]\n" + "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" + "str q26, [%x[c_ptr], #0x110]\n" + "smlal v28.4s, %[b1].4h, %[aa].h[7]\n" + "str q11, [%x[c_ptr], #0x120]\n" + + "4:\n" // End of function + "str q19, [%x[c_ptr], #0x130]\n" + "str q27, [%x[c_ptr], #0x140]\n" + "str q12, [%x[c_ptr], #0x150]\n" + "str q20, [%x[c_ptr], #0x160]\n" + "str q28, [%x[c_ptr], #0x170]\n" + "add %x[c_ptr], %x[c_ptr], #0x180\n" + : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k), + [aa] "+w"(aa), [ab] "+w"(ab), [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2) + : [odd_k] "r"(odd_k) + : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc"); + } + } +} + +} // namespace arm_gemm + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp new file mode 100644 index 0000000000..08f90e16ed --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#pragma once + +#ifdef __aarch64__ + +#include "arm_gemm.hpp" + +namespace arm_gemm +{ +// Load the actual kernel +void a64_gemm_s8_12x8(const int8_t *, const int8_t *, int32_t *, int, int, int); +void a64_gemm_s8_12x8_a55r1(const int8_t *, const int8_t *, int32_t *, int, int, int); + +class gemm_s8_12x8 +{ +public: + typedef int8_t operand_type; + typedef int32_t result_type; + + typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); + + /* Describes the data layout for A input */ + static const int A_interleave = 8; + static const int A_block = 4; + static const bool A_transpose = false; + + /* Same for B input */ + static const int B_interleave = 12; + static const int B_block = 4; + static const bool B_transpose = true; + + /* Kernel blocking parameters */ + static const int out_width = 12; + static const int out_height = 8; + static const int k_unroll = 4; + + kern_type kernel = a64_gemm_s8_12x8; + + gemm_s8_12x8(const CPUInfo *ci) + { + if(ci->get_cpu_model() == CPUModel::A55r1) + { + kernel = a64_gemm_s8_12x8_a55r1; + } + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp new file mode 100644 index 0000000000..ef2f29183c --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp @@ -0,0 +1,356 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __aarch64__ + +#include + +#include "../../asmlib.hpp" + +#ifdef NO_DOT_IN_TOOLCHAIN +#include "dot_toolchain_support.h" +#endif + +namespace arm_gemm +{ +void a64_gemm_s8_12x8_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, const int ablocks, const int bblocks, const int K) +{ + const int8_t *a_ptr = Apanel; + int32_t *c_ptr = Cpanel; + + // We divide K by 4 because the sdot instruction processes 4 elements at a time. + const int W = K / 4; + + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. 
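+    // Worked example: with K = 36, W = 36 / 4 = 9 sdot steps per output tile,
+    // so oddk = 1 and k_iters = ((9 + 1) / 2) - 1 = 4. The main loop below is
+    // unrolled twice, covering 8 of the 9 steps, and the detached odd-K tail
+    // handles the remaining one.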
+ const int oddk = (W & 1); + const int k_iters = ((W + 1) / 2) - 1; + + for(int yb = 0; yb < ablocks; yb++) + { + const int8_t *a_ptr0 = a_ptr; + const int8_t *b_ptr = Bpanel; + + for(int xb = 0; xb < bblocks; xb++) + { + a_ptr = a_ptr0; + int k = k_iters; + + register int32x4_t a0 asm("v0"); + register int32x4_t a1 asm("v1"); + register int32x4_t b0 asm("v2"); + register int32x4_t b1 asm("v3"); + register int32x4_t b2 asm("v4"); + register int32x4_t a0a asm("v5"); + register int32x4_t a1a asm("v6"); + + __asm __volatile( +#ifdef NO_DOT_IN_TOOLCHAIN + _DECLARE_SDOT +#else + ".arch armv8.2-a+dotprod\n" +#endif + // Initialize result registers, load initial operands, prime prefetches. + "movi v8.4s, #0x0\n" + "ldr %q[a0], [%[a_ptr]]\n" + "movi v9.4s, #0x0\n" + "ldr %q[b0], [%[b_ptr]]\n" + "movi v10.4s, #0x0\n" + "ldr %q[a1], [%[a_ptr], #16]\n" + "movi v11.4s, #0x0\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + "movi v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v15.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #128]") "movi v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]") + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #384]") + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #448]") + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #384]") + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #512]") + + // The loop is offset by these two instructions which must + // always be executed. + "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n" + "ldr %d[b2], [%[b_ptr], #32]\n" + + // Skip loop if we are doing zero iterations of it. 
+ "cbz %w[k], 4f\n" + + "1:\n" + "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n" + "ldr x20, [%[b_ptr], #40]\n" + "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n" + "subs %w[k], %w[k], #1\n" + "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" + "ldr %d[a0a], [%[a_ptr], #32]\n" + + "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" + "ins %[b2].d[1], x20\n" + "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" + "ldr x20, [%[a_ptr], #40]\n" + "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" + "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "ldr %d[a1a], [%[a_ptr], #48]\n" + + "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" + "ins %[a0a].d[1], x20\n" + "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" + "ldr x20, [%[a_ptr], #56]\n" + "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" + "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" + "ldr %d[b0], [%[b_ptr], #48]\n" + + "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" + "ins %[a1a].d[1], x20\n" + "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" + "ldr x20, [%[b_ptr], #56]\n" + "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" + "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" + "ldr %d[b1], [%[b_ptr], #64]\n" + + "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" + "ins %[b0].d[1], x20\n" + "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" + "ldr x20, [%[b_ptr], #72]\n" + "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" + "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" ASM_PREFETCH("[%[a_ptr], #448]") + + "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" + "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #576]") + "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" + "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" + + // Unroll 1 + "ldr %d[b2], [%[b_ptr], #80]\n" + + "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n" + "ins %[b1].d[1], x20\n" + "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n" + "ldr x20, [%[b_ptr], #88]\n" + "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n" + "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n" + "ldr %d[a0], [%[a_ptr], #64]\n" + + "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n" + "ins %[b2].d[1], x20\n" + "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n" + "ldr x20, [%[a_ptr], #72]\n" + "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n" + "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n" + "ldr %d[a1], [%[a_ptr], #80]\n" + + "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n" + "ins %[a0].d[1], x20\n" + "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n" + "ldr x20, [%[a_ptr], #88]\n" + "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n" + "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n" + "ldr %d[b0], [%[b_ptr], #96]\n" + + "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n" + "ins %[a1].d[1], x20\n" + "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n" + "ldr x20, [%[b_ptr], #104]\n" + "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n" + "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n" + "ldr %d[b1], [%[b_ptr], #112]\n" + + "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n" + "ins %[b0].d[1], x20\n" + "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n" + "ldr x20, [%[b_ptr], #120]\n" + "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n" + "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n" + "add %[a_ptr], %[a_ptr], #64\n" + + "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n" ASM_PREFETCH("[%[b_ptr], #640]") + "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n" + "ins %[b1].d[1], x20\n" + "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n" + "ldr %d[b2], [%[b_ptr], #32]\n" + + "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n" + "b.ne 1b\n" + + // Branch here if K=1 or 2. Do the right thing for odd/even at the end. + "4:\n" + + // Start final iteration - branch off to "odd" code before we load a0a. 
+ "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n" + "ldr x20, [%[b_ptr], #40]\n" + "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n" + "cbnz %w[oddk], 2f\n" + + // Even K continuation + "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" + "ldr %d[a0a], [%[a_ptr], #32]\n" + + "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" + "ins %[b2].d[1], x20\n" + "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" + "ldr x20, [%[a_ptr], #40]\n" + "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr]]") + "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "ldr %d[a1a], [%[a_ptr], #48]\n" + + "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" + "ins %[a0a].d[1], x20\n" + "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" + "ldr x20, [%[a_ptr], #56]\n" + "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" + "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" + "ldr %d[b0], [%[b_ptr], #48]\n" + + "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" + "ins %[a1a].d[1], x20\n" + "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" + "ldr x20, [%[b_ptr], #56]\n" + "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #64]") + "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" + + "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" + "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #128]") + "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" + "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" + "ldr %d[b1], [%[b_ptr], #64]\n" + + "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" + "ins %[b0].d[1], x20\n" + "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" + "ldr x20, [%[b_ptr], #72]\n" + "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #192]") + "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" + "ldr %d[b2], [%[b_ptr], #80]\n" + + "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n" + "ins %[b1].d[1], x20\n" + "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n" + "ldr x20, [%[b_ptr], #88]\n" + "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n" + "ins %[b2].d[1], x20\n" + + "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]") + "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n" + "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n" + "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #320]") + "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n" + "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]") + "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n" + "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]") + "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n" + "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n" + "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]") + "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n" + "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]") + "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n" + "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n" + "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #640]") + "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n" + "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]") + "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n" + "add %[a_ptr], %[a_ptr], #64\n" + "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n" + "b 3f\n" + + // Odd K continuation + "2:\n" + "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" ASM_PREFETCHW("[%[c_ptr]]") + "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" + "ins %[b2].d[1], x20\n" + "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]") + "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" + "add %[a_ptr], %[a_ptr], #32\n" + "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]") + "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" + "add %[b_ptr], %[b_ptr], #48\n" + "sdot 
v17.4s, %[b1].16b, %[a0].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]") + "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" + "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]") + "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" + "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]") + "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" + "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]") + "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" + "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]") + "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" + "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]") "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]") "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" + ASM_PREFETCHWL2("[%[c_ptr], #640]") "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]") + "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" + + // Common tail + "3:\n" + "str q8, [%[c_ptr]]\n" + "str q16, [%[c_ptr], #16]\n" + "str q24, [%[c_ptr], #32]\n" + "str q9, [%[c_ptr], #48]\n" + "str q17, [%[c_ptr], #64]\n" + "str q25, [%[c_ptr], #80]\n" + "str q10, [%[c_ptr], #96]\n" + "str q18, [%[c_ptr], #112]\n" + "str q26, [%[c_ptr], #128]\n" + "str q11, [%[c_ptr], #144]\n" + "str q19, [%[c_ptr], #160]\n" + "str q27, [%[c_ptr], #176]\n" + "str q12, [%[c_ptr], #192]\n" + "str q20, [%[c_ptr], #208]\n" + "str q28, [%[c_ptr], #224]\n" + "str q13, [%[c_ptr], #240]\n" + "str q21, [%[c_ptr], #256]\n" + "str q29, [%[c_ptr], #272]\n" + "str q14, [%[c_ptr], #288]\n" + "str q22, [%[c_ptr], #304]\n" + "str q30, [%[c_ptr], #320]\n" + "str q15, [%[c_ptr], #336]\n" + "str q23, [%[c_ptr], #352]\n" + "str q31, [%[c_ptr], #368]\n" + "add %[c_ptr], %[c_ptr], #384\n" + +#ifdef NO_DOT_IN_TOOLCHAIN + ".purgem sdot\n" +#endif + : + [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), + [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a), + [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k) + : [oddk] "r"(oddk) + : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"); + } + } +} + +} // namespace arm_gemm + +#endif \ No newline at end of file diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h new file mode 100644 index 0000000000..c76f99d776 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +// Define a macro to assemble the UDOT instruction (in the absence of toolchain support) +#define _DECLARE_SDOT \ + ".altmacro\n" \ + ".macro sdot opd:req, opn:req, opm:req\n" \ + "local vd, vn, vm, h, l\n" \ + ".irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n" \ + ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n" \ + ".set vd,\\reg\n" \ + ".endif\n" \ + ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n" \ + ".set vn,\\reg\n" \ + ".endif\n" \ + ".irp idx,0,1,2,3\n" \ + ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n" \ + ".set vm,\\reg\n" \ + ".set h,\\idx / 2\n" \ + ".set l,\\idx %% 2\n" \ + ".endif\n" \ + ".endr\n" \ + ".endr\n" \ + ".ifndef vd\n" \ + ".error \"Bad operand \\opd\"\n" \ + ".exitm\n" \ + ".endif\n" \ + ".ifndef vn\n" \ + ".error \"Bad operand \\opn\"\n" \ + ".exitm\n" \ + ".endif\n" \ + ".ifndef vm\n" \ + ".error \"Bad operand \\opm\"\n" \ + ".exitm\n" \ + ".endif\n" \ + ".ifndef h\n" \ + ".error \"Bad operand \\opm\"\n" \ + ".exitm\n" \ + ".endif\n" \ + ".ifndef l\n" \ + ".error \"Bad operand \\opm\"\n" \ + ".exitm\n" \ + ".endif\n" \ + ".int 0x4f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n" \ + ".endm\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp new file mode 100644 index 0000000000..258ef5e224 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp @@ -0,0 +1,343 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __aarch64__ + +#include + +#include "../../asmlib.hpp" + +#ifdef NO_DOT_IN_TOOLCHAIN +#include "dot_toolchain_support.h" +#endif + +namespace arm_gemm +{ +void a64_gemm_s8_12x8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) +{ + const int8_t *a_ptr = Apanel; + int32_t *c_ptr = Cpanel; + // We divide K by 4 because the sdot instruction processes 4 elements at a time. + const int W = K / 4; + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. 
+ const int oddk = (W & 1); + const int init_value_k = ((W + 1) / 2) - 1; + for(int yb = 0; yb < ablocks; yb++) + { + const int8_t *a_ptr0 = a_ptr; + const int8_t *b_ptr = Bpanel; + for(int xb = 0; xb < bblocks; xb++) + { + a_ptr = a_ptr0; + int k = init_value_k; + register int32x4_t a0 asm("v0"); + register int32x4_t a1 asm("v1"); + register int32x4_t b0 asm("v2"); + register int32x4_t b1 asm("v3"); + register int32x4_t b2 asm("v4"); + register int32x4_t a0a asm("v5"); + register int32x4_t a1a asm("v6"); + __asm __volatile( +#ifdef NO_DOT_IN_TOOLCHAIN + _DECLARE_SDOT +#else + ".arch armv8.2-a+dotprod\n" +#endif + // Initialize result registers, load initial operands, prime prefetches. + "movi v8.4s, #0x0\n" + "ldr %q[a0], [%[a_ptr]]\n" + "movi v9.4s, #0x0\n" + "ldr %q[b0], [%[b_ptr]]\n" + "movi v10.4s, #0x0\n" + "ldr %q[a1], [%[a_ptr], #16]\n" + "movi v11.4s, #0x0\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + "movi v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v15.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #128]") "movi v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi v18.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #192]") "movi v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") "movi v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") "movi v21.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #384]") + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + + // Skip loop if we are doing zero iterations of it. + "cbz %w[k], 4f\n" + + // Loop proper + "1:\n" + "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n" + "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n" + + "ldr %q[b2], [%[b_ptr], #32]\n" + "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n" + "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" + "ldr %q[a0a], [%[a_ptr], #32]\n" + "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" + "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" + "ldr %q[a1a], [%[a_ptr], #48]\n" + "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" + "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "ldr %q[b0], [%[b_ptr], #48]\n" + + "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" + "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" ASM_PREFETCH("[%[a_ptr], #320]") + "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" + "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" + "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" + "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" + "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" + "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" + "ldr %q[b1], [%[b_ptr], #64]\n" + + "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" + "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #448]") + "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" + "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" + "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" + "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" + "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" + "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" + "ldr %q[b2], [%[b_ptr], #80]\n" + + "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n" + "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n" + "ldr %q[a0], [%[a_ptr], #64]\n" + "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n" + "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n" + "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n" + "ldr %q[a1], [%[a_ptr], #80]\n" + "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n" + "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n" + "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n" + "ldr %q[b0], [%[b_ptr], #96]\n" + + 
"sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n" + "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #512]") + "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n" + "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n" + "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n" + "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n" + "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n" + "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n" + "ldr %q[b1], [%[b_ptr], #112]\n" + + "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n" + "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n" + "add %[a_ptr], %[a_ptr], #64\n" + "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n" + "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n" + "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n" + "subs %w[k], %w[k], #1\n" + "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n" + "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n" + "bne 1b\n" + + // Target to use when K is 1 or 2 (i.e. zero iterations of main loop) + "4:\n" + + // Branch to alternative tail for odd K + "cbnz %w[oddk], 2f\n" + + // Detached final iteration (even K) + "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n" + "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n" + "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" + "ldr %q[a0a], [%[a_ptr], #32]\n" + "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" + "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" + "ldr %q[a1a], [%[a_ptr], #48]\n" + "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" + "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "ldr %q[b0], [%[b_ptr], #48]\n" + + "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" + "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" + "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" + "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" + "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" + "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" + "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" + "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" + "ldr %q[b1], [%[b_ptr], #64]\n" + + "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" + "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" + "add %[a_ptr], %[a_ptr], #64\n" + "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" + "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" + "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" + "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" + "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" + "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" + "ldr %q[b2], [%[b_ptr], #80]\n" + + "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n" + + "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n" + "str q8, [%[c_ptr], #0]\n" + "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n" + "str q16, [%[c_ptr], #16]\n" + "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n" + "str q24, [%[c_ptr], #32]\n" + + "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n" + "str q9, [%[c_ptr], #48]\n" + "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n" + "str q17, [%[c_ptr], #64]\n" + "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n" + "str q25, [%[c_ptr], #80]\n" + "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n" + "str q10, [%[c_ptr], #96]\n" + + "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n" + "str q18, [%[c_ptr], #112]\n" + "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n" + "str q26, [%[c_ptr], #128]\n" + "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n" + "str q11, [%[c_ptr], #144]\n" + + "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n" + "str q19, [%[c_ptr], #160]\n" + "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n" + "str q27, [%[c_ptr], #176]\n" + "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n" + "str q12, [%[c_ptr], #192]\n" + + "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n" + "str q20, [%[c_ptr], #208]\n" + "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n" + "str q28, [%[c_ptr], 
#224]\n" + "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n" + "str q13, [%[c_ptr], #240]\n" + + "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n" + "str q21, [%[c_ptr], #256]\n" + "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n" + "str q29, [%[c_ptr], #272]\n" + "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n" + "str q14, [%[c_ptr], #288]\n" + + "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n" + "str q22, [%[c_ptr], #304]\n" + "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n" + "str q30, [%[c_ptr], #320]\n" + "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n" + "str q15, [%[c_ptr], #336]\n" + + "b 3f\n" + + // Detached final iteration (odd K) + "2:\n" + "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" + "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n" + "str q8, [%[c_ptr], #0]\n" + "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" + "str q16, [%[c_ptr], #16]\n" + "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" + "add %[b_ptr], %[b_ptr], #48\n" + "add %[a_ptr], %[a_ptr], #32\n" + "str q24, [%[c_ptr], #32]\n" + "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" + "str q9, [%[c_ptr], #48]\n" + + "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n" + "str q17, [%[c_ptr], #64]\n" + "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" + "str q25, [%[c_ptr], #80]\n" + "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" + "str q10, [%[c_ptr], #96]\n" + + "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" + "str q18, [%[c_ptr], #112]\n" + "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" + "str q26, [%[c_ptr], #128]\n" + "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" + "str q11, [%[c_ptr], #144]\n" + + "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" + "str q19, [%[c_ptr], #160]\n" + "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" + "str q27, [%[c_ptr], #176]\n" + "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" + "str q12, [%[c_ptr], #192]\n" + + "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" + "str q20, [%[c_ptr], #208]\n" + "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" + "str q28, [%[c_ptr], #224]\n" + "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" + "str q13, [%[c_ptr], #240]\n" + + "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" + "str q21, [%[c_ptr], #256]\n" + "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" + "str q29, [%[c_ptr], #272]\n" + "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" + "str q14, [%[c_ptr], #288]\n" + + "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "str q22, [%[c_ptr], #304]\n" + "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" + "str q30, [%[c_ptr], #320]\n" + "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" + "str q15, [%[c_ptr], #336]\n" + + // Common tail + "3:\n" + "str q23, [%[c_ptr], #352]\n" + "str q31, [%[c_ptr], #368]\n" + "add %[c_ptr], %[c_ptr], #384\n" + +#ifdef NO_DOT_IN_TOOLCHAIN + ".purgem sdot\n" +#endif + : + [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), + [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a), + [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k) + : [oddk] "r"(oddk) + : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"); + } + } +} + +} // namespace arm_gemm + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp new file mode 100644 index 0000000000..2ec28f480c --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +namespace arm_gemm +{ +// Load the actual kernel +void a64_gemm_s8_4x4(const int8_t *, const int8_t *, int32_t *, int, int, int); + +#include "arm_gemm.hpp" + +class gemm_s8_4x4 +{ +public: + typedef int8_t operand_type; + typedef int32_t result_type; + + typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); + + /* Describes the data layout for A input */ + static const int A_interleave = 4; + static const int A_block = 16; + static const bool A_transpose = false; + + /* Same for B input */ + static const int B_interleave = 4; + static const int B_block = 16; + static const bool B_transpose = true; + + /* Kernel blocking parameters */ + static const int out_width = 4; + static const int out_height = 4; + static const int k_unroll = 16; + + kern_type kernel = a64_gemm_s8_4x4; + + gemm_s8_4x4(const CPUInfo *ci) + { + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp new file mode 100644 index 0000000000..243b94e25b --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp @@ -0,0 +1,456 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __aarch64__ + +#include + +#include "../../asmlib.hpp" + +namespace arm_gemm +{ +void a64_gemm_s8_4x4(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) +{ + const int8_t *a_ptr = Apanel; + int32_t *c_ptr = Cpanel; + + K /= 16; + int oddk = (K & 1); + + for(int yb = 0; yb < ablocks; yb++) + { + const int8_t *a_ptr0 = a_ptr; + const int8_t *b_ptr = Bpanel; + + for(int xb = 0; xb < bblocks; xb++) + { + a_ptr = a_ptr0; + + int k = ((K + 1) / 2) - 1; + + register int8x16_t b0 asm("v4"); + register int8x16_t b1 asm("v5"); + register int8x16_t b2 asm("v6"); + register int8x16_t b3 asm("v7"); + register int8x16_t b0a asm("v8"); + register int8x16_t b1a asm("v9"); + register int8x16_t b2a asm("v10"); + register int8x16_t b3a asm("v11"); + + __asm __volatile( + "movi v16.4s, #0x0\n" + "ldr q0, [%[a_ptr]]\n" + "movi v17.4s, #0x0\n" + "ldr %q[b0], [%[b_ptr]]\n" + "movi v18.4s, #0x0\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + "movi v19.4s, #0x0\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "movi v20.4s, #0x0\n" + "ldr %q[b3], [%[b_ptr], #48]\n" + "movi v21.4s, #0x0\n" + "ldr q1, [%[a_ptr], #16]\n" + "movi v22.4s, #0x0\n" + "ldr q2, [%[a_ptr], #32]\n" + "movi v23.4s, #0x0\n" + "ldr q3, [%[a_ptr], #48]\n" + "movi v24.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v25.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v26.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v27.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #128]") "movi v28.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]") "movi v30.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #256]") "movi v31.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") + + // Loop structure optimized for A57 (after r0). + + // Unavoidably, the multiply will "dribble" if + // dual issued with an add. + + // Minimize the effect of this by making sure + // there are 2 adds to run under the dribbled + // multiply. + + // Pipeline in blocks of 8 multiplies - combine + // this iteration's multiplies with adds from + // the previous iteration. + + // So the first block doesn't have any adds to + // do - but because all the adds are at the + // start of the block it's only the first couple + // of multiplies that need to be pulled out. + + // Start of unroll 0 (first iteration) + "smull v12.8h, v0.8b, %[b0].8b\n" + "smull v13.8h, v0.8b, %[b1].8b\n" + + // Skip loop if we are doing zero iterations of it. 
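// --- Editor's note, not part of the patch: a scalar reference for what the
// smull/smlal2/sadalp chains in this kernel accumulate. A minimal sketch assuming the
// panel layout implied by the strategy header (A and B interleaved by 4 rows/columns,
// blocked by 16 along K); the helper names ref_dot16/ref_gemm_s8_4x4 and the C-tile
// ordering are illustrative, not identifiers from this patch.
#include <cstdint>

// One 16-wide K block: smull/smlal2 form the 16-bit products and sadalp pairwise-adds
// them into 32-bit accumulator lanes; the net effect per output element is:
static inline int32_t ref_dot16(const int8_t *a, const int8_t *b)
{
    int32_t acc = 0;
    for(int i = 0; i < 16; i++)
    {
        acc += static_cast<int32_t>(a[i]) * static_cast<int32_t>(b[i]);
    }
    return acc;
}

// One 4x4 output tile for panels of depth K (assumed a multiple of 16, matching k_unroll).
static void ref_gemm_s8_4x4(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Ctile, int K)
{
    for(int r = 0; r < 4; r++)
    {
        for(int c = 0; c < 4; c++)
        {
            int32_t acc = 0;
            for(int blk = 0; blk < K / 16; blk++)
            {
                acc += ref_dot16(Apanel + (blk * 4 + r) * 16,
                                 Bpanel + (blk * 4 + c) * 16);
            }
            Ctile[r * 4 + c] = acc;
        }
    }
}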
+ "cbz %w[k], 4f\n" + + // Unroll 0 continuation (branch target) + "1:\n" + "smull v14.8h, v0.8b, %[b2].8b\n" + "subs %w[k], %w[k], #1\n" + "smull v15.8h, v0.8b, %[b3].8b\n" + "ldr %q[b0a], [%[b_ptr], #64]\n" + "smlal2 v12.8h, v0.16b, %[b0].16b\n" + "smlal2 v13.8h, v0.16b, %[b1].16b\n" + "ldr %q[b1a], [%[b_ptr], #80]\n" + "smlal2 v14.8h, v0.16b, %[b2].16b\n" + "smlal2 v15.8h, v0.16b, %[b3].16b\n" + "ldr q0, [%[a_ptr], #64]\n" + + "sadalp v16.4s, v12.8h\n" + "smull v12.8h, v1.8b, %[b0].8b\n" + "sadalp v17.4s, v13.8h\n" + "sadalp v18.4s, v14.8h\n" + "smull v13.8h, v1.8b, %[b1].8b\n" + "sadalp v19.4s, v15.8h\n" + "smull v14.8h, v1.8b, %[b2].8b\n" + "ldr %q[b2a], [%[b_ptr], #96]\n" + "smull v15.8h, v1.8b, %[b3].8b\n" + "smlal2 v12.8h, v1.16b, %[b0].16b\n" + "ldr %q[b3a], [%[b_ptr], #112]\n" + "smlal2 v13.8h, v1.16b, %[b1].16b\n" + "add %[b_ptr], %[b_ptr], #128\n" + "smlal2 v14.8h, v1.16b, %[b2].16b\n" + "smlal2 v15.8h, v1.16b, %[b3].16b\n" + "ldr q1, [%[a_ptr], #80]\n" + + "sadalp v20.4s, v12.8h\n" + "smull v12.8h, v2.8b, %[b0].8b\n" + "sadalp v21.4s, v13.8h\n" + "sadalp v22.4s, v14.8h\n" + "smull v13.8h, v2.8b, %[b1].8b\n" + "sadalp v23.4s, v15.8h\n" + "smull v14.8h, v2.8b, %[b2].8b\n" + "smull v15.8h, v2.8b, %[b3].8b\n" + "smlal2 v12.8h, v2.16b, %[b0].16b\n" ASM_PREFETCH("[%[b_ptr], #192]") + "smlal2 v13.8h, v2.16b, %[b1].16b\n" + "smlal2 v14.8h, v2.16b, %[b2].16b\n" ASM_PREFETCH("[%[a_ptr], #320]") + "smlal2 v15.8h, v2.16b, %[b3].16b\n" + "ldr q2, [%[a_ptr], #96]\n" + + "sadalp v24.4s, v12.8h\n" + "smull v12.8h, v3.8b, %[b0].8b\n" + "sadalp v25.4s, v13.8h\n" + "sadalp v26.4s, v14.8h\n" + "smull v13.8h, v3.8b, %[b1].8b\n" + "sadalp v27.4s, v15.8h\n" + "smull v14.8h, v3.8b, %[b2].8b\n" + "smull v15.8h, v3.8b, %[b3].8b\n" + "smlal2 v12.8h, v3.16b, %[b0].16b\n" + "ldr %q[b0], [%[b_ptr], #0]\n" + "smlal2 v13.8h, v3.16b, %[b1].16b\n" + "smlal2 v14.8h, v3.16b, %[b2].16b\n" + "smlal2 v15.8h, v3.16b, %[b3].16b\n" + "ldr q3, [%[a_ptr], #112]\n" + + // Unroll 1 + "sadalp v28.4s, v12.8h\n" + "smull v12.8h, v0.8b, %[b0a].8b\n" + "sadalp v29.4s, v13.8h\n" + "sadalp v30.4s, v14.8h\n" + "smull v13.8h, v0.8b, %[b1a].8b\n" + "sadalp v31.4s, v15.8h\n" + "smull v14.8h, v0.8b, %[b2a].8b\n" + "smull v15.8h, v0.8b, %[b3a].8b\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + "smlal2 v12.8h, v0.16b, %[b0a].16b\n" + "smlal2 v13.8h, v0.16b, %[b1a].16b\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "smlal2 v14.8h, v0.16b, %[b2a].16b\n" + "smlal2 v15.8h, v0.16b, %[b3a].16b\n" + "ldr q0, [%[a_ptr], #128]\n" + + "sadalp v16.4s, v12.8h\n" + "smull v12.8h, v1.8b, %[b0a].8b\n" + "sadalp v17.4s, v13.8h\n" + "sadalp v18.4s, v14.8h\n" + "smull v13.8h, v1.8b, %[b1a].8b\n" + "sadalp v19.4s, v15.8h\n" + "add %[a_ptr], %[a_ptr], #128\n" + "smull v14.8h, v1.8b, %[b2a].8b\n" + "smull v15.8h, v1.8b, %[b3a].8b\n" + "ldr %q[b3], [%[b_ptr], #48]\n" + "smlal2 v12.8h, v1.16b, %[b0a].16b\n" + "smlal2 v13.8h, v1.16b, %[b1a].16b\n" + "smlal2 v14.8h, v1.16b, %[b2a].16b\n" + "smlal2 v15.8h, v1.16b, %[b3a].16b\n" + "ldr q1, [%[a_ptr], #16]\n" + + "sadalp v20.4s, v12.8h\n" + "smull v12.8h, v2.8b, %[b0a].8b\n" + "sadalp v21.4s, v13.8h\n" + "sadalp v22.4s, v14.8h\n" + "smull v13.8h, v2.8b, %[b1a].8b\n" + "sadalp v23.4s, v15.8h\n" + "smull v14.8h, v2.8b, %[b2a].8b\n" + "smull v15.8h, v2.8b, %[b3a].8b\n" + "smlal2 v12.8h, v2.16b, %[b0a].16b\n" ASM_PREFETCH("[%[b_ptr], #256]") + "smlal2 v13.8h, v2.16b, %[b1a].16b\n" + "smlal2 v14.8h, v2.16b, %[b2a].16b\n" ASM_PREFETCH("[%[a_ptr], #256]") + "smlal2 v15.8h, v2.16b, %[b3a].16b\n" + "ldr q2, [%[a_ptr], #32]\n" + + "sadalp 
v24.4s, v12.8h\n" + "smull v12.8h, v3.8b, %[b0a].8b\n" + "sadalp v25.4s, v13.8h\n" + "sadalp v26.4s, v14.8h\n" + "smull v13.8h, v3.8b, %[b1a].8b\n" + "sadalp v27.4s, v15.8h\n" + "smull v14.8h, v3.8b, %[b2a].8b\n" + "smull v15.8h, v3.8b, %[b3a].8b\n" + "smlal2 v12.8h, v3.16b, %[b0a].16b\n" + "smlal2 v13.8h, v3.16b, %[b1a].16b\n" + "smlal2 v14.8h, v3.16b, %[b2a].16b\n" + "smlal2 v15.8h, v3.16b, %[b3a].16b\n" + "ldr q3, [%[a_ptr], #48]\n" + + // Start of unroll 0 for next iteration. + "sadalp v28.4s, v12.8h\n" + "smull v12.8h, v0.8b, %[b0].8b\n" + "sadalp v29.4s, v13.8h\n" + "sadalp v30.4s, v14.8h\n" + "smull v13.8h, v0.8b, %[b1].8b\n" + "sadalp v31.4s, v15.8h\n" + "bne 1b\n" + + // Target to use when K=1 or 2 (i.e. zero iterations of main loop) + "4:\n" + + // Branch to alternative tail for odd K + "cbnz %w[oddk], 2f\n" + + // Detached final iteration (even K) + "smull v14.8h, v0.8b, %[b2].8b\n" + "smull v15.8h, v0.8b, %[b3].8b\n" + "ldr %q[b0a], [%[b_ptr], #64]\n" + "smlal2 v12.8h, v0.16b, %[b0].16b\n" + "smlal2 v13.8h, v0.16b, %[b1].16b\n" + "ldr %q[b1a], [%[b_ptr], #80]\n" + "smlal2 v14.8h, v0.16b, %[b2].16b\n" + "smlal2 v15.8h, v0.16b, %[b3].16b\n" + "ldr q0, [%[a_ptr], #64]\n" + + "sadalp v16.4s, v12.8h\n" + "smull v12.8h, v1.8b, %[b0].8b\n" + "sadalp v17.4s, v13.8h\n" + "sadalp v18.4s, v14.8h\n" + "smull v13.8h, v1.8b, %[b1].8b\n" + "sadalp v19.4s, v15.8h\n" + "smull v14.8h, v1.8b, %[b2].8b\n" + "ldr %q[b2a], [%[b_ptr], #96]\n" + "smull v15.8h, v1.8b, %[b3].8b\n" + "smlal2 v12.8h, v1.16b, %[b0].16b\n" + "ldr %q[b3a], [%[b_ptr], #112]\n" + "smlal2 v13.8h, v1.16b, %[b1].16b\n" + "add %[b_ptr], %[b_ptr], #128\n" + "smlal2 v14.8h, v1.16b, %[b2].16b\n" + "smlal2 v15.8h, v1.16b, %[b3].16b\n" + "ldr q1, [%[a_ptr], #80]\n" + + "sadalp v20.4s, v12.8h\n" + "smull v12.8h, v2.8b, %[b0].8b\n" + "sadalp v21.4s, v13.8h\n" + "sadalp v22.4s, v14.8h\n" + "smull v13.8h, v2.8b, %[b1].8b\n" + "sadalp v23.4s, v15.8h\n" + "smull v14.8h, v2.8b, %[b2].8b\n" + "smull v15.8h, v2.8b, %[b3].8b\n" + "smlal2 v12.8h, v2.16b, %[b0].16b\n" + "smlal2 v13.8h, v2.16b, %[b1].16b\n" + "smlal2 v14.8h, v2.16b, %[b2].16b\n" + "smlal2 v15.8h, v2.16b, %[b3].16b\n" + "ldr q2, [%[a_ptr], #96]\n" + + "sadalp v24.4s, v12.8h\n" + "smull v12.8h, v3.8b, %[b0].8b\n" + "sadalp v25.4s, v13.8h\n" + "sadalp v26.4s, v14.8h\n" + "smull v13.8h, v3.8b, %[b1].8b\n" + "sadalp v27.4s, v15.8h\n" + "smull v14.8h, v3.8b, %[b2].8b\n" + "smull v15.8h, v3.8b, %[b3].8b\n" + "smlal2 v12.8h, v3.16b, %[b0].16b\n" + "smlal2 v13.8h, v3.16b, %[b1].16b\n" + "smlal2 v14.8h, v3.16b, %[b2].16b\n" + "smlal2 v15.8h, v3.16b, %[b3].16b\n" + "ldr q3, [%[a_ptr], #112]\n" + + // Unroll 1 + "sadalp v28.4s, v12.8h\n" + "smull v12.8h, v0.8b, %[b0a].8b\n" + "sadalp v29.4s, v13.8h\n" + "sadalp v30.4s, v14.8h\n" + "smull v13.8h, v0.8b, %[b1a].8b\n" + "sadalp v31.4s, v15.8h\n" + "smull v14.8h, v0.8b, %[b2a].8b\n" + "add %[a_ptr], %[a_ptr], #128\n" + "smull v15.8h, v0.8b, %[b3a].8b\n" + "smlal2 v12.8h, v0.16b, %[b0a].16b\n" + "smlal2 v13.8h, v0.16b, %[b1a].16b\n" + "smlal2 v14.8h, v0.16b, %[b2a].16b\n" + "smlal2 v15.8h, v0.16b, %[b3a].16b\n" + + "sadalp v16.4s, v12.8h\n" + "smull v12.8h, v1.8b, %[b0a].8b\n" + "sadalp v17.4s, v13.8h\n" + "sadalp v18.4s, v14.8h\n" + "smull v13.8h, v1.8b, %[b1a].8b\n" + "sadalp v19.4s, v15.8h\n" + "smull v14.8h, v1.8b, %[b2a].8b\n" + "smull v15.8h, v1.8b, %[b3a].8b\n" + "smlal2 v12.8h, v1.16b, %[b0a].16b\n" + "addp v16.4s, v16.4s, v17.4s\n" + "smlal2 v13.8h, v1.16b, %[b1a].16b\n" + "addp v17.4s, v18.4s, v19.4s\n" + "smlal2 v14.8h, v1.16b, 
%[b2a].16b\n" + "smlal2 v15.8h, v1.16b, %[b3a].16b\n" + + "sadalp v20.4s, v12.8h\n" + "smull v12.8h, v2.8b, %[b0a].8b\n" + "sadalp v21.4s, v13.8h\n" + "sadalp v22.4s, v14.8h\n" + "smull v13.8h, v2.8b, %[b1a].8b\n" + "sadalp v23.4s, v15.8h\n" + "addp v16.4s, v16.4s, v17.4s\n" + "smull v14.8h, v2.8b, %[b2a].8b\n" + "addp v18.4s, v20.4s, v21.4s\n" + "addp v19.4s, v22.4s, v23.4s\n" + "smull v15.8h, v2.8b, %[b3a].8b\n" + "smlal2 v12.8h, v2.16b, %[b0a].16b\n" + "str q16, [%[c_ptr]]\n" + "smlal2 v13.8h, v2.16b, %[b1a].16b\n" + "smlal2 v14.8h, v2.16b, %[b2a].16b\n" + "smlal2 v15.8h, v2.16b, %[b3a].16b\n" + + "sadalp v24.4s, v12.8h\n" + "smull v12.8h, v3.8b, %[b0a].8b\n" + "sadalp v25.4s, v13.8h\n" + "sadalp v26.4s, v14.8h\n" + "smull v13.8h, v3.8b, %[b1a].8b\n" + "sadalp v27.4s, v15.8h\n" + "addp v17.4s, v18.4s, v19.4s\n" + "smull v14.8h, v3.8b, %[b2a].8b\n" + "addp v20.4s, v24.4s, v25.4s\n" + "addp v21.4s, v26.4s, v27.4s\n" + "smull v15.8h, v3.8b, %[b3a].8b\n" + "smlal2 v12.8h, v3.16b, %[b0a].16b\n" + "str q17, [%[c_ptr], #16]\n" + "smlal2 v13.8h, v3.16b, %[b1a].16b\n" + "smlal2 v14.8h, v3.16b, %[b2a].16b\n" + "addp v18.4s, v20.4s, v21.4s\n" + "smlal2 v15.8h, v3.16b, %[b3a].16b\n" + "b 3f\n" + + // Detached final iteration (odd K) + "2:\n" + "smull v14.8h, v0.8b, %[b2].8b\n" + "add %[a_ptr], %[a_ptr], #64\n" + "smull v15.8h, v0.8b, %[b3].8b\n" + "add %[b_ptr], %[b_ptr], #64\n" + "smlal2 v12.8h, v0.16b, %[b0].16b\n" + "smlal2 v13.8h, v0.16b, %[b1].16b\n" + "smlal2 v14.8h, v0.16b, %[b2].16b\n" + "smlal2 v15.8h, v0.16b, %[b3].16b\n" + + "sadalp v16.4s, v12.8h\n" + "smull v12.8h, v1.8b, %[b0].8b\n" + "sadalp v17.4s, v13.8h\n" + "sadalp v18.4s, v14.8h\n" + "smull v13.8h, v1.8b, %[b1].8b\n" + "sadalp v19.4s, v15.8h\n" + "smull v14.8h, v1.8b, %[b2].8b\n" + "smull v15.8h, v1.8b, %[b3].8b\n" + "smlal2 v12.8h, v1.16b, %[b0].16b\n" + "addp v16.4s, v16.4s, v17.4s\n" + "smlal2 v13.8h, v1.16b, %[b1].16b\n" + "addp v17.4s, v18.4s, v19.4s\n" + "smlal2 v14.8h, v1.16b, %[b2].16b\n" + "smlal2 v15.8h, v1.16b, %[b3].16b\n" + + "sadalp v20.4s, v12.8h\n" + "smull v12.8h, v2.8b, %[b0].8b\n" + "sadalp v21.4s, v13.8h\n" + "sadalp v22.4s, v14.8h\n" + "smull v13.8h, v2.8b, %[b1].8b\n" + "sadalp v23.4s, v15.8h\n" + "addp v16.4s, v16.4s, v17.4s\n" + "smull v14.8h, v2.8b, %[b2].8b\n" + "addp v18.4s, v20.4s, v21.4s\n" + "addp v19.4s, v22.4s, v23.4s\n" + "smull v15.8h, v2.8b, %[b3].8b\n" + "smlal2 v12.8h, v2.16b, %[b0].16b\n" + "str q16, [%[c_ptr]]\n" + "smlal2 v13.8h, v2.16b, %[b1].16b\n" + "smlal2 v14.8h, v2.16b, %[b2].16b\n" + "smlal2 v15.8h, v2.16b, %[b3].16b\n" + + "sadalp v24.4s, v12.8h\n" + "smull v12.8h, v3.8b, %[b0].8b\n" + "sadalp v25.4s, v13.8h\n" + "sadalp v26.4s, v14.8h\n" + "smull v13.8h, v3.8b, %[b1].8b\n" + "sadalp v27.4s, v15.8h\n" + "addp v17.4s, v18.4s, v19.4s\n" + "smull v14.8h, v3.8b, %[b2].8b\n" + "addp v20.4s, v24.4s, v25.4s\n" + "addp v21.4s, v26.4s, v27.4s\n" + "smull v15.8h, v3.8b, %[b3].8b\n" + "smlal2 v12.8h, v3.16b, %[b0].16b\n" + "str q17, [%[c_ptr], #16]\n" + "smlal2 v13.8h, v3.16b, %[b1].16b\n" + "smlal2 v14.8h, v3.16b, %[b2].16b\n" + "addp v18.4s, v20.4s, v21.4s\n" + "smlal2 v15.8h, v3.16b, %[b3].16b\n" + + "3:\n" + + // Final additions + "sadalp v28.4s, v12.8h\n" + "str q18, [%[c_ptr], #32]\n" + "sadalp v29.4s, v13.8h\n" + "sadalp v30.4s, v14.8h\n" + "sadalp v31.4s, v15.8h\n" + + // Horizontal reduction, phase 1 + "addp v22.4s, v28.4s, v29.4s\n" + "addp v23.4s, v30.4s, v31.4s\n" + + // Horizontal reduction, phase 2 + "addp v19.4s, v22.4s, v23.4s\n" + "str q19, [%[c_ptr], #48]\n" + "add %[c_ptr], 
%[c_ptr], #64\n" + + : + [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), + [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [b3] "+w"(b3), + [b0a] "+w"(b0a), [b1a] "+w"(b1a), [b2a] "+w"(b2a), [b3a] "+w"(b3a), + [k] "+r"(k) + : [oddk] "r"(oddk) + : "x20", "x21", "v0", "v1", "v2", "v3", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"); + } + } +} + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp new file mode 100644 index 0000000000..39757326f4 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +namespace arm_gemm +{ +// Actual kernel implementations +void a64_gemm_u16_asimd_12x8(const uint16_t *, const uint16_t *, uint32_t *, int, int, int); + +// 12x8 SGEMM "strategy" class. +// +// This describes the characteristics of a family of kernels, in terms of +// the required interleave properties and the output block size. +// +// All kernels in the family must share these characteristics. The actual +// kernel to be used can be chosen at runtime, based on the CPU_type +// structure. 
+class gemm_u16_12x8 +{ +public: + typedef uint16_t operand_type; + typedef uint32_t result_type; + + typedef void (*kern_type)(const uint16_t *, const uint16_t *, uint32_t *, int, int, int); + + /* Describes the data layout for A input */ + static const int A_interleave = 8; + static const int A_block = 1; + static const int A_transpose = 0; + + /* Same for B input */ + static const int B_interleave = 12; + static const int B_block = 1; + static const int B_transpose = 1; + + /* Kernel blocking parameters */ + static const int out_width = 12; + static const int out_height = 8; + static const int k_unroll = 1; + + kern_type kernel = a64_gemm_u16_asimd_12x8; + + gemm_u16_12x8(const CPUInfo *ci) + { + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp new file mode 100644 index 0000000000..7903878301 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp @@ -0,0 +1,309 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include + +#include "../../asmlib.hpp" + +namespace arm_gemm +{ +void a64_gemm_u16_asimd_12x8(const uint16_t *Apanel, const uint16_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) +{ + const uint16_t *a_ptr = Apanel; + uint32_t *c_ptr = Cpanel; + + for(int yb = 0; yb < ablocks; yb++) + { + const uint16_t *a_ptr0 = a_ptr; + const uint16_t *b_ptr = Bpanel; + + for(int xb = 0; xb < bblocks; xb++) + { + a_ptr = a_ptr0; + const bool odd_k = K & 0x1; + int k = (K + 1) / 2 - 1; + + register uint16x8_t aa asm("v0"); + register uint16x8_t ab asm("v1"); + register uint16x8_t b0 asm("v2"); + register uint16x8_t b1 asm("v3"); + register uint16x8_t b2 asm("v4"); + + __asm __volatile( + "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower + "movi v5.4s, #0\n" + "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper + "movi v6.4s, #0\n" + "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower + "ins %[aa].d[1], x20\n" // Merge A[A].lower and upper + "movi v7.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #64]") + "movi v8.4s, #0\n" + "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper + "movi v9.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #64]") + "movi v10.4s, #0\n" + "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower + "ins %[b0].d[1], x20\n" // Merge B[0].lower and upper + "movi v11.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #96]") + "movi v12.4s, #0\n" + "movi v13.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #96]") + "movi v14.4s, #0\n" + "movi v15.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #128]") + "movi v16.4s, #0\n" + "movi v17.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #128]") + "movi v18.4s, #0\n" + "movi v19.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #160]") + "movi v20.4s, #0\n" + "movi v21.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #160]") + "movi v22.4s, #0\n" + "movi v23.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #192]") + "movi v24.4s, #0\n" + "add %x[a_ptr], %x[a_ptr], #0x10\n" + "movi v25.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #192]") + "movi v26.4s, #0\n" + "add %x[b_ptr], %x[b_ptr], #0x18\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + + "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations. 
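// --- Editor's note, not part of the patch: a small check of the iteration-count
// arithmetic used just above (odd_k and k). Each trip through the main loop consumes
// two K steps and the last one or two steps always go through a detached tail, so the
// count is rounded up and then reduced by one. check_k_split is an illustrative name.
#include <cassert>

static void check_k_split(int K) // assumes K >= 1
{
    const bool odd_k = K & 0x1;
    const int  k     = (K + 1) / 2 - 1; // main-loop iterations actually executed

    // 2*k steps in the loop, plus a 1-step (odd) or 2-step (even) tail.
    assert(2 * k + (odd_k ? 1 : 2) == K);
}
// e.g. K=1 -> k=0, odd tail; K=2 -> k=0, even tail; K=7 -> k=3 iterations + odd tail.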
+ + "1:\n" // Main loop + // First unroll + "smlal v5.4s, %[b0].4h, %[aa].h[0]\n" + "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper + "umlal v6.4s, %[b0].4h, %[aa].h[1]\n" + "umlal v7.4s, %[b0].4h, %[aa].h[2]\n" + "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower + "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper + "umlal v8.4s, %[b0].4h, %[aa].h[3]\n" + "umlal v9.4s, %[b0].4h, %[aa].h[4]\n" + "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper + "umlal v10.4s, %[b0].4h, %[aa].h[5]\n" + "umlal v11.4s, %[b0].4h, %[aa].h[6]\n" + "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower + "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper + "umlal v12.4s, %[b0].4h, %[aa].h[7]\n" + "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" + "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper + "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" + "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" + "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" + "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" + "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" + "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" + "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" + "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower + "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper + "umlal v21.4s, %[b1].4h, %[aa].h[0]\n" + "umlal v22.4s, %[b1].4h, %[aa].h[1]\n" + "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper + "umlal v23.4s, %[b1].4h, %[aa].h[2]\n" + "umlal v24.4s, %[b1].4h, %[aa].h[3]\n" + "umlal v25.4s, %[b1].4h, %[aa].h[4]\n" + "umlal v26.4s, %[b1].4h, %[aa].h[5]\n" + "umlal v27.4s, %[b1].4h, %[aa].h[6]\n" + "umlal v28.4s, %[b1].4h, %[aa].h[7]\n" + + // Second unroll + "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n" + "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower + "ins %[b0].d[1], x20\n" // Merge B[0].lower and .upper + "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n" + "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n" + "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper + "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n" + "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n" + "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n" + "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n" + "add %x[a_ptr], %x[a_ptr], #0x20\n" + "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n" + "umlal v13.4s, %[b2].4h, %[ab].h[0]\n" ASM_PREFETCH("[%[b_ptr], #320]") + "umlal v14.4s, %[b2].4h, %[ab].h[1]\n" + "umlal v15.4s, %[b2].4h, %[ab].h[2]\n" ASM_PREFETCH("[%[a_ptr], #320]") + "umlal v16.4s, %[b2].4h, %[ab].h[3]\n" + "umlal v17.4s, %[b2].4h, %[ab].h[4]\n" ASM_PREFETCH("[%[b_ptr], #448]") + "umlal v18.4s, %[b2].4h, %[ab].h[5]\n" + "umlal v19.4s, %[b2].4h, %[ab].h[6]\n" + "umlal v20.4s, %[b2].4h, %[ab].h[7]\n" + "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n" + "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n" + "subs %x[k], %x[k], #0x1\n" + "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n" + "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n" + "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower + "ins %[aa].d[1], x20\n" // Merge A[A].lower and .upper + "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n" + "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n" + "add %x[b_ptr], %x[b_ptr], #0x30\n" + "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n" + "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n" + "bne 1b\n" + + "2:\n" // Even tail + "cbnz %x[odd_k], 3f\n" + + "umlal v5.4s, %[b0].4h, %[aa].h[0]\n" + "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper + "umlal v6.4s, %[b0].4h, %[aa].h[1]\n" + "umlal v7.4s, %[b0].4h, %[aa].h[2]\n" + "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower + "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper + "umlal v8.4s, %[b0].4h, %[aa].h[3]\n" + "umlal v9.4s, %[b0].4h, %[aa].h[4]\n" + "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper + "umlal v10.4s, 
%[b0].4h, %[aa].h[5]\n" + "umlal v11.4s, %[b0].4h, %[aa].h[6]\n" + "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower + "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper + "umlal v12.4s, %[b0].4h, %[aa].h[7]\n" + "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" + "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper + "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" + "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" + "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" + "add %[a_ptr], %[a_ptr], #0x10\n" + "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" + "add %[b_ptr], %[b_ptr], #0x18\n" + "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" + "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" + "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" + "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper + "umlal v21.4s, %[b1].4h, %[aa].h[0]\n" + "umlal v22.4s, %[b1].4h, %[aa].h[1]\n" + "umlal v23.4s, %[b1].4h, %[aa].h[2]\n" + "umlal v24.4s, %[b1].4h, %[aa].h[3]\n" + "umlal v25.4s, %[b1].4h, %[aa].h[4]\n" + "umlal v26.4s, %[b1].4h, %[aa].h[5]\n" + "umlal v27.4s, %[b1].4h, %[aa].h[6]\n" + "umlal v28.4s, %[b1].4h, %[aa].h[7]\n" + + "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n" + "umlal v13.4s, %[b2].4h, %[ab].h[0]\n" + "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n" + "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n" + "umlal v14.4s, %[b2].4h, %[ab].h[1]\n" + "str q5, [%x[c_ptr]]\n" + "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n" + "str q13, [%x[c_ptr], #0x10]\n" + "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n" + "str q21, [%x[c_ptr], #0x20]\n" + "umlal v15.4s, %[b2].4h, %[ab].h[2]\n" + "str q6, [%x[c_ptr], #0x30]\n" + "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n" + "str q14, [%x[c_ptr], #0x40]\n" + "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n" + "str q22, [%x[c_ptr], #0x50]\n" + "umlal v16.4s, %[b2].4h, %[ab].h[3]\n" + "str q7, [%x[c_ptr], #0x60]\n" + "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n" + "str q15, [%x[c_ptr], #0x70]\n" + "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n" + "str q23, [%x[c_ptr], #0x80]\n" + "umlal v17.4s, %[b2].4h, %[ab].h[4]\n" + "str q8, [%x[c_ptr], #0x90]\n" + "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n" + "str q16, [%x[c_ptr], #0xa0]\n" + "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n" + "str q24, [%x[c_ptr], #0xb0]\n" + "umlal v18.4s, %[b2].4h, %[ab].h[5]\n" + "str q9, [%x[c_ptr], #0xc0]\n" + "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n" + "str q17, [%x[c_ptr], #0xd0]\n" + "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n" + "str q25, [%x[c_ptr], #0xe0]\n" + "umlal v19.4s, %[b2].4h, %[ab].h[6]\n" + "str q10, [%x[c_ptr], #0xf0]\n" + "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n" + "str q18, [%x[c_ptr], #0x100]\n" + "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n" + "str q26, [%x[c_ptr], #0x110]\n" + "umlal v20.4s, %[b2].4h, %[ab].h[7]\n" + "str q11, [%x[c_ptr], #0x120]\n" + "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n" + "str q19, [%x[c_ptr], #0x130]\n" + "b 4f\n" // Complete write out + + "3:\n" // Odd tail + "umlal v5.4s, %[b0].4h, %[aa].h[0]\n" + "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" + "umlal v21.4s, %[b1].4h, %[aa].h[0]\n" + "umlal v6.4s, %[b0].4h, %[aa].h[1]\n" + "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" + "umlal v22.4s, %[b1].4h, %[aa].h[1]\n" + "str q5, [%x[c_ptr]]\n" + "umlal v7.4s, %[b0].4h, %[aa].h[2]\n" + "str q13, [%x[c_ptr], #0x10]\n" + "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" + "str q21, [%x[c_ptr], #0x20]\n" + "umlal v23.4s, %[b1].4h, %[aa].h[2]\n" + "str q6, [%x[c_ptr], #0x30]\n" + "umlal v8.4s, %[b0].4h, %[aa].h[3]\n" + "str q14, [%x[c_ptr], #0x40]\n" + "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" + "str q22, [%x[c_ptr], #0x50]\n" + "umlal v24.4s, %[b1].4h, %[aa].h[3]\n" + "str q7, [%x[c_ptr], #0x60]\n" + "umlal v9.4s, %[b0].4h, %[aa].h[4]\n" + 
"str q15, [%x[c_ptr], #0x70]\n" + "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" + "str q23, [%x[c_ptr], #0x80]\n" + "umlal v25.4s, %[b1].4h, %[aa].h[4]\n" + "str q8, [%x[c_ptr], #0x90]\n" + "umlal v10.4s, %[b0].4h, %[aa].h[5]\n" + "str q16, [%x[c_ptr], #0xa0]\n" + "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" + "str q24, [%x[c_ptr], #0xb0]\n" + "umlal v26.4s, %[b1].4h, %[aa].h[5]\n" + "str q9, [%x[c_ptr], #0xc0]\n" + "umlal v11.4s, %[b0].4h, %[aa].h[6]\n" + "str q17, [%x[c_ptr], #0xd0]\n" + "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" + "str q25, [%x[c_ptr], #0xe0]\n" + "umlal v27.4s, %[b1].4h, %[aa].h[6]\n" + "str q10, [%x[c_ptr], #0xf0]\n" + "umlal v12.4s, %[b0].4h, %[aa].h[7]\n" + "str q18, [%x[c_ptr], #0x100]\n" + "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" + "str q26, [%x[c_ptr], #0x110]\n" + "umlal v28.4s, %[b1].4h, %[aa].h[7]\n" + "str q11, [%x[c_ptr], #0x120]\n" + + "4:\n" // End of function + "str q19, [%x[c_ptr], #0x130]\n" + "str q27, [%x[c_ptr], #0x140]\n" + "str q12, [%x[c_ptr], #0x150]\n" + "str q20, [%x[c_ptr], #0x160]\n" + "str q28, [%x[c_ptr], #0x170]\n" + "add %x[c_ptr], %x[c_ptr], #0x180\n" + : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k), + [aa] "+w"(aa), [ab] "+w"(ab), [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2) + : [odd_k] "r"(odd_k) + : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc"); + } + } +} + +} // namespace arm_gemm + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp new file mode 100644 index 0000000000..26255b14bf --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#pragma once + +#ifdef __aarch64__ + +#include "arm_gemm.hpp" + +namespace arm_gemm +{ +// Load the actual kernel +void a64_gemm_u8_12x8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); +void a64_gemm_u8_12x8_a55r1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); + +class gemm_u8_12x8 +{ +public: + typedef uint8_t operand_type; + typedef uint32_t result_type; + + typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); + + /* Describes the data layout for A input */ + static const int A_interleave = 8; + static const int A_block = 4; + static const bool A_transpose = false; + + /* Same for B input */ + static const int B_interleave = 12; + static const int B_block = 4; + static const bool B_transpose = true; + + /* Kernel blocking parameters */ + static const int out_width = 12; + static const int out_height = 8; + static const int k_unroll = 4; + + kern_type kernel = a64_gemm_u8_12x8; + + gemm_u8_12x8(const CPUInfo *ci) + { + if(ci->get_cpu_model() == CPUModel::A55r1) + { + kernel = a64_gemm_u8_12x8_a55r1; + } + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp new file mode 100644 index 0000000000..f8fafbdf84 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp @@ -0,0 +1,356 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __aarch64__ + +#include + +#include "../../asmlib.hpp" + +#ifdef NO_DOT_IN_TOOLCHAIN +#include "dot_toolchain_support.h" +#endif + +namespace arm_gemm +{ +void a64_gemm_u8_12x8_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, const int ablocks, const int bblocks, const int K) +{ + const uint8_t *a_ptr = Apanel; + uint32_t *c_ptr = Cpanel; + + // We divide K by 4 because the udot instruction processes 4 elements at a time. + const int W = K / 4; + + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. 
+ const int oddk = (W & 1); + const int k_iters = ((W + 1) / 2) - 1; + + for(int yb = 0; yb < ablocks; yb++) + { + const uint8_t *a_ptr0 = a_ptr; + const uint8_t *b_ptr = Bpanel; + + for(int xb = 0; xb < bblocks; xb++) + { + a_ptr = a_ptr0; + int k = k_iters; + + register int32x4_t a0 asm("v0"); + register int32x4_t a1 asm("v1"); + register int32x4_t b0 asm("v2"); + register int32x4_t b1 asm("v3"); + register int32x4_t b2 asm("v4"); + register int32x4_t a0a asm("v5"); + register int32x4_t a1a asm("v6"); + + __asm __volatile( +#ifdef NO_DOT_IN_TOOLCHAIN + _DECLARE_UDOT +#else + ".arch armv8.2-a+dotprod\n" +#endif + // Initialize result registers, load initial operands, prime prefetches. + "movi v8.4s, #0x0\n" + "ldr %q[a0], [%[a_ptr]]\n" + "movi v9.4s, #0x0\n" + "ldr %q[b0], [%[b_ptr]]\n" + "movi v10.4s, #0x0\n" + "ldr %q[a1], [%[a_ptr], #16]\n" + "movi v11.4s, #0x0\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + "movi v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v15.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #128]") "movi v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]") + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #384]") + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #448]") + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #384]") + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #512]") + + // The loop is offset by these two instructions which must + // always be executed. + "udot v8.4s , %[b0].16b, %[a0].4b[0]\n" + "ldr %d[b2], [%[b_ptr], #32]\n" + + // Skip loop if we are doing zero iterations of it. 
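// --- Editor's note, not part of the patch: the shape of the A55r1 loop that follows.
// The first udot and the first 64-bit B load of an iteration are issued ahead of the
// loop (and re-issued at the bottom of each trip), and the 128-bit loads are split into
// "ldr d / ldr x20 / ins .d[1]" pairs so the integer and vector issue slots are both
// kept busy on the in-order core. A structural sketch only; the helper names below are
// hypothetical stubs standing in for the instruction groups named in the comments.
static inline void prime_first_udot_and_b_load() {} // "udot v8 ..." + "ldr %d[b2] ..." before "cbz %w[k], 4f"
static inline void unrolled_body_minus_prime() {}   // unrolls 0 and 1, udots interleaved with split loads
static inline void detached_tail(bool /*oddk*/) {}  // even or odd final iteration chosen at label 4

static void a55r1_loop_shape(int k, bool oddk)
{
    prime_first_udot_and_b_load();
    while(k-- > 0)
    {
        unrolled_body_minus_prime();
        prime_first_udot_and_b_load(); // re-primed for the next trip or for the tail
    }
    detached_tail(oddk);
}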
+ "cbz %w[k], 4f\n" + + "1:\n" + "udot v9.4s , %[b0].16b, %[a0].4b[1]\n" + "ldr x20, [%[b_ptr], #40]\n" + "udot v10.4s, %[b0].16b, %[a0].4b[2]\n" + "subs %w[k], %w[k], #1\n" + "udot v11.4s, %[b0].16b, %[a0].4b[3]\n" + "ldr %d[a0a], [%[a_ptr], #32]\n" + + "udot v12.4s, %[b0].16b, %[a1].4b[0]\n" + "ins %[b2].d[1], x20\n" + "udot v13.4s, %[b0].16b, %[a1].4b[1]\n" + "ldr x20, [%[a_ptr], #40]\n" + "udot v14.4s, %[b0].16b, %[a1].4b[2]\n" + "udot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "ldr %d[a1a], [%[a_ptr], #48]\n" + + "udot v16.4s, %[b1].16b, %[a0].4b[0]\n" + "ins %[a0a].d[1], x20\n" + "udot v17.4s, %[b1].16b, %[a0].4b[1]\n" + "ldr x20, [%[a_ptr], #56]\n" + "udot v18.4s, %[b1].16b, %[a0].4b[2]\n" + "udot v19.4s, %[b1].16b, %[a0].4b[3]\n" + "ldr %d[b0], [%[b_ptr], #48]\n" + + "udot v20.4s, %[b1].16b, %[a1].4b[0]\n" + "ins %[a1a].d[1], x20\n" + "udot v21.4s, %[b1].16b, %[a1].4b[1]\n" + "ldr x20, [%[b_ptr], #56]\n" + "udot v22.4s, %[b1].16b, %[a1].4b[2]\n" + "udot v23.4s, %[b1].16b, %[a1].4b[3]\n" + "ldr %d[b1], [%[b_ptr], #64]\n" + + "udot v24.4s, %[b2].16b, %[a0].4b[0]\n" + "ins %[b0].d[1], x20\n" + "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" + "ldr x20, [%[b_ptr], #72]\n" + "udot v26.4s, %[b2].16b, %[a0].4b[2]\n" + "udot v27.4s, %[b2].16b, %[a0].4b[3]\n" ASM_PREFETCH("[%[a_ptr], #448]") + + "udot v28.4s, %[b2].16b, %[a1].4b[0]\n" + "udot v29.4s, %[b2].16b, %[a1].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #576]") + "udot v30.4s, %[b2].16b, %[a1].4b[2]\n" + "udot v31.4s, %[b2].16b, %[a1].4b[3]\n" + + // Unroll 1 + "ldr %d[b2], [%[b_ptr], #80]\n" + + "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n" + "ins %[b1].d[1], x20\n" + "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n" + "ldr x20, [%[b_ptr], #88]\n" + "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n" + "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n" + "ldr %d[a0], [%[a_ptr], #64]\n" + + "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n" + "ins %[b2].d[1], x20\n" + "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n" + "ldr x20, [%[a_ptr], #72]\n" + "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n" + "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n" + "ldr %d[a1], [%[a_ptr], #80]\n" + + "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n" + "ins %[a0].d[1], x20\n" + "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n" + "ldr x20, [%[a_ptr], #88]\n" + "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n" + "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n" + "ldr %d[b0], [%[b_ptr], #96]\n" + + "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n" + "ins %[a1].d[1], x20\n" + "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n" + "ldr x20, [%[b_ptr], #104]\n" + "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n" + "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n" + "ldr %d[b1], [%[b_ptr], #112]\n" + + "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n" + "ins %[b0].d[1], x20\n" + "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n" + "ldr x20, [%[b_ptr], #120]\n" + "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n" + "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n" + "add %[a_ptr], %[a_ptr], #64\n" + + "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n" ASM_PREFETCH("[%[b_ptr], #640]") + "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n" + "ins %[b1].d[1], x20\n" + "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n" + "ldr %d[b2], [%[b_ptr], #32]\n" + + "udot v8.4s , %[b0].16b, %[a0].4b[0]\n" + "b.ne 1b\n" + + // Branch here if K=1 or 2. Do the right thing for odd/even at the end. + "4:\n" + + // Start final iteration - branch off to "odd" code before we load a0a. 
+ "udot v9.4s , %[b0].16b, %[a0].4b[1]\n" + "ldr x20, [%[b_ptr], #40]\n" + "udot v10.4s, %[b0].16b, %[a0].4b[2]\n" + "cbnz %w[oddk], 2f\n" + + // Even K continuation + "udot v11.4s, %[b0].16b, %[a0].4b[3]\n" + "ldr %d[a0a], [%[a_ptr], #32]\n" + + "udot v12.4s, %[b0].16b, %[a1].4b[0]\n" + "ins %[b2].d[1], x20\n" + "udot v13.4s, %[b0].16b, %[a1].4b[1]\n" + "ldr x20, [%[a_ptr], #40]\n" + "udot v14.4s, %[b0].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr]]") + "udot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "ldr %d[a1a], [%[a_ptr], #48]\n" + + "udot v16.4s, %[b1].16b, %[a0].4b[0]\n" + "ins %[a0a].d[1], x20\n" + "udot v17.4s, %[b1].16b, %[a0].4b[1]\n" + "ldr x20, [%[a_ptr], #56]\n" + "udot v18.4s, %[b1].16b, %[a0].4b[2]\n" + "udot v19.4s, %[b1].16b, %[a0].4b[3]\n" + "ldr %d[b0], [%[b_ptr], #48]\n" + + "udot v20.4s, %[b1].16b, %[a1].4b[0]\n" + "ins %[a1a].d[1], x20\n" + "udot v21.4s, %[b1].16b, %[a1].4b[1]\n" + "ldr x20, [%[b_ptr], #56]\n" + "udot v22.4s, %[b1].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #64]") + "udot v23.4s, %[b1].16b, %[a1].4b[3]\n" + + "udot v24.4s, %[b2].16b, %[a0].4b[0]\n" + "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #128]") + "udot v26.4s, %[b2].16b, %[a0].4b[2]\n" + "udot v27.4s, %[b2].16b, %[a0].4b[3]\n" + "ldr %d[b1], [%[b_ptr], #64]\n" + + "udot v28.4s, %[b2].16b, %[a1].4b[0]\n" + "ins %[b0].d[1], x20\n" + "udot v29.4s, %[b2].16b, %[a1].4b[1]\n" + "ldr x20, [%[b_ptr], #72]\n" + "udot v30.4s, %[b2].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #192]") + "udot v31.4s, %[b2].16b, %[a1].4b[3]\n" + "ldr %d[b2], [%[b_ptr], #80]\n" + + "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n" + "ins %[b1].d[1], x20\n" + "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n" + "ldr x20, [%[b_ptr], #88]\n" + "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n" + "ins %[b2].d[1], x20\n" + + "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]") + "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n" + "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n" + "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #320]") + "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n" + "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]") + "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n" + "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]") + "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n" + "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n" + "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]") + "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n" + "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]") + "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n" + "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n" + "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #640]") + "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n" + "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]") + "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n" + "add %[a_ptr], %[a_ptr], #64\n" + "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n" + "b 3f\n" + + // Odd K continuation + "2:\n" + "udot v11.4s, %[b0].16b, %[a0].4b[3]\n" ASM_PREFETCHW("[%[c_ptr]]") + "udot v12.4s, %[b0].16b, %[a1].4b[0]\n" + "ins %[b2].d[1], x20\n" + "udot v13.4s, %[b0].16b, %[a1].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]") + "udot v14.4s, %[b0].16b, %[a1].4b[2]\n" + "add %[a_ptr], %[a_ptr], #32\n" + "udot v15.4s, %[b0].16b, %[a1].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]") + "udot v16.4s, %[b1].16b, %[a0].4b[0]\n" + "add %[b_ptr], %[b_ptr], #48\n" + "udot 
v17.4s, %[b1].16b, %[a0].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]") + "udot v18.4s, %[b1].16b, %[a0].4b[2]\n" + "udot v19.4s, %[b1].16b, %[a0].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]") + "udot v20.4s, %[b1].16b, %[a1].4b[0]\n" + "udot v21.4s, %[b1].16b, %[a1].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]") + "udot v22.4s, %[b1].16b, %[a1].4b[2]\n" + "udot v23.4s, %[b1].16b, %[a1].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]") + "udot v24.4s, %[b2].16b, %[a0].4b[0]\n" + "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]") + "udot v26.4s, %[b2].16b, %[a0].4b[2]\n" + "udot v27.4s, %[b2].16b, %[a0].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]") "udot v28.4s, %[b2].16b, %[a1].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]") "udot v29.4s, %[b2].16b, %[a1].4b[1]\n" + ASM_PREFETCHWL2("[%[c_ptr], #640]") "udot v30.4s, %[b2].16b, %[a1].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]") + "udot v31.4s, %[b2].16b, %[a1].4b[3]\n" + + // Common tail + "3:\n" + "str q8, [%[c_ptr]]\n" + "str q16, [%[c_ptr], #16]\n" + "str q24, [%[c_ptr], #32]\n" + "str q9, [%[c_ptr], #48]\n" + "str q17, [%[c_ptr], #64]\n" + "str q25, [%[c_ptr], #80]\n" + "str q10, [%[c_ptr], #96]\n" + "str q18, [%[c_ptr], #112]\n" + "str q26, [%[c_ptr], #128]\n" + "str q11, [%[c_ptr], #144]\n" + "str q19, [%[c_ptr], #160]\n" + "str q27, [%[c_ptr], #176]\n" + "str q12, [%[c_ptr], #192]\n" + "str q20, [%[c_ptr], #208]\n" + "str q28, [%[c_ptr], #224]\n" + "str q13, [%[c_ptr], #240]\n" + "str q21, [%[c_ptr], #256]\n" + "str q29, [%[c_ptr], #272]\n" + "str q14, [%[c_ptr], #288]\n" + "str q22, [%[c_ptr], #304]\n" + "str q30, [%[c_ptr], #320]\n" + "str q15, [%[c_ptr], #336]\n" + "str q23, [%[c_ptr], #352]\n" + "str q31, [%[c_ptr], #368]\n" + "add %[c_ptr], %[c_ptr], #384\n" + +#ifdef NO_DOT_IN_TOOLCHAIN + ".purgem udot\n" +#endif + : + [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), + [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a), + [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k) + : [oddk] "r"(oddk) + : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"); + } + } +} + +} // namespace arm_gemm + +#endif \ No newline at end of file diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h new file mode 100644 index 0000000000..5ee273bd74 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +// Define a macro to assemble the UDOT instruction (in the absence of toolchain support) +#define _DECLARE_UDOT \ + ".altmacro\n" \ + ".macro udot opd:req, opn:req, opm:req\n" \ + "local vd, vn, vm, h, l\n" \ + ".irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n" \ + ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n" \ + ".set vd,\\reg\n" \ + ".endif\n" \ + ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n" \ + ".set vn,\\reg\n" \ + ".endif\n" \ + ".irp idx,0,1,2,3\n" \ + ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n" \ + ".set vm,\\reg\n" \ + ".set h,\\idx / 2\n" \ + ".set l,\\idx %% 2\n" \ + ".endif\n" \ + ".endr\n" \ + ".endr\n" \ + ".ifndef vd\n" \ + ".error \"Bad operand \\opd\"\n" \ + ".exitm\n" \ + ".endif\n" \ + ".ifndef vn\n" \ + ".error \"Bad operand \\opn\"\n" \ + ".exitm\n" \ + ".endif\n" \ + ".ifndef vm\n" \ + ".error \"Bad operand \\opm\"\n" \ + ".exitm\n" \ + ".endif\n" \ + ".ifndef h\n" \ + ".error \"Bad operand \\opm\"\n" \ + ".exitm\n" \ + ".endif\n" \ + ".ifndef l\n" \ + ".error \"Bad operand \\opm\"\n" \ + ".exitm\n" \ + ".endif\n" \ + ".int 0x6f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n" \ + ".endm\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp new file mode 100644 index 0000000000..d026dc54f3 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp @@ -0,0 +1,343 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __aarch64__ + +#include + +#include "../../asmlib.hpp" + +#ifdef NO_DOT_IN_TOOLCHAIN +#include "dot_toolchain_support.h" +#endif + +namespace arm_gemm +{ +void a64_gemm_u8_12x8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) +{ + const uint8_t *a_ptr = Apanel; + uint32_t *c_ptr = Cpanel; + // We divide K by 4 because the udot instruction processes 4 elements at a time. + const int W = K / 4; + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. 
+ const int oddk = (W & 1); + const int init_value_k = ((W + 1) / 2) - 1; + for(int yb = 0; yb < ablocks; yb++) + { + const uint8_t *a_ptr0 = a_ptr; + const uint8_t *b_ptr = Bpanel; + for(int xb = 0; xb < bblocks; xb++) + { + a_ptr = a_ptr0; + int k = init_value_k; + register uint8x16_t a0 asm("v0"); + register uint8x16_t a1 asm("v1"); + register uint8x16_t b0 asm("v2"); + register uint8x16_t b1 asm("v3"); + register uint8x16_t b2 asm("v4"); + register uint8x16_t a0a asm("v5"); + register uint8x16_t a1a asm("v6"); + __asm __volatile( +#ifdef NO_DOT_IN_TOOLCHAIN + _DECLARE_UDOT +#else + ".arch armv8.2-a+dotprod\n" +#endif + // Initialize result registers, load initial operands, prime prefetches. + "movi v8.4s, #0x0\n" + "ldr %q[a0], [%[a_ptr]]\n" + "movi v9.4s, #0x0\n" + "ldr %q[b0], [%[b_ptr]]\n" + "movi v10.4s, #0x0\n" + "ldr %q[a1], [%[a_ptr], #16]\n" + "movi v11.4s, #0x0\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + "movi v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v15.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #128]") "movi v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi v18.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #192]") "movi v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") "movi v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") "movi v21.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #384]") + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + + // Skip loop if we are doing zero iterations of it. + "cbz %w[k], 4f\n" + + // Loop proper + "1:\n" + "udot v8.4s , %[b0].16b, %[a0].4b[0]\n" + "udot v9.4s , %[b0].16b, %[a0].4b[1]\n" + + "ldr %q[b2], [%[b_ptr], #32]\n" + "udot v10.4s, %[b0].16b, %[a0].4b[2]\n" + "udot v11.4s, %[b0].16b, %[a0].4b[3]\n" + "ldr %q[a0a], [%[a_ptr], #32]\n" + "udot v12.4s, %[b0].16b, %[a1].4b[0]\n" + "udot v13.4s, %[b0].16b, %[a1].4b[1]\n" + "ldr %q[a1a], [%[a_ptr], #48]\n" + "udot v14.4s, %[b0].16b, %[a1].4b[2]\n" + "udot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "ldr %q[b0], [%[b_ptr], #48]\n" + + "udot v16.4s, %[b1].16b, %[a0].4b[0]\n" + "udot v17.4s, %[b1].16b, %[a0].4b[1]\n" ASM_PREFETCH("[%[a_ptr], #320]") + "udot v18.4s, %[b1].16b, %[a0].4b[2]\n" + "udot v19.4s, %[b1].16b, %[a0].4b[3]\n" + "udot v20.4s, %[b1].16b, %[a1].4b[0]\n" + "udot v21.4s, %[b1].16b, %[a1].4b[1]\n" + "udot v22.4s, %[b1].16b, %[a1].4b[2]\n" + "udot v23.4s, %[b1].16b, %[a1].4b[3]\n" + "ldr %q[b1], [%[b_ptr], #64]\n" + + "udot v24.4s, %[b2].16b, %[a0].4b[0]\n" + "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #448]") + "udot v26.4s, %[b2].16b, %[a0].4b[2]\n" + "udot v27.4s, %[b2].16b, %[a0].4b[3]\n" + "udot v28.4s, %[b2].16b, %[a1].4b[0]\n" + "udot v29.4s, %[b2].16b, %[a1].4b[1]\n" + "udot v30.4s, %[b2].16b, %[a1].4b[2]\n" + "udot v31.4s, %[b2].16b, %[a1].4b[3]\n" + "ldr %q[b2], [%[b_ptr], #80]\n" + + "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n" + "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n" + "ldr %q[a0], [%[a_ptr], #64]\n" + "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n" + "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n" + "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n" + "ldr %q[a1], [%[a_ptr], #80]\n" + "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n" + "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n" + "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n" + "ldr %q[b0], [%[b_ptr], 
#96]\n" + + "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n" + "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #512]") + "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n" + "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n" + "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n" + "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n" + "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n" + "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n" + "ldr %q[b1], [%[b_ptr], #112]\n" + + "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n" + "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n" + "add %[a_ptr], %[a_ptr], #64\n" + "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n" + "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n" + "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n" + "subs %w[k], %w[k], #1\n" + "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n" + "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n" + "bne 1b\n" + + // Target to use when K is 1 or 2 (i.e. zero iterations of main loop) + "4:\n" + + // Branch to alternative tail for odd K + "cbnz %w[oddk], 2f\n" + + // Detached final iteration (even K) + "udot v8.4s , %[b0].16b, %[a0].4b[0]\n" + "udot v9.4s , %[b0].16b, %[a0].4b[1]\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "udot v10.4s, %[b0].16b, %[a0].4b[2]\n" + "udot v11.4s, %[b0].16b, %[a0].4b[3]\n" + "ldr %q[a0a], [%[a_ptr], #32]\n" + "udot v12.4s, %[b0].16b, %[a1].4b[0]\n" + "udot v13.4s, %[b0].16b, %[a1].4b[1]\n" + "ldr %q[a1a], [%[a_ptr], #48]\n" + "udot v14.4s, %[b0].16b, %[a1].4b[2]\n" + "udot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "ldr %q[b0], [%[b_ptr], #48]\n" + + "udot v16.4s, %[b1].16b, %[a0].4b[0]\n" + "udot v17.4s, %[b1].16b, %[a0].4b[1]\n" + "udot v18.4s, %[b1].16b, %[a0].4b[2]\n" + "udot v19.4s, %[b1].16b, %[a0].4b[3]\n" + "udot v20.4s, %[b1].16b, %[a1].4b[0]\n" + "udot v21.4s, %[b1].16b, %[a1].4b[1]\n" + "udot v22.4s, %[b1].16b, %[a1].4b[2]\n" + "udot v23.4s, %[b1].16b, %[a1].4b[3]\n" + "ldr %q[b1], [%[b_ptr], #64]\n" + + "udot v24.4s, %[b2].16b, %[a0].4b[0]\n" + "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" + "add %[a_ptr], %[a_ptr], #64\n" + "udot v26.4s, %[b2].16b, %[a0].4b[2]\n" + "udot v27.4s, %[b2].16b, %[a0].4b[3]\n" + "udot v28.4s, %[b2].16b, %[a1].4b[0]\n" + "udot v29.4s, %[b2].16b, %[a1].4b[1]\n" + "udot v30.4s, %[b2].16b, %[a1].4b[2]\n" + "udot v31.4s, %[b2].16b, %[a1].4b[3]\n" + "ldr %q[b2], [%[b_ptr], #80]\n" + + "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n" + + "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n" + "str q8, [%[c_ptr], #0]\n" + "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n" + "str q16, [%[c_ptr], #16]\n" + "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n" + "str q24, [%[c_ptr], #32]\n" + + "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n" + "str q9, [%[c_ptr], #48]\n" + "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n" + "str q17, [%[c_ptr], #64]\n" + "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n" + "str q25, [%[c_ptr], #80]\n" + "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n" + "str q10, [%[c_ptr], #96]\n" + + "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n" + "str q18, [%[c_ptr], #112]\n" + "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n" + "str q26, [%[c_ptr], #128]\n" + "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n" + "str q11, [%[c_ptr], #144]\n" + + "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n" + "str q19, [%[c_ptr], #160]\n" + "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n" + "str q27, [%[c_ptr], #176]\n" + "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n" + "str q12, [%[c_ptr], #192]\n" + + "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n" + "str q20, [%[c_ptr], #208]\n" + "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n" + "str q28, 
[%[c_ptr], #224]\n" + "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n" + "str q13, [%[c_ptr], #240]\n" + + "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n" + "str q21, [%[c_ptr], #256]\n" + "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n" + "str q29, [%[c_ptr], #272]\n" + "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n" + "str q14, [%[c_ptr], #288]\n" + + "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n" + "str q22, [%[c_ptr], #304]\n" + "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n" + "str q30, [%[c_ptr], #320]\n" + "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n" + "str q15, [%[c_ptr], #336]\n" + + "b 3f\n" + + // Detached final iteration (odd K) + "2:\n" + "udot v8.4s , %[b0].16b, %[a0].4b[0]\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "udot v16.4s, %[b1].16b, %[a0].4b[0]\n" + "udot v9.4s , %[b0].16b, %[a0].4b[1]\n" + "str q8, [%[c_ptr], #0]\n" + "udot v17.4s, %[b1].16b, %[a0].4b[1]\n" + "str q16, [%[c_ptr], #16]\n" + "udot v24.4s, %[b2].16b, %[a0].4b[0]\n" + "add %[b_ptr], %[b_ptr], #48\n" + "add %[a_ptr], %[a_ptr], #32\n" + "str q24, [%[c_ptr], #32]\n" + "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" + "str q9, [%[c_ptr], #48]\n" + + "udot v10.4s, %[b0].16b, %[a0].4b[2]\n" + "str q17, [%[c_ptr], #64]\n" + "udot v18.4s, %[b1].16b, %[a0].4b[2]\n" + "str q25, [%[c_ptr], #80]\n" + "udot v26.4s, %[b2].16b, %[a0].4b[2]\n" + "str q10, [%[c_ptr], #96]\n" + + "udot v11.4s, %[b0].16b, %[a0].4b[3]\n" + "str q18, [%[c_ptr], #112]\n" + "udot v19.4s, %[b1].16b, %[a0].4b[3]\n" + "str q26, [%[c_ptr], #128]\n" + "udot v27.4s, %[b2].16b, %[a0].4b[3]\n" + "str q11, [%[c_ptr], #144]\n" + + "udot v12.4s, %[b0].16b, %[a1].4b[0]\n" + "str q19, [%[c_ptr], #160]\n" + "udot v20.4s, %[b1].16b, %[a1].4b[0]\n" + "str q27, [%[c_ptr], #176]\n" + "udot v28.4s, %[b2].16b, %[a1].4b[0]\n" + "str q12, [%[c_ptr], #192]\n" + + "udot v13.4s, %[b0].16b, %[a1].4b[1]\n" + "str q20, [%[c_ptr], #208]\n" + "udot v21.4s, %[b1].16b, %[a1].4b[1]\n" + "str q28, [%[c_ptr], #224]\n" + "udot v29.4s, %[b2].16b, %[a1].4b[1]\n" + "str q13, [%[c_ptr], #240]\n" + + "udot v14.4s, %[b0].16b, %[a1].4b[2]\n" + "str q21, [%[c_ptr], #256]\n" + "udot v22.4s, %[b1].16b, %[a1].4b[2]\n" + "str q29, [%[c_ptr], #272]\n" + "udot v30.4s, %[b2].16b, %[a1].4b[2]\n" + "str q14, [%[c_ptr], #288]\n" + + "udot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "str q22, [%[c_ptr], #304]\n" + "udot v23.4s, %[b1].16b, %[a1].4b[3]\n" + "str q30, [%[c_ptr], #320]\n" + "udot v31.4s, %[b2].16b, %[a1].4b[3]\n" + "str q15, [%[c_ptr], #336]\n" + + // Common tail + "3:\n" + "str q23, [%[c_ptr], #352]\n" + "str q31, [%[c_ptr], #368]\n" + "add %[c_ptr], %[c_ptr], #384\n" + +#ifdef NO_DOT_IN_TOOLCHAIN + ".purgem udot\n" +#endif + : + [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), + [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a), + [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k) + : [oddk] "r"(oddk) + : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"); + } + } +} + +} // namespace arm_gemm + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp new file mode 100644 index 0000000000..5aa5291a29 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. 
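Before the 4x4 variant below, a note on the 12x8 u8 kernel above: all of its arithmetic goes through the UDOT instruction (with _DECLARE_UDOT presumably providing an equivalent assembler macro on toolchains that cannot assemble the armv8.2 dotprod extension). A rough scalar model of one "udot vD.4s, vN.16b, vM.4b[lane]" as used there, purely for illustration (this helper is not library code):

    // Each 32-bit lane of the accumulator gains the dot product of four u8
    // values from vN with the four u8 values of vM selected by "lane".
    static inline void udot_by_element_ref(uint32_t acc[4], const uint8_t n[16],
                                           const uint8_t m[16], int lane)
    {
        for(int l = 0; l < 4; l++)
        {
            for(int k = 0; k < 4; k++)
                acc[l] += (uint32_t)n[l * 4 + k] * (uint32_t)m[lane * 4 + k];
        }
    }

With three 16-byte B registers (12 columns of 4 K values) and eight selectable A words (8 rows of 4 K values), each block of udots above updates all 96 accumulator lanes in v8-v31, a 12x8 tile of uint32, for four K values; the main loop does two such blocks per iteration.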
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +namespace arm_gemm +{ +// Kernel definition +void a64_gemm_u8_4x4(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K); + +class gemm_u8_4x4 +{ +public: + typedef uint8_t operand_type; + typedef uint32_t result_type; + + typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); + + /* Describes the data layout for A input */ + static const int A_interleave = 4; + static const int A_block = 16; + static const bool A_transpose = false; + + /* Same for B input */ + static const int B_interleave = 4; + static const int B_block = 16; + static const bool B_transpose = true; + + /* Kernel blocking parameters */ + static const int out_width = 4; + static const int out_height = 4; + static const int k_unroll = 16; + + kern_type kernel = nullptr; + + gemm_u8_4x4(const CPUInfo *ci) + { + kernel = a64_gemm_u8_4x4; + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp new file mode 100644 index 0000000000..0a881ffde3 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __aarch64__ + +#include + +#include "../../asmlib.hpp" + +namespace arm_gemm +{ +void a64_gemm_u8_4x4(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) +{ + const uint8_t *a_ptr = Apanel; + uint32_t *c_ptr = Cpanel; + K /= 16; + + for(int yb = 0; yb < ablocks; yb++) + { + const uint8_t *a_ptr0 = a_ptr; + const uint8_t *b_ptr = Bpanel; + + for(int xb = 0; xb < bblocks; xb++) + { + a_ptr = a_ptr0; + + int k = K - 1; + + register uint8x16_t b0 asm("v4"); + register uint8x16_t b1 asm("v5"); + register uint8x16_t b2 asm("v6"); + register uint8x16_t b3 asm("v7"); + + __asm __volatile( + "movi v16.4s, #0x0\n" + "ldr q0, [%[a_ptr]]\n" + "movi v17.4s, #0x0\n" + "ldr %q[b0], [%[b_ptr]]\n" + "movi v18.4s, #0x0\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + "movi v19.4s, #0x0\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "movi v20.4s, #0x0\n" + "ldr %q[b3], [%[b_ptr], #48]\n" + "movi v21.4s, #0x0\n" + "ldr q1, [%[a_ptr], #16]\n" + "movi v22.4s, #0x0\n" + "ldr q2, [%[a_ptr], #32]\n" + "movi v23.4s, #0x0\n" + "ldr q3, [%[a_ptr], #48]\n" + "movi v24.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v25.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v26.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v27.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #128]") "movi v28.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]") "movi v30.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #256]") "movi v31.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") + + "umull v12.8h, v0.8b, %[b0].8b\n" + "add %[a_ptr], %[a_ptr], #64\n" + "umull v13.8h, v0.8b, %[b1].8b\n" + "umull v14.8h, v0.8b, %[b2].8b\n" + "add %[b_ptr], %[b_ptr], #64\n" + "umull v15.8h, v0.8b, %[b3].8b\n" + + // Skip loop if we are doing zero iterations of it. 
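+            // (The loop at "1:" below consumes one 16-deep K block per pass: each
+            //  umull/umull2 pair multiplies the 16 u8 values of an A row with the 16
+            //  u8 values of a B column, and uadalp folds the 16-bit products into the
+            //  32-bit accumulators v16-v31, one accumulator per output element.)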
+ "cbz %w[k], 2f\n" + + "1:\n" + "uadalp v16.4s, v12.8h\n" + "umull2 v12.8h, v0.16b, %[b0].16b\n" + "uadalp v17.4s, v13.8h\n" + "umull2 v13.8h, v0.16b, %[b1].16b\n" + "uadalp v18.4s, v14.8h\n" + "umull2 v14.8h, v0.16b, %[b2].16b\n" + "uadalp v19.4s, v15.8h\n" + "umull2 v15.8h, v0.16b, %[b3].16b\n" + "ldr q0, [%[a_ptr]]\n" + + "uadalp v16.4s, v12.8h\n" + "umull v12.8h, v1.8b, %[b0].8b\n" + "uadalp v17.4s, v13.8h\n" + "umull v13.8h, v1.8b, %[b1].8b\n" + "subs %w[k], %w[k], #1\n" + "uadalp v18.4s, v14.8h\n" + "umull v14.8h, v1.8b, %[b2].8b\n" + "uadalp v19.4s, v15.8h\n" + "umull v15.8h, v1.8b, %[b3].8b\n" + + "uadalp v20.4s, v12.8h\n" + "umull2 v12.8h, v1.16b, %[b0].16b\n" + "uadalp v21.4s, v13.8h\n" + "umull2 v13.8h, v1.16b, %[b1].16b\n" ASM_PREFETCH("[%[a_ptr], #256]") + "uadalp v22.4s, v14.8h\n" + "umull2 v14.8h, v1.16b, %[b2].16b\n" + "uadalp v23.4s, v15.8h\n" + "umull2 v15.8h, v1.16b, %[b3].16b\n" + "ldr q1, [%[a_ptr], #16]\n" + + "uadalp v20.4s, v12.8h\n" + "umull v12.8h, v2.8b, %[b0].8b\n" + "uadalp v21.4s, v13.8h\n" + "umull v13.8h, v2.8b, %[b1].8b\n" ASM_PREFETCH("[%[b_ptr], #256]") + "uadalp v22.4s, v14.8h\n" + "umull v14.8h, v2.8b, %[b2].8b\n" + "uadalp v23.4s, v15.8h\n" + "umull v15.8h, v2.8b, %[b3].8b\n" + + "uadalp v24.4s, v12.8h\n" + "umull2 v12.8h, v2.16b, %[b0].16b\n" + "uadalp v25.4s, v13.8h\n" + "umull2 v13.8h, v2.16b, %[b1].16b\n" + "uadalp v26.4s, v14.8h\n" + "umull2 v14.8h, v2.16b, %[b2].16b\n" + "uadalp v27.4s, v15.8h\n" + "umull2 v15.8h, v2.16b, %[b3].16b\n" + "ldr q2, [%[a_ptr], #32]\n" + + "uadalp v24.4s, v12.8h\n" + "umull v12.8h, v3.8b, %[b0].8b\n" + "uadalp v25.4s, v13.8h\n" + "umull v13.8h, v3.8b, %[b1].8b\n" + "uadalp v26.4s, v14.8h\n" + "umull v14.8h, v3.8b, %[b2].8b\n" + "uadalp v27.4s, v15.8h\n" + "umull v15.8h, v3.8b, %[b3].8b\n" + + "uadalp v28.4s, v12.8h\n" + "umull2 v12.8h, v3.16b, %[b0].16b\n" + "ldr %q[b0], [%[b_ptr]]\n" + "uadalp v29.4s, v13.8h\n" + "umull2 v13.8h, v3.16b, %[b1].16b\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + "uadalp v30.4s, v14.8h\n" + "umull2 v14.8h, v3.16b, %[b2].16b\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "uadalp v31.4s, v15.8h\n" + "umull2 v15.8h, v3.16b, %[b3].16b\n" + "ldr %q[b3], [%[b_ptr], #48]\n" + + "uadalp v28.4s, v12.8h\n" + "umull v12.8h, v0.8b, %[b0].8b\n" + "add %[b_ptr], %[b_ptr], #64\n" + "uadalp v29.4s, v13.8h\n" + "umull v13.8h, v0.8b, %[b1].8b\n" + "ldr q3, [%[a_ptr], #48]\n" + "uadalp v30.4s, v14.8h\n" + "umull v14.8h, v0.8b, %[b2].8b\n" + "add %[a_ptr], %[a_ptr], #64\n" + "uadalp v31.4s, v15.8h\n" + "umull v15.8h, v0.8b, %[b3].8b\n" + "bne 1b\n" + + // Branch target + "2:\n" + "uadalp v16.4s, v12.8h\n" + "umull2 v12.8h, v0.16b, %[b0].16b\n" + "uadalp v17.4s, v13.8h\n" + "umull2 v13.8h, v0.16b, %[b1].16b\n" + "uadalp v18.4s, v14.8h\n" + "umull2 v14.8h, v0.16b, %[b2].16b\n" + "uadalp v19.4s, v15.8h\n" + "umull2 v15.8h, v0.16b, %[b3].16b\n" + + "uadalp v16.4s, v12.8h\n" + "umull v12.8h, v1.8b, %[b0].8b\n" + "uadalp v17.4s, v13.8h\n" + "umull v13.8h, v1.8b, %[b1].8b\n" + "uadalp v18.4s, v14.8h\n" + "umull v14.8h, v1.8b, %[b2].8b\n" + "uadalp v19.4s, v15.8h\n" + "umull v15.8h, v1.8b, %[b3].8b\n" + + "uadalp v20.4s, v12.8h\n" + "umull2 v12.8h, v1.16b, %[b0].16b\n" + "uadalp v21.4s, v13.8h\n" + "umull2 v13.8h, v1.16b, %[b1].16b\n" + "uadalp v22.4s, v14.8h\n" + "umull2 v14.8h, v1.16b, %[b2].16b\n" + "uadalp v23.4s, v15.8h\n" + "umull2 v15.8h, v1.16b, %[b3].16b\n" + + "uadalp v20.4s, v12.8h\n" + "umull v12.8h, v2.8b, %[b0].8b\n" + "uadalp v21.4s, v13.8h\n" + "umull v13.8h, v2.8b, %[b1].8b\n" + "uadalp v22.4s, v14.8h\n" + "umull 
v14.8h, v2.8b, %[b2].8b\n" + "uadalp v23.4s, v15.8h\n" + "umull v15.8h, v2.8b, %[b3].8b\n" + + "uadalp v24.4s, v12.8h\n" + "umull2 v12.8h, v2.16b, %[b0].16b\n" + "uadalp v25.4s, v13.8h\n" + "umull2 v13.8h, v2.16b, %[b1].16b\n" + "uadalp v26.4s, v14.8h\n" + "umull2 v14.8h, v2.16b, %[b2].16b\n" + "uadalp v27.4s, v15.8h\n" + "umull2 v15.8h, v2.16b, %[b3].16b\n" + + "uadalp v24.4s, v12.8h\n" + "umull v12.8h, v3.8b, %[b0].8b\n" + "uadalp v25.4s, v13.8h\n" + "umull v13.8h, v3.8b, %[b1].8b\n" + "uadalp v26.4s, v14.8h\n" + "umull v14.8h, v3.8b, %[b2].8b\n" + "uadalp v27.4s, v15.8h\n" + "umull v15.8h, v3.8b, %[b3].8b\n" + + "uadalp v28.4s, v12.8h\n" + "umull2 v12.8h, v3.16b, %[b0].16b\n" + "uadalp v29.4s, v13.8h\n" + "umull2 v13.8h, v3.16b, %[b1].16b\n" + "uadalp v30.4s, v14.8h\n" + "umull2 v14.8h, v3.16b, %[b2].16b\n" + "uadalp v31.4s, v15.8h\n" + "umull2 v15.8h, v3.16b, %[b3].16b\n" + + "uadalp v28.4s, v12.8h\n" + "uadalp v29.4s, v13.8h\n" + "uadalp v30.4s, v14.8h\n" + "uadalp v31.4s, v15.8h\n" + + "addp v16.4s, v16.4s, v17.4s\n" + "addp v17.4s, v18.4s, v19.4s\n" + "addp v18.4s, v20.4s, v21.4s\n" + "addp v19.4s, v22.4s, v23.4s\n" + "addp v20.4s, v24.4s, v25.4s\n" + "addp v21.4s, v26.4s, v27.4s\n" + "addp v22.4s, v28.4s, v29.4s\n" + "addp v23.4s, v30.4s, v31.4s\n" + + "addp v16.4s, v16.4s, v17.4s\n" + "addp v17.4s, v18.4s, v19.4s\n" + "addp v18.4s, v20.4s, v21.4s\n" + "addp v19.4s, v22.4s, v23.4s\n" + + "str q16, [%[c_ptr]]\n" + "str q17, [%[c_ptr], #16]\n" + "str q18, [%[c_ptr], #32]\n" + "str q19, [%[c_ptr], #48]\n" + "add %[c_ptr], %[c_ptr], #64\n" + + : + [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), + [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [b3] "+w"(b3), + [k] "+r"(k) + : + : "x20", "x21", "v0", "v1", "v2", "v3", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"); + } + } +} + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp new file mode 100644 index 0000000000..77ec59aa35 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
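To make the arithmetic of the a64_gemm_u8_4x4 kernel above easier to follow, here is a rough scalar model of one 4x4 output block, assuming the blocking declared by the gemm_u8_4x4 strategy class (A_block = B_block = k_unroll = 16, 4x4 uint32 output). The helper name and layout assumptions are illustrative only, not library code:

    // Packed panels hold, per 16-deep K block, 4 rows (A) or 4 columns (B) of 16 bytes.
    static void gemm_u8_4x4_block_ref(const uint8_t *a, const uint8_t *b, uint32_t *c, int K)
    {
        // K is the depth as passed to the kernel, assumed padded to a multiple of 16.
        for(int i = 0; i < 4; i++)
        {
            for(int j = 0; j < 4; j++)
            {
                uint32_t acc = 0;
                for(int kb = 0; kb < K / 16; kb++)
                {
                    for(int k = 0; k < 16; k++)
                        acc += (uint32_t)a[kb * 64 + i * 16 + k] * (uint32_t)b[kb * 64 + j * 16 + k];
                }
                c[i * 4 + j] = acc;
            }
        }
    }

The vector code reaches the same totals by widening the u8 products to 16 bits (umull/umull2), folding pairs into 32-bit lanes (uadalp), and collapsing each accumulator's four lanes with the addp tree just before the stores.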
+ */ +#pragma once + +#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) + +#include "arm_gemm.hpp" + +namespace arm_gemm +{ +// Actual kernel implementations +void a64_hgemm_asimd_24x8(const __fp16 *, const __fp16 *, __fp16 *, int, int, int); +void a64_hgemm_asimd_24x8_a55r1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int); + +// 24x8 HGEMM "strategy" class. Describes the kernel properties. +// +// The generic "gemm_opt" function will instantiate one of these (allowing +// the constructor to pick a kernel implementation). +class hgemm_24x8 +{ +public: + typedef __fp16 operand_type; + typedef __fp16 result_type; + + typedef void (*kern_type)(const __fp16 *, const __fp16 *, __fp16 *, int, int, int); + + static const int A_block = 1; + static const int A_interleave = 8; + static const bool A_transpose = false; + + static const int B_block = 1; + static const int B_interleave = 24; + static const bool B_transpose = true; + + static const int out_width = 24; + static const int out_height = 8; + static const int k_unroll = 1; + + // Default to the generic kernel + kern_type kernel = a64_hgemm_asimd_24x8; + + hgemm_24x8(const CPUInfo *ci) + { + if(ci->get_cpu_model() == CPUModel::A55r1) + { + kernel = a64_hgemm_asimd_24x8_a55r1; + } + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp new file mode 100644 index 0000000000..d59618dd54 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) + +#include + +#include "../../asmlib.hpp" + +// Kernel implementation. +// +// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. +// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order. +// Assume that "Cpanel" points to a chunk of C output blocks (each size +// 12x8), the chunks being arranged in a row major fashion. +// +// Note that the intent of this is that either ablocks or bblocks will be 1 +// - this construction allows the output loop to proceed in either order. 
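The block sizes quoted in the comment above (B blocks of 12xK, C blocks of 12x8) do not match the 24x8 blocking that the hgemm_24x8 strategy class declares, so for reference, here is a rough scalar model of one output-block pass using the strategy's out_width = 24 and out_height = 8. It is an illustrative sketch, not library code; fp16 accumulation matches the kernel's fmla on .8h lanes:

    // A panel: 8 row values per K step; B panel: 24 column values per K step;
    // C block: 8 rows of 24 fp16 values, matching the q8/q16/q24, q9/q17/q25, ... store order.
    static void hgemm_24x8_block_ref(const __fp16 *a, const __fp16 *b, __fp16 *c, int K)
    {
        for(int row = 0; row < 8; row++)
        {
            for(int col = 0; col < 24; col++)
            {
                __fp16 acc = (__fp16)0.0f;
                for(int k = 0; k < K; k++)
                    acc += a[k * 8 + row] * b[k * 24 + col];
                c[row * 24 + col] = acc;
            }
        }
    }

The kernel below evaluates this two K steps per loop iteration, which is why it sets oddk = K & 1, rounds the iteration count up, and peels the last step off into the detached even/odd tails.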
+ +namespace arm_gemm +{ +void a64_hgemm_asimd_24x8_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) +{ + const __fp16 *a_ptr = Apanel; + __fp16 *c_ptr = Cpanel; + + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. + int oddk = (K & 1); + int k_iters = ((K + 1) / 2) - 1; + + for(int yb = 0; yb < ablocks; yb++) + { + const __fp16 *a_ptr0 = a_ptr; + const __fp16 *b_ptr = Bpanel; + + for(int xb = 0; xb < bblocks; xb++) + { + int k = k_iters; + a_ptr = a_ptr0; + + // As A55 requires 64-bit loads anyway, just use 64 bits of the + // "A" operands to save on "ins" instructions. Since A55 is + // in-order, two sets of "A" operands and one set of "B" is + // sufficient. + register float16x8_t a0 asm("v0"); + register float16x8_t a1 asm("v1"); + register float16x8_t a0a asm("v2"); + register float16x8_t a1a asm("v3"); + register float16x8_t b0 asm("v4"); + register float16x8_t b1 asm("v5"); + register float16x8_t b2 asm("v6"); + + __asm __volatile( + // Enable FP16 extensions + ".arch armv8.2-a+fp16\n" + // Initialize result registers, load initial operands, prime prefetches. + "movi v8.8h, #0x0\n" + "ldr %d[a0], [%[a_ptr]]\n" + "movi v9.8h, #0x0\n" + "ldr %q[b0], [%[b_ptr]]\n" + "movi v10.8h, #0x0\n" + "ldr %d[a1], [%[a_ptr], #8]\n" + "movi v11.8h, #0x0\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + "movi v12.8h, #0x0\n" + "movi v13.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") + "movi v14.8h, #0x0\n" + "movi v15.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") + "movi v16.8h, #0x0\n" + "movi v17.8h, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") + "movi v18.8h, #0x0\n" + "movi v19.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") + "movi v20.8h, #0x0\n" + "movi v21.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") + "movi v22.8h, #0x0\n" + "movi v23.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") + "movi v24.8h, #0x0\n" + "movi v25.8h, #0x0\n" + "movi v26.8h, #0x0\n" + "movi v27.8h, #0x0\n" + "movi v28.8h, #0x0\n" + "movi v29.8h, #0x0\n" + "movi v30.8h, #0x0\n" + "movi v31.8h, #0x0\n" + + // The loop is offset by these two instructions which must + // always be executed. + "fmla v8.8h , %[b0].8h, %[a0].h[0]\n" + "ldr %d[b2], [%[b_ptr], #32]\n" + + // Skip loop if we are doing zero iterations of it. 
+ "cbz %w[k], 4f\n" + + "1:\n" + "fmla v9.8h , %[b0].8h, %[a0].h[1]\n" + "ldr x20, [%[b_ptr], #40]\n" + "fmla v10.8h, %[b0].8h, %[a0].h[2]\n" + "subs %w[k], %w[k], #1\n" + "fmla v11.8h, %[b0].8h, %[a0].h[3]\n" + "ldr %d[a0a], [%[a_ptr], #16]\n" + + "fmla v12.8h, %[b0].8h, %[a1].h[0]\n" + "ins %[b2].d[1], x20\n" + "fmla v13.8h, %[b0].8h, %[a1].h[1]\n" + "fmla v14.8h, %[b0].8h, %[a1].h[2]\n" + "fmla v15.8h, %[b0].8h, %[a1].h[3]\n" + "ldr %d[a1a], [%[a_ptr], #24]\n" + + "fmla v16.8h, %[b1].8h, %[a0].h[0]\n" + "fmla v17.8h, %[b1].8h, %[a0].h[1]\n" + "fmla v18.8h, %[b1].8h, %[a0].h[2]\n" + "fmla v19.8h, %[b1].8h, %[a0].h[3]\n" + "ldr %d[b0], [%[b_ptr], #48]\n" + + "fmla v20.8h, %[b1].8h, %[a1].h[0]\n" + "fmla v21.8h, %[b1].8h, %[a1].h[1]\n" + "ldr x20, [%[b_ptr], #56]\n" + "fmla v22.8h, %[b1].8h, %[a1].h[2]\n" + "fmla v23.8h, %[b1].8h, %[a1].h[3]\n" + "ldr %d[b1], [%[b_ptr], #64]\n" + + "fmla v24.8h, %[b2].8h, %[a0].h[0]\n" + "ins %[b0].d[1], x20\n" + "fmla v25.8h, %[b2].8h, %[a0].h[1]\n" + "ldr x20, [%[b_ptr], #72]\n" + "fmla v26.8h, %[b2].8h, %[a0].h[2]\n" + "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" ASM_PREFETCH("[%[a_ptr], #128]") + + "fmla v28.8h, %[b2].8h, %[a1].h[0]\n" + "fmla v29.8h, %[b2].8h, %[a1].h[1]\n" ASM_PREFETCH("[%[b_ptr], #384]") + "fmla v30.8h, %[b2].8h, %[a1].h[2]\n" + "fmla v31.8h, %[b2].8h, %[a1].h[3]\n" + "ldr %d[b2], [%[b_ptr], #80]\n" + + // Unroll 1 + "fmla v8.8h , %[b0].8h, %[a0a].h[0]\n" + "ins %[b1].d[1], x20\n" + "fmla v9.8h , %[b0].8h, %[a0a].h[1]\n" + "ldr x20, [%[b_ptr], #88]\n" + "fmla v10.8h, %[b0].8h, %[a0a].h[2]\n" + "fmla v11.8h, %[b0].8h, %[a0a].h[3]\n" + "ldr %d[a0], [%[a_ptr], #32]\n" + + "fmla v12.8h, %[b0].8h, %[a1a].h[0]\n" + "ins %[b2].d[1], x20\n" + "fmla v13.8h, %[b0].8h, %[a1a].h[1]\n" + "fmla v14.8h, %[b0].8h, %[a1a].h[2]\n" + "fmla v15.8h, %[b0].8h, %[a1a].h[3]\n" + "ldr %d[a1], [%[a_ptr], #40]\n" + + "fmla v16.8h, %[b1].8h, %[a0a].h[0]\n" + "add %[a_ptr], %[a_ptr], #32\n" + "fmla v17.8h, %[b1].8h, %[a0a].h[1]\n" + "fmla v18.8h, %[b1].8h, %[a0a].h[2]\n" + "fmla v19.8h, %[b1].8h, %[a0a].h[3]\n" + "ldr %d[b0], [%[b_ptr], #96]\n" + + "fmla v20.8h, %[b1].8h, %[a1a].h[0]\n" + "fmla v21.8h, %[b1].8h, %[a1a].h[1]\n" + "ldr x20, [%[b_ptr], #104]\n" + "fmla v22.8h, %[b1].8h, %[a1a].h[2]\n" + "fmla v23.8h, %[b1].8h, %[a1a].h[3]\n" + "ldr %d[b1], [%[b_ptr], #112]\n" + + "fmla v24.8h, %[b2].8h, %[a0a].h[0]\n" + "ins %[b0].d[1], x20\n" + "fmla v25.8h, %[b2].8h, %[a0a].h[1]\n" + "ldr x20, [%[b_ptr], #120]\n" + "fmla v26.8h, %[b2].8h, %[a0a].h[2]\n" + "fmla v27.8h, %[b2].8h, %[a0a].h[3]\n" + + "fmla v28.8h, %[b2].8h, %[a1a].h[0]\n" ASM_PREFETCH("[%[b_ptr], #448]") + "fmla v29.8h, %[b2].8h, %[a1a].h[1]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "fmla v30.8h, %[b2].8h, %[a1a].h[2]\n" + "ins %[b1].d[1], x20\n" + "fmla v31.8h, %[b2].8h, %[a1a].h[3]\n" + "ldr %d[b2], [%[b_ptr], #32]\n" + + "fmla v8.8h , %[b0].8h, %[a0].h[0]\n" + "bne 1b\n" + + "4:\n" + + // Start final iteration - branch off to "odd" code before we load a0a + "fmla v9.8h , %[b0].8h, %[a0].h[1]\n" + "ldr x20, [%[b_ptr], #40]\n" + "fmla v10.8h, %[b0].8h, %[a0].h[2]\n" + "cbnz %w[oddk], 2f\n" + + // Even K continuation + "fmla v11.8h, %[b0].8h, %[a0].h[3]\n" + "ldr %d[a0a], [%[a_ptr], #16]\n" + + "fmla v12.8h, %[b0].8h, %[a1].h[0]\n" + "ins %[b2].d[1], x20\n" + "fmla v13.8h, %[b0].8h, %[a1].h[1]\n" ASM_PREFETCHW("[%[c_ptr]]") + "fmla v14.8h, %[b0].8h, %[a1].h[2]\n" + "fmla v15.8h, %[b0].8h, %[a1].h[3]\n" + "ldr %d[a1a], [%[a_ptr], #24]\n" + + "fmla v16.8h, %[b1].8h, %[a0].h[0]\n" + "fmla v17.8h, %[b1].8h, 
%[a0].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]") + "fmla v18.8h, %[b1].8h, %[a0].h[2]\n" + "fmla v19.8h, %[b1].8h, %[a0].h[3]\n" + "ldr %d[b0], [%[b_ptr], #48]\n" + + "fmla v20.8h, %[b1].8h, %[a1].h[0]\n" + "fmla v21.8h, %[b1].8h, %[a1].h[1]\n" + "ldr x20, [%[b_ptr], #56]\n" + "fmla v22.8h, %[b1].8h, %[a1].h[2]\n" + "fmla v23.8h, %[b1].8h, %[a1].h[3]\n" + "ldr %d[b1], [%[b_ptr], #64]\n" + + "fmla v24.8h, %[b2].8h, %[a0].h[0]\n" + "ins %[b0].d[1], x20\n" + "fmla v25.8h, %[b2].8h, %[a0].h[1]\n" + "ldr x20, [%[b_ptr], #72]\n" + "fmla v26.8h, %[b2].8h, %[a0].h[2]\n" + "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]") + + "fmla v28.8h, %[b2].8h, %[a1].h[0]\n" + "fmla v29.8h, %[b2].8h, %[a1].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]") + "fmla v30.8h, %[b2].8h, %[a1].h[2]\n" + "fmla v31.8h, %[b2].8h, %[a1].h[3]\n" + "ldr %d[b2], [%[b_ptr], #80]\n" + + "fmla v8.8h , %[b0].8h, %[a0a].h[0]\n" + "ins %[b1].d[1], x20\n" + "fmla v9.8h , %[b0].8h, %[a0a].h[1]\n" + "ldr x20, [%[b_ptr], #88]\n" + "fmla v10.8h, %[b0].8h, %[a0a].h[2]\n" + "fmla v11.8h, %[b0].8h, %[a0a].h[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]") + + "fmla v12.8h, %[b0].8h, %[a1a].h[0]\n" + "ins %[b2].d[1], x20\n" + "fmla v13.8h, %[b0].8h, %[a1a].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]") + "fmla v14.8h, %[b0].8h, %[a1a].h[2]\n" + "fmla v15.8h, %[b0].8h, %[a1a].h[3]\n" + "ldr %d[a1], [%[a_ptr], #40]\n" + + "fmla v16.8h, %[b1].8h, %[a0a].h[0]\n" + "add %[a_ptr], %[a_ptr], #32\n" + "fmla v17.8h, %[b1].8h, %[a0a].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]") + "fmla v18.8h, %[b1].8h, %[a0a].h[2]\n" + "fmla v19.8h, %[b1].8h, %[a0a].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]") + + "fmla v20.8h, %[b1].8h, %[a1a].h[0]\n" + "fmla v21.8h, %[b1].8h, %[a1a].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]") + "fmla v22.8h, %[b1].8h, %[a1a].h[2]\n" + "fmla v23.8h, %[b1].8h, %[a1a].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]") + + "fmla v24.8h, %[b2].8h, %[a0a].h[0]\n" + "fmla v25.8h, %[b2].8h, %[a0a].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #640]") + "fmla v26.8h, %[b2].8h, %[a0a].h[2]\n" + "fmla v27.8h, %[b2].8h, %[a0a].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]") + + "fmla v28.8h, %[b2].8h, %[a1a].h[0]\n" + "fmla v29.8h, %[b2].8h, %[a1a].h[1]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "fmla v30.8h, %[b2].8h, %[a1a].h[2]\n" + "fmla v31.8h, %[b2].8h, %[a1a].h[3]\n" + "b 3f\n" + + "2:\n" + + // Odd tail + "fmla v11.8h, %[b0].8h, %[a0].h[3]\n" ASM_PREFETCHW("[%[c_ptr]]") + + "fmla v12.8h, %[b0].8h, %[a1].h[0]\n" + "ins %[b2].d[1], x20\n" + "fmla v13.8h, %[b0].8h, %[a1].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]") + "fmla v14.8h, %[b0].8h, %[a1].h[2]\n" + "add %[a_ptr], %[a_ptr], #16\n" + "fmla v15.8h, %[b0].8h, %[a1].h[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]") + + "fmla v16.8h, %[b1].8h, %[a0].h[0]\n" + "add %[b_ptr], %[b_ptr], #48\n" + "fmla v17.8h, %[b1].8h, %[a0].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]") + "fmla v18.8h, %[b1].8h, %[a0].h[2]\n" + "fmla v19.8h, %[b1].8h, %[a0].h[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]") + + "fmla v20.8h, %[b1].8h, %[a1].h[0]\n" + "fmla v21.8h, %[b1].8h, %[a1].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]") + "fmla v22.8h, %[b1].8h, %[a1].h[2]\n" + "fmla v23.8h, %[b1].8h, %[a1].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]") + + "fmla v24.8h, %[b2].8h, %[a0].h[0]\n" + "fmla v25.8h, %[b2].8h, %[a0].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]") + "fmla v26.8h, %[b2].8h, %[a0].h[2]\n" + "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]") + + "fmla v28.8h, %[b2].8h, %[a1].h[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]") "fmla v29.8h, 
%[b2].8h, %[a1].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]") "fmla v30.8h, %[b2].8h, %[a1].h[2]\n" + ASM_PREFETCHWL2("[%[c_ptr], #640]") "fmla v31.8h, %[b2].8h, %[a1].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]") + + // Common tail + // A55 won't dual issue these stores with anything else, so + // simplest to do them all in this common code. + "3:\n" + "str q8, [%[c_ptr]]\n" + "str q16, [%[c_ptr], #16]\n" + "str q24, [%[c_ptr], #32]\n" + "str q9, [%[c_ptr], #48]\n" + "str q17, [%[c_ptr], #64]\n" + "str q25, [%[c_ptr], #80]\n" + "str q10, [%[c_ptr], #96]\n" + "str q18, [%[c_ptr], #112]\n" + "str q26, [%[c_ptr], #128]\n" + "str q11, [%[c_ptr], #144]\n" + "str q19, [%[c_ptr], #160]\n" + "str q27, [%[c_ptr], #176]\n" + "str q12, [%[c_ptr], #192]\n" + "str q20, [%[c_ptr], #208]\n" + "str q28, [%[c_ptr], #224]\n" + "str q13, [%[c_ptr], #240]\n" + "str q21, [%[c_ptr], #256]\n" + "str q29, [%[c_ptr], #272]\n" + "str q14, [%[c_ptr], #288]\n" + "str q22, [%[c_ptr], #304]\n" + "str q30, [%[c_ptr], #320]\n" + "str q15, [%[c_ptr], #336]\n" + "str q23, [%[c_ptr], #352]\n" + "str q31, [%[c_ptr], #368]\n" + "5:\n" + "add %[c_ptr], %[c_ptr], #384\n" + : + [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), + [a0] "=w"(a0), [a0a] "=w"(a0a), [a1] "=w"(a1), [a1a] "=w"(a1a), + [b0] "=w"(b0), [b1] "=w"(b1), [b2] "=w"(b2), [k] "+r"(k) + : [oddk] "r"(oddk) + : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"); + } + } +} + +} // namespace arm_gemm + +#endif // __aarch64__ && __ARM_FEATURE_FP16_SCALAR_ARITHMETIC diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp new file mode 100644 index 0000000000..468d603484 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp @@ -0,0 +1,337 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) + +#include + +#include "../../asmlib.hpp" + +// Kernel implementation. +// +// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. +// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order. 
+// Assume that "Cpanel" points to a chunk of C output blocks (each size +// 12x8), the chunks being arranged in a row major fashion. +// +// Note that the intent of this is that either ablocks or bblocks will be 1 +// - this construction allows the output loop to proceed in either order. + +namespace arm_gemm +{ +void a64_hgemm_asimd_24x8(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) +{ + const __fp16 *a_ptr = Apanel; + __fp16 *c_ptr = Cpanel; + + for(int yb = 0; yb < ablocks; yb++) + { + const __fp16 *a_ptr0 = a_ptr; + const __fp16 *b_ptr = Bpanel; + + for(int xb = 0; xb < bblocks; xb++) + { + a_ptr = a_ptr0; + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. + int oddk = (K & 1); + int k = ((K + 1) / 2) - 1; + + register float16x8_t a0 asm("v0"); + register float16x8_t a0a asm("v1"); + register float16x8_t b0 asm("v2"); + register float16x8_t b1 asm("v3"); + register float16x8_t b2 asm("v4"); + register float16x8_t b0a asm("v5"); + register float16x8_t b1a asm("v6"); + register float16x8_t b2a asm("v7"); + + __asm __volatile( + ".arch armv8.2-a+fp16\n" + // Initialize result registers, load initial operands, prime prefetches. + "movi v8.8h, #0x0\n" + "ldr %q[a0], [%[a_ptr]]\n" + "movi v9.8h, #0x0\n" + "ldr %q[b0], [%[b_ptr]]\n" + "movi v10.8h, #0x0\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + "movi v11.8h, #0x0\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "movi v12.8h, #0x0\n" + "ldr %q[b0a], [%[b_ptr], #48]\n" + "movi v13.8h, #0x0\n" + "ldr %q[b1a], [%[b_ptr], #64]\n" + "movi v14.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v15.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v16.8h, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v17.8h, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #192]") "movi v18.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi v19.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") + "movi v20.8h, #0x0\n" + "movi v21.8h, #0x0\n" + "movi v22.8h, #0x0\n" + "movi v23.8h, #0x0\n" + "movi v24.8h, #0x0\n" + "movi v25.8h, #0x0\n" + "movi v26.8h, #0x0\n" + "movi v27.8h, #0x0\n" + "movi v28.8h, #0x0\n" + "movi v29.8h, #0x0\n" + "movi v30.8h, #0x0\n" + "movi v31.8h, #0x0\n" + + // Skip loop if we are doing zero iterations of it. 
+ "cbz %w[k], 4f\n" + + "1:\n" + "fmla v8.8h , %[b0].8h, %[a0].h[0]\n" + "fmla v9.8h , %[b0].8h, %[a0].h[1]\n" + "ldr %q[a0a], [%[a_ptr], #16]\n" + "fmla v10.8h, %[b0].8h, %[a0].h[2]\n" + "fmla v11.8h, %[b0].8h, %[a0].h[3]\n" + "ldr %q[b2a], [%[b_ptr], #80]\n" + "fmla v12.8h, %[b0].8h, %[a0].h[4]\n" + "fmla v13.8h, %[b0].8h, %[a0].h[5]\n" + "fmla v14.8h, %[b0].8h, %[a0].h[6]\n" + "fmla v15.8h, %[b0].8h, %[a0].h[7]\n" + "ldr %q[b0], [%[b_ptr], #96]\n" + + "fmla v16.8h, %[b1].8h, %[a0].h[0]\n" + "fmla v17.8h, %[b1].8h, %[a0].h[1]\n" ASM_PREFETCH("[%[a_ptr], #128]") + "fmla v18.8h, %[b1].8h, %[a0].h[2]\n" + "fmla v19.8h, %[b1].8h, %[a0].h[3]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "fmla v20.8h, %[b1].8h, %[a0].h[4]\n" + "fmla v21.8h, %[b1].8h, %[a0].h[5]\n" + "fmla v22.8h, %[b1].8h, %[a0].h[6]\n" + "fmla v23.8h, %[b1].8h, %[a0].h[7]\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + + "fmla v24.8h, %[b2].8h, %[a0].h[0]\n" + "fmla v25.8h, %[b2].8h, %[a0].h[1]\n" ASM_PREFETCH("[%[b_ptr], #288]") + "fmla v26.8h, %[b2].8h, %[a0].h[2]\n" + "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" + "fmla v28.8h, %[b2].8h, %[a0].h[4]\n" + "fmla v29.8h, %[b2].8h, %[a0].h[5]\n" + "fmla v30.8h, %[b2].8h, %[a0].h[6]\n" + "fmla v31.8h, %[b2].8h, %[a0].h[7]\n" + "ldr %q[a0], [%[a_ptr], #32]\n" + + "fmla v8.8h , %[b0a].8h, %[a0a].h[0]\n" + "fmla v9.8h , %[b0a].8h, %[a0a].h[1]\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "fmla v10.8h, %[b0a].8h, %[a0a].h[2]\n" + "fmla v11.8h, %[b0a].8h, %[a0a].h[3]\n" + "fmla v12.8h, %[b0a].8h, %[a0a].h[4]\n" + "fmla v13.8h, %[b0a].8h, %[a0a].h[5]\n" + "fmla v14.8h, %[b0a].8h, %[a0a].h[6]\n" + "fmla v15.8h, %[b0a].8h, %[a0a].h[7]\n" + "ldr %q[b0a], [%[b_ptr], #48]\n" + + "fmla v16.8h, %[b1a].8h, %[a0a].h[0]\n" + "fmla v17.8h, %[b1a].8h, %[a0a].h[1]\n" ASM_PREFETCH("[%[b_ptr], #352]") + "fmla v18.8h, %[b1a].8h, %[a0a].h[2]\n" + "fmla v19.8h, %[b1a].8h, %[a0a].h[3]\n" + "fmla v20.8h, %[b1a].8h, %[a0a].h[4]\n" + "fmla v21.8h, %[b1a].8h, %[a0a].h[5]\n" + "fmla v22.8h, %[b1a].8h, %[a0a].h[6]\n" + "fmla v23.8h, %[b1a].8h, %[a0a].h[7]\n" + "ldr %q[b1a], [%[b_ptr], #64]\n" + + "fmla v24.8h, %[b2a].8h, %[a0a].h[0]\n" + "fmla v25.8h, %[b2a].8h, %[a0a].h[1]\n" + "add %[a_ptr], %[a_ptr], #32\n" + "fmla v26.8h, %[b2a].8h, %[a0a].h[2]\n" + "fmla v27.8h, %[b2a].8h, %[a0a].h[3]\n" + "fmla v28.8h, %[b2a].8h, %[a0a].h[4]\n" + "fmla v29.8h, %[b2a].8h, %[a0a].h[5]\n" + "subs %w[k], %w[k], #1\n" + "fmla v30.8h, %[b2a].8h, %[a0a].h[6]\n" + "fmla v31.8h, %[b2a].8h, %[a0a].h[7]\n" + + "bne 1b\n" + "4:\n" + + // Jump to odd tail if necessary. + "cbnz %w[oddk], 2f\n" + + // Even tail. 
+ "fmla v8.8h , %[b0].8h, %[a0].h[0]\n" + "fmla v9.8h , %[b0].8h, %[a0].h[1]\n" + "ldr %q[a0a], [%[a_ptr], #16]\n" + "fmla v10.8h, %[b0].8h, %[a0].h[2]\n" + "fmla v11.8h, %[b0].8h, %[a0].h[3]\n" + "ldr %q[b2a], [%[b_ptr], #80]\n" + "fmla v12.8h, %[b0].8h, %[a0].h[4]\n" + "fmla v13.8h, %[b0].8h, %[a0].h[5]\n" + "fmla v14.8h, %[b0].8h, %[a0].h[6]\n" + "fmla v15.8h, %[b0].8h, %[a0].h[7]\n" + + "fmla v16.8h, %[b1].8h, %[a0].h[0]\n" + "fmla v17.8h, %[b1].8h, %[a0].h[1]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "fmla v18.8h, %[b1].8h, %[a0].h[2]\n" + "fmla v19.8h, %[b1].8h, %[a0].h[3]\n" + "fmla v20.8h, %[b1].8h, %[a0].h[4]\n" + "fmla v21.8h, %[b1].8h, %[a0].h[5]\n" + "add %[a_ptr], %[a_ptr], #32\n" + "fmla v22.8h, %[b1].8h, %[a0].h[6]\n" + "fmla v23.8h, %[b1].8h, %[a0].h[7]\n" + + "fmla v24.8h, %[b2].8h, %[a0].h[0]\n" + "fmla v25.8h, %[b2].8h, %[a0].h[1]\n" + "fmla v26.8h, %[b2].8h, %[a0].h[2]\n" + "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" + "fmla v28.8h, %[b2].8h, %[a0].h[4]\n" + "fmla v29.8h, %[b2].8h, %[a0].h[5]\n" + "fmla v30.8h, %[b2].8h, %[a0].h[6]\n" + "fmla v31.8h, %[b2].8h, %[a0].h[7]\n" + + "fmla v8.8h , %[b0a].8h, %[a0a].h[0]\n" + "fmla v16.8h, %[b1a].8h, %[a0a].h[0]\n" + "str q8, [%[c_ptr]]\n" + "fmla v24.8h, %[b2a].8h, %[a0a].h[0]\n" + "str q16, [%[c_ptr], #16]\n" + + "fmla v9.8h , %[b0a].8h, %[a0a].h[1]\n" + "str q24, [%[c_ptr], #32]\n" + "fmla v17.8h, %[b1a].8h, %[a0a].h[1]\n" + "str q9, [%[c_ptr], #48]\n" + "fmla v25.8h, %[b2a].8h, %[a0a].h[1]\n" + "str q17, [%[c_ptr], #64]\n" + + "fmla v10.8h, %[b0a].8h, %[a0a].h[2]\n" + "str q25, [%[c_ptr], #80]\n" + "fmla v18.8h, %[b1a].8h, %[a0a].h[2]\n" + "str q10, [%[c_ptr], #96]\n" + "fmla v26.8h, %[b2a].8h, %[a0a].h[2]\n" + "str q18, [%[c_ptr], #112]\n" + + "fmla v11.8h, %[b0a].8h, %[a0a].h[3]\n" + "str q26, [%[c_ptr], #128]\n" + "fmla v19.8h, %[b1a].8h, %[a0a].h[3]\n" + "str q11, [%[c_ptr], #144]\n" + "fmla v27.8h, %[b2a].8h, %[a0a].h[3]\n" + "str q19, [%[c_ptr], #160]\n" + + "fmla v12.8h, %[b0a].8h, %[a0a].h[4]\n" + "str q27, [%[c_ptr], #176]\n" + "fmla v20.8h, %[b1a].8h, %[a0a].h[4]\n" + "str q12, [%[c_ptr], #192]\n" + "fmla v28.8h, %[b2a].8h, %[a0a].h[4]\n" + "str q20, [%[c_ptr], #208]\n" + + "fmla v13.8h, %[b0a].8h, %[a0a].h[5]\n" + "str q28, [%[c_ptr], #224]\n" + "fmla v21.8h, %[b1a].8h, %[a0a].h[5]\n" + "str q13, [%[c_ptr], #240]\n" + "fmla v29.8h, %[b2a].8h, %[a0a].h[5]\n" + "str q21, [%[c_ptr], #256]\n" + + "fmla v14.8h, %[b0a].8h, %[a0a].h[6]\n" + "str q29, [%[c_ptr], #272]\n" + "fmla v22.8h, %[b1a].8h, %[a0a].h[6]\n" + "str q14, [%[c_ptr], #288]\n" + "fmla v30.8h, %[b2a].8h, %[a0a].h[6]\n" + "str q22, [%[c_ptr], #304]\n" + + "fmla v15.8h, %[b0a].8h, %[a0a].h[7]\n" + "str q30, [%[c_ptr], #320]\n" + "fmla v23.8h, %[b1a].8h, %[a0a].h[7]\n" + "str q15, [%[c_ptr], #336]\n" + "fmla v31.8h, %[b2a].8h, %[a0a].h[7]\n" + "b 3f\n" + + // Odd tail + "2:\n" + "fmla v8.8h , %[b0].8h, %[a0].h[0]\n" + "add %[b_ptr], %[b_ptr], #48\n" + "fmla v16.8h, %[b1].8h, %[a0].h[0]\n" + "add %[a_ptr], %[a_ptr], #16\n" + "str q8, [%[c_ptr]]\n" + "fmla v24.8h, %[b2].8h, %[a0].h[0]\n" + "str q16, [%[c_ptr], #16]\n" + + "fmla v9.8h , %[b0].8h, %[a0].h[1]\n" + "str q24, [%[c_ptr], #32]\n" + "fmla v17.8h, %[b1].8h, %[a0].h[1]\n" + "str q9, [%[c_ptr], #48]\n" + "fmla v25.8h, %[b2].8h, %[a0].h[1]\n" + "str q17, [%[c_ptr], #64]\n" + + "fmla v10.8h, %[b0].8h, %[a0].h[2]\n" + "str q25, [%[c_ptr], #80]\n" + "fmla v18.8h, %[b1].8h, %[a0].h[2]\n" + "str q10, [%[c_ptr], #96]\n" + "fmla v26.8h, %[b2].8h, %[a0].h[2]\n" + "str q18, [%[c_ptr], #112]\n" + + "fmla v11.8h, %[b0].8h, 
%[a0].h[3]\n" + "str q26, [%[c_ptr], #128]\n" + "fmla v19.8h, %[b1].8h, %[a0].h[3]\n" + "str q11, [%[c_ptr], #144]\n" + "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" + "str q19, [%[c_ptr], #160]\n" + + "fmla v12.8h, %[b0].8h, %[a0].h[4]\n" + "str q27, [%[c_ptr], #176]\n" + "fmla v20.8h, %[b1].8h, %[a0].h[4]\n" + "str q12, [%[c_ptr], #192]\n" + "fmla v28.8h, %[b2].8h, %[a0].h[4]\n" + "str q20, [%[c_ptr], #208]\n" + + "fmla v13.8h, %[b0].8h, %[a0].h[5]\n" + "str q28, [%[c_ptr], #224]\n" + "fmla v21.8h, %[b1].8h, %[a0].h[5]\n" + "str q13, [%[c_ptr], #240]\n" + "fmla v29.8h, %[b2].8h, %[a0].h[5]\n" + "str q21, [%[c_ptr], #256]\n" + + "fmla v14.8h, %[b0].8h, %[a0].h[6]\n" + "str q29, [%[c_ptr], #272]\n" + "fmla v22.8h, %[b1].8h, %[a0].h[6]\n" + "str q14, [%[c_ptr], #288]\n" + "fmla v30.8h, %[b2].8h, %[a0].h[6]\n" + "str q22, [%[c_ptr], #304]\n" + + "fmla v15.8h, %[b0].8h, %[a0].h[7]\n" + "str q30, [%[c_ptr], #320]\n" + "fmla v23.8h, %[b1].8h, %[a0].h[7]\n" + "str q15, [%[c_ptr], #336]\n" + "fmla v31.8h, %[b2].8h, %[a0].h[7]\n" + + "3:\n" + "str q23, [%[c_ptr], #352]\n" + "str q31, [%[c_ptr], #368]\n" + "add %[c_ptr], %[c_ptr], #384\n" + : + [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), + [a0] "+w"(a0), [a0a] "+w"(a0a), + [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k), + [b0a] "+w"(b0a), [b1a] "+w"(b1a), [b2a] "+w"(b2a) + : [oddk] "r"(oddk) + : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"); + } + } +} + +} // namespace arm_gemm + +#endif // __aarch64__ && __ARM_FEATURE_FP16_SCALAR_ARITHMETIC diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp new file mode 100644 index 0000000000..91a9e8de60 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#pragma once + +#ifdef __aarch64__ + +namespace arm_gemm +{ +// Actual kernel implementations +void a64_sgemm_asimd_12x8(const float *, const float *, float *, int, int, int); +void a64_sgemm_asimd_12x8_a53(const float *, const float *, float *, int, int, int); +void a64_sgemm_asimd_12x8_a55(const float *, const float *, float *, int, int, int); +void a64_sgemm_asimd_12x8_a55r1(const float *, const float *, float *, int, int, int); + +// 12x8 SGEMM "strategy" class. +// +// This describes the characteristics of a family of kernels, in terms of +// the required interleave properties and the output block size. +// +// All kernels in the family must share these characteristics. The actual +// kernel to be used can be chosen at runtime, based on the CPU_type +// structure. +class sgemm_12x8 +{ +public: + typedef float operand_type; + typedef float result_type; + + typedef void (*kern_type)(const float *, const float *, float *, int, int, int); + + /* Describes the data layout for A input */ + static const int A_interleave = 8; + static const int A_block = 1; + static const int A_transpose = 0; + + /* Same for B input */ + static const int B_interleave = 12; + static const int B_block = 1; + static const int B_transpose = 1; + + /* Kernel blocking parameters */ + static const int out_width = 12; + static const int out_height = 8; + static const int k_unroll = 1; + + kern_type kernel = a64_sgemm_asimd_12x8; + + sgemm_12x8(const CPUInfo *ci) + { + // Select specific kernel if available + switch(ci->get_cpu_model()) + { + case CPUModel::A53: + kernel = a64_sgemm_asimd_12x8_a53; + break; + + case CPUModel::A55r0: + kernel = a64_sgemm_asimd_12x8_a55; + break; + + case CPUModel::A55r1: + kernel = a64_sgemm_asimd_12x8_a55r1; + break; + + default: + /* Generic kernel is initialized by default. */ + break; + } + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp new file mode 100644 index 0000000000..618ebc733c --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp @@ -0,0 +1,363 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
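As a usage note for the strategy classes introduced by this patch (sgemm_12x8 above, hgemm_24x8, gemm_u8_4x4 and friends): a driver that owns the packed panels can be written generically against this interface, roughly as sketched below. The template and its name are illustrative only, not the library's actual driver code.

    // Hypothetical driver fragment, templated on a strategy class such as sgemm_12x8.
    template <typename strategy>
    void run_panels(const CPUInfo *ci,
                    const typename strategy::operand_type *a_panel,
                    const typename strategy::operand_type *b_panel,
                    typename strategy::result_type *c_panel,
                    int ablocks, int bblocks, int K)
    {
        strategy strat(ci); // the constructor picks a CPU-specific kernel (A53/A55r0/A55r1 or generic)
        strat.kernel(a_panel, b_panel, c_panel, ablocks, bblocks, K);
    }

The A_interleave/A_block/A_transpose constants (and their B counterparts) tell such a driver how to pack the operands before the call; out_width, out_height and k_unroll give the output tile size and the K granularity the kernel expects.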
+ */ +#ifdef __aarch64__ + +#include + +#include "../../asmlib.hpp" + +namespace arm_gemm +{ +void a64_sgemm_asimd_12x8_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) +{ + const float *a_ptr = Apanel; + float *c_ptr = Cpanel; + + for(int yb = 0; yb < ablocks; yb++) + { + const float *a_ptr0 = a_ptr; + const float *b_ptr = Bpanel; + + for(int xb = 0; xb < bblocks; xb++) + { + a_ptr = a_ptr0; + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. + int oddk = (K & 1); + int k = ((K + 1) / 2) - 1; + + register float32x4_t a0 asm("v0"); + register float32x4_t a1 asm("v1"); + register float32x4_t b0 asm("v2"); + register float32x4_t b1 asm("v3"); + register float32x4_t b2 asm("v4"); + register float32x4_t a0a asm("v5"); + register float32x4_t a1a asm("v6"); + + __asm __volatile( + // Initialize result registers, load initial operands, prime prefetches. + "movi v8.4s, #0x0\n" + "ldr %q[a0], [%[a_ptr]]\n" + "movi v9.4s, #0x0\n" + "ldr %q[b0], [%[b_ptr]]\n" + "movi v10.4s, #0x0\n" + "ldr %q[a1], [%[a_ptr], #16]\n" + "movi v11.4s, #0x0\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + "movi v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v15.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #128]") "movi v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi v18.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #192]") "movi v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") "movi v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") "movi v21.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #384]") + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + + // Skip loop if we are doing zero iterations of it. 
+ "cbz %w[k], 4f\n" + + "1:\n" + // Unroll 0 + "ldr %d[b2], [%[b_ptr], #32]\n" + "nop\n" + "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" + "ldr x20, [%[b_ptr], #40]\n" + "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" + "subs %w[k], %w[k], #1\n" + "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" + + "ldr %d[a0a], [%[a_ptr], #32]\n" + "ins %[b2].d[1], x20\n" + "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" + "ldr x20, [%[a_ptr], #40]\n" + "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" + "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" + + "ldr %d[a1a], [%[a_ptr], #48]\n" + "ins %[a0a].d[1], x20\n" + "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" + "ldr x20, [%[a_ptr], #56]\n" + "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" + "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" + + "ldr %d[b0], [%[b_ptr], #48]\n" + "ins %[a1a].d[1], x20\n" + "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" + "ldr x20, [%[b_ptr], #56]\n" + "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" + "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" + + ASM_PREFETCH("[%[a_ptr], #320]") + "ins %[b0].d[1], x20\n" + "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" + "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" + "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" + + ASM_PREFETCH("[%[b_ptr], #448]") + "nop\n" + "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" + "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" + "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" + + "ldr %d[b1], [%[b_ptr], #64]\n" + "nop\n" + "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" + "ldr x20, [%[b_ptr], #72]\n" + "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" + "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" + + ASM_PREFETCH("[%[b_ptr], #512]") + "ins %[b1].d[1], x20\n" + "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" + "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" + "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" + + // Unroll 1 + "ldr %d[b2], [%[b_ptr], #80]\n" + "nop\n" + "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n" + "ldr x20, [%[b_ptr], #88]\n" + "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n" + "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n" + + "ldr %d[a0], [%[a_ptr], #64]\n" + "ins %[b2].d[1], x20\n" + "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" + "ldr x20, [%[a_ptr], #72]\n" + "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n" + "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n" + + "ldr %d[a1], [%[a_ptr], #80]\n" + "ins %[a0].d[1], x20\n" + "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" + "ldr x20, [%[a_ptr], #88]\n" + "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n" + "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" + + "ldr %d[b0], [%[b_ptr], #96]\n" + "ins %[a1].d[1], x20\n" + "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" + "ldr x20, [%[b_ptr], #104]\n" + "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n" + "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n" + + "nop\n" + "ins %[b0].d[1], x20\n" + "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n" + "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" + "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n" + + "nop\n" + "nop\n" + "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" + "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n" + "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n" + + "ldr %d[b1], [%[b_ptr], #112]\n" + "nop\n" + "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" + "ldr x20, [%[b_ptr], #120]\n" + "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n" + "add %[a_ptr], %[a_ptr], #64\n" + "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" + "add %[b_ptr], %[b_ptr], #96\n" + + "nop\n" + "ins %[b1].d[1], x20\n" + "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n" + "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n" + "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n" + + "bne 1b\n" + + // Branch here if K=1 or 2. Do the right thing for odd/even at the end. + "4:\n" + "cbnz %w[oddk], 2f\n" + + // Detached final iteration. 
(even K) + "ldr %d[b2], [%[b_ptr], #32]\n" + "nop\n" + "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" + "ldr x20, [%[b_ptr], #40]\n" + "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" + "subs %w[k], %w[k], #1\n" + "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" + + "ldr %d[a0a], [%[a_ptr], #32]\n" + "ins %[b2].d[1], x20\n" + "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" + "ldr x20, [%[a_ptr], #40]\n" + "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" + "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" + + "ldr %d[a1a], [%[a_ptr], #48]\n" + "ins %[a0a].d[1], x20\n" + "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" + "ldr x20, [%[a_ptr], #56]\n" + "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" + "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" + + "ldr %d[b0], [%[b_ptr], #48]\n" + "ins %[a1a].d[1], x20\n" + "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" + "ldr x20, [%[b_ptr], #56]\n" + "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" + "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" + + "ins %[b0].d[1], x20\n" + "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" + "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" + "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" + + "nop\n" + "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" + "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" + "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" + + "ldr %d[b1], [%[b_ptr], #64]\n" + "nop\n" + "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" + "ldr x20, [%[b_ptr], #72]\n" + "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" + "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" + + "ins %[b1].d[1], x20\n" + "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" + "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" + "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" + + "ldr %d[b2], [%[b_ptr], #80]\n" + "nop\n" + "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n" + "ldr x20, [%[b_ptr], #88]\n" + "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n" + "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n" + + "ins %[b2].d[1], x20\n" + "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" + "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n" + "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n" + "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" + "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n" + "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" + "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" + "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n" + "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n" + "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n" + "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" + "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n" + "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" + "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n" + "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n" + "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" + "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n" + "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" + "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n" + "add %[a_ptr], %[a_ptr], #64\n" + "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n" + "b 3f\n" + + // Detached final iteration. 
(odd K) + "2:\n" + "ldr %d[b2], [%[b_ptr], #32]\n" + "nop\n" + "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" + "ldr x20, [%[b_ptr], #40]\n" + "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" + "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" + + "ins %[b2].d[1], x20\n" + "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" + "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" + "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" + "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" + "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" + "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" + "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" + "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" + "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" + "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" + "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" + "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" + "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" + "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" + "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" + "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" + "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" + "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" + "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" + "add %[a_ptr], %[a_ptr], #32\n" + "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" + "add %[b_ptr], %[b_ptr], #48\n" + "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" + + // Common tail + "3:\n" + "str q8, [%[c_ptr]]\n" + "str q16, [%[c_ptr], #16]\n" + "str q24, [%[c_ptr], #32]\n" + "str q9, [%[c_ptr], #48]\n" + "str q17, [%[c_ptr], #64]\n" + "str q25, [%[c_ptr], #80]\n" + "str q10, [%[c_ptr], #96]\n" + "str q18, [%[c_ptr], #112]\n" + "str q26, [%[c_ptr], #128]\n" + "str q11, [%[c_ptr], #144]\n" + "str q19, [%[c_ptr], #160]\n" + "str q27, [%[c_ptr], #176]\n" + "str q12, [%[c_ptr], #192]\n" + "str q20, [%[c_ptr], #208]\n" + "str q28, [%[c_ptr], #224]\n" + "str q13, [%[c_ptr], #240]\n" + "str q21, [%[c_ptr], #256]\n" + "str q29, [%[c_ptr], #272]\n" + "str q14, [%[c_ptr], #288]\n" + "str q22, [%[c_ptr], #304]\n" + "str q30, [%[c_ptr], #320]\n" + "str q15, [%[c_ptr], #336]\n" + "str q23, [%[c_ptr], #352]\n" + "str q31, [%[c_ptr], #368]\n" + "add %[c_ptr], %[c_ptr], #384\n" + : + [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), + [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a), + [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k) + : [oddk] "r"(oddk) + : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"); + } + } +} + +} // namespace arm_gemm + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp new file mode 100644 index 0000000000..4ca25eb5ba --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp @@ -0,0 +1,356 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __aarch64__ + +#include + +#include "../../asmlib.hpp" + +namespace arm_gemm +{ +void a64_sgemm_asimd_12x8_a55(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) +{ + const float *a_ptr = Apanel; + float *c_ptr = Cpanel; + + for(int yb = 0; yb < ablocks; yb++) + { + const float *a_ptr0 = a_ptr; + const float *b_ptr = Bpanel; + + for(int xb = 0; xb < bblocks; xb++) + { + a_ptr = a_ptr0; + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. + int oddk = (K & 1); + int k = ((K + 1) / 2) - 1; + + register float32x4_t a0 asm("v0"); + register float32x4_t a1 asm("v1"); + register float32x4_t b0 asm("v2"); + register float32x4_t b1 asm("v3"); + register float32x4_t b2 asm("v4"); + register float32x4_t a0a asm("v5"); + register float32x4_t a1a asm("v6"); + + __asm __volatile( + // Initialize result registers, load initial operands, prime prefetches. + "movi v8.4s, #0x0\n" + "ldr %q[a0], [%[a_ptr]]\n" + "movi v9.4s, #0x0\n" + "ldr %q[b0], [%[b_ptr]]\n" + "movi v10.4s, #0x0\n" + "ldr %q[a1], [%[a_ptr], #16]\n" + "movi v11.4s, #0x0\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + "movi v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v15.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #128]") "movi v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi v18.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #192]") "movi v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") "movi v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") "movi v21.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #384]") + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + + // Skip loop if we are doing zero iterations of it. 
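
A quick standalone check of the odd-length fix-up above, to make the accounting explicit: each main-loop pass consumes two K steps, and one final iteration is always peeled into the even/odd tail code. This sketch is illustrative only (the helper name is made up; it is not part of the kernel):

    #include <cassert>

    // k main-loop passes (2 K steps each) plus a peeled tail of 2 (even K) or 1 (odd K).
    void check_k_split(int K)
    {
        int oddk = (K & 1);
        int k    = ((K + 1) / 2) - 1;
        assert(2 * k + (oddk ? 1 : 2) == K); // e.g. K=5: k=2, oddk=1; K=6: k=2, oddk=0
    }

    int main()
    {
        for(int K = 1; K <= 16; K++) check_k_split(K);
    }
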
+ "cbz %w[k], 4f\n" + + "1:\n" + // Unroll 0 + "ldr %d[b2], [%[b_ptr], #32]\n" + "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" + + "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" + "ldr x20, [%[b_ptr], #40]\n" + "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" + "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" + "subs %w[k], %w[k], #1\n" + + "ldr %d[a0a], [%[a_ptr], #32]\n" + "ins %[b2].d[1], x20\n" + + "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" + "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" + "ldr x20, [%[a_ptr], #40]\n" + "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" + "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" + + "ldr %d[a1a], [%[a_ptr], #48]\n" + "ins %[a0a].d[1], x20\n" + + "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" + "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" + "ldr x20, [%[a_ptr], #56]\n" + "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" + "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" + + "ldr %d[b0], [%[b_ptr], #48]\n" + "ins %[a1a].d[1], x20\n" ASM_PREFETCH("[%[a_ptr], #320]") + "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" + "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" + "ldr x20, [%[b_ptr], #56]\n" + "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" + "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" + + "ldr %d[b1], [%[b_ptr], #64]\n" + "ins %[b0].d[1], x20\n" + + "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" + "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" + "ldr x20, [%[b_ptr], #72]\n" + "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" + "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" ASM_PREFETCH("[%[b_ptr], #448]") + + "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" + "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" ASM_PREFETCH("[%[b_ptr], #512]") + "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" + "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" + + // Unroll 1 + "ldr %d[b2], [%[b_ptr], #80]\n" + "ins %[b1].d[1], x20\n" + + "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n" + "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n" + "ldr x20, [%[b_ptr], #88]\n" + "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n" + "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" + + "ldr %d[a0], [%[a_ptr], #64]\n" + "ins %[b2].d[1], x20\n" + + "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n" + "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n" + "ldr x20, [%[a_ptr], #72]\n" + "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" + "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n" + + "ldr %d[a1], [%[a_ptr], #80]\n" + "ins %[a0].d[1], x20\n" + + "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" + "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" + "ldr x20, [%[a_ptr], #88]\n" + "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n" + "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n" + + "ldr %d[b0], [%[b_ptr], #96]\n" + "ins %[a1].d[1], x20\n" + + "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n" + "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" + "ldr x20, [%[b_ptr], #104]\n" + "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n" + "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" + + "ldr %d[b1], [%[b_ptr], #112]\n" + "ins %[b0].d[1], x20\n" + + "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n" + "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n" + "ldr x20, [%[b_ptr], #120]\n" + "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" + "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n" + "add %[a_ptr], %[a_ptr], #64\n" + + "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" + "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n" + "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n" + + "ldr %d[b2], [%[b_ptr], #32]\n" + "ins %[b1].d[1], x20\n" + + "bne 1b\n" + + // Branch here if K=1 or 2. Do the right thing for odd/even at the end. + "4:\n" + "cbnz %w[oddk], 2f\n" + "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" + + // Detached final iteration. 
(even K) + "ldr x20, [%[b_ptr], #40]\n" + "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" + "subs %w[k], %w[k], #1\n" + "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" + "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" + + "ldr %d[a0a], [%[a_ptr], #32]\n" + "ins %[b2].d[1], x20\n" + + "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" + "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" + "ldr x20, [%[a_ptr], #40]\n" + "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" + "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" + + "ldr %d[a1a], [%[a_ptr], #48]\n" + "ins %[a0a].d[1], x20\n" + + "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" + "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" + "ldr x20, [%[a_ptr], #56]\n" + "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" + "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" + + "ldr %d[b0], [%[b_ptr], #48]\n" + "ins %[a1a].d[1], x20\n" + + "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" + "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" + "ldr x20, [%[b_ptr], #56]\n" + "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" + "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" + + "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" + "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" + "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" + "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" + + "ldr %d[b1], [%[b_ptr], #64]\n" + "ins %[b0].d[1], x20\n" + + "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" + "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" + "ldr x20, [%[b_ptr], #72]\n" + "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" + "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" + + "ldr %d[b2], [%[b_ptr], #80]\n" + "ins %[b1].d[1], x20\n" + + "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n" + "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n" + "ldr x20, [%[b_ptr], #88]\n" + "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n" + + "ins %[b2].d[1], x20\n" + "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" + "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n" + "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n" + "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" + "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n" + "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" + "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" + "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n" + "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n" + "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n" + "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" + "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n" + "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" + "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n" + "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n" + "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" + "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n" + "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" + "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n" + "add %[a_ptr], %[a_ptr], #64\n" + "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n" + "b 3f\n" + + // Detached final iteration. 
(odd K) + "2:\n" + + "ldr %d[b2], [%[b_ptr], #32]\n" + "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" + + "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" + "ldr x20, [%[b_ptr], #40]\n" + "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" + "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" + "ins %[b2].d[1], x20\n" + "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" + "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" + "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" + "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" + "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" + "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" + "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" + "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" + "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" + "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" + "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" + "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" + "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" + "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" + "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" + "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" + "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" + "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" + "add %[a_ptr], %[a_ptr], #32\n" + "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" + "add %[b_ptr], %[b_ptr], #48\n" + "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" + + // Common tail + "3:\n" + "str q8, [%[c_ptr]]\n" + "str q16, [%[c_ptr], #16]\n" + "str q24, [%[c_ptr], #32]\n" + "str q9, [%[c_ptr], #48]\n" + "str q17, [%[c_ptr], #64]\n" + "str q25, [%[c_ptr], #80]\n" + "str q10, [%[c_ptr], #96]\n" + "str q18, [%[c_ptr], #112]\n" + "str q26, [%[c_ptr], #128]\n" + "str q11, [%[c_ptr], #144]\n" + "str q19, [%[c_ptr], #160]\n" + "str q27, [%[c_ptr], #176]\n" + "str q12, [%[c_ptr], #192]\n" + "str q20, [%[c_ptr], #208]\n" + "str q28, [%[c_ptr], #224]\n" + "str q13, [%[c_ptr], #240]\n" + "str q21, [%[c_ptr], #256]\n" + "str q29, [%[c_ptr], #272]\n" + "str q14, [%[c_ptr], #288]\n" + "str q22, [%[c_ptr], #304]\n" + "str q30, [%[c_ptr], #320]\n" + "str q15, [%[c_ptr], #336]\n" + "str q23, [%[c_ptr], #352]\n" + "str q31, [%[c_ptr], #368]\n" + "add %[c_ptr], %[c_ptr], #384\n" + : + [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), + [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a), + [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k) + : [oddk] "r"(oddk) + : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"); + } + } +} + +} // namespace arm_gemm + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp new file mode 100644 index 0000000000..89fe6ac7ea --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp @@ -0,0 +1,342 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __aarch64__ + +#include + +#include "../../asmlib.hpp" + +namespace arm_gemm +{ +void a64_sgemm_asimd_12x8_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, const int ablocks, const int bblocks, const int K) +{ + const float *a_ptr = Apanel; + float *c_ptr = Cpanel; + + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. + int oddk = (K & 1); + int k_iters = ((K + 1) / 2) - 1; + + for(int yb = 0; yb < ablocks; yb++) + { + const float *a_ptr0 = a_ptr; + const float *b_ptr = Bpanel; + + for(int xb = 0; xb < bblocks; xb++) + { + a_ptr = a_ptr0; + int k = k_iters; + + register float32x4_t a0 asm("v0"); + register float32x4_t a1 asm("v1"); + register float32x4_t b0 asm("v2"); + register float32x4_t b1 asm("v3"); + register float32x4_t b2 asm("v4"); + register float32x4_t a0a asm("v5"); + register float32x4_t a1a asm("v6"); + + __asm __volatile( + // Initialize result registers, load initial operands, prime prefetches. + "movi v8.4s, #0x0\n" + "ldr %q[a0], [%[a_ptr]]\n" + "movi v9.4s, #0x0\n" + "ldr %q[b0], [%[b_ptr]]\n" + "movi v10.4s, #0x0\n" + "ldr %q[a1], [%[a_ptr], #16]\n" + "movi v11.4s, #0x0\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + "movi v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v15.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #128]") "movi v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]") + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #384]") + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #448]") + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #384]") + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #512]") + + // The loop is offset by these two instructions which must + // always be executed. + "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" + "ldr %d[b2], [%[b_ptr], #32]\n" + + // Skip loop if we are doing zero iterations of it. 
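
The "offset" noted above is loop rotation: the first multiply and the next B-panel load of an iteration are issued at the tail of the previous one (and once before the loop), so loads stay ahead of the arithmetic. A toy scalar sketch of the same shape, purely illustrative and not derived from the kernel:

    #include <cstdio>

    int main()
    {
        const int data[6] = {1, 2, 3, 4, 5, 6};
        int next = data[0]; // primed before the loop (the "offset" step)
        int sum  = 0;

        for(int i = 0; i < 6; i++)
        {
            int cur = next;          // operand fetched on the previous pass
            if(i + 1 < 6)
            {
                next = data[i + 1];  // start the next fetch early
            }
            sum += cur * cur;        // this pass's real work
        }
        std::printf("%d\n", sum);    // 91
    }
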
+ "cbz %w[k], 4f\n" + + "1:\n" + // Unroll 0 + "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" + "ldr x20, [%[b_ptr], #40]\n" + "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" + "subs %w[k], %w[k], #1\n" + "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" + "ldr %d[a0a], [%[a_ptr], #32]\n" + + "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" + "ins %[b2].d[1], x20\n" + "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" + "ldr x20, [%[a_ptr], #40]\n" + "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" + "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" + "ldr %d[a1a], [%[a_ptr], #48]\n" + + "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" + "ins %[a0a].d[1], x20\n" + "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" + "ldr x20, [%[a_ptr], #56]\n" + "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" + "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" + "ldr %d[b0], [%[b_ptr], #48]\n" + + "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" + "ins %[a1a].d[1], x20\n" + "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" + "ldr x20, [%[b_ptr], #56]\n" + "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" + "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" + "ldr %d[b1], [%[b_ptr], #64]\n" + + "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" + "ins %[b0].d[1], x20\n" + "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" + "ldr x20, [%[b_ptr], #72]\n" + "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" + "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" ASM_PREFETCH("[%[a_ptr], #448]") + + "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" + "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" ASM_PREFETCH("[%[b_ptr], #576]") + "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" + "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" + + // Unroll 1 + "ldr %d[b2], [%[b_ptr], #80]\n" + + "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n" + "ins %[b1].d[1], x20\n" + "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n" + "ldr x20, [%[b_ptr], #88]\n" + "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n" + "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" + "ldr %d[a0], [%[a_ptr], #64]\n" + + "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n" + "ins %[b2].d[1], x20\n" + "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n" + "ldr x20, [%[a_ptr], #72]\n" + "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" + "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n" + "ldr %d[a1], [%[a_ptr], #80]\n" + + "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" + "ins %[a0].d[1], x20\n" + "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" + "ldr x20, [%[a_ptr], #88]\n" + "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n" + "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n" + "ldr %d[b0], [%[b_ptr], #96]\n" + + "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n" + "ins %[a1].d[1], x20\n" + "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" + "ldr x20, [%[b_ptr], #104]\n" + "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n" + "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" + "ldr %d[b1], [%[b_ptr], #112]\n" + + "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n" + "ins %[b0].d[1], x20\n" + "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n" + "ldr x20, [%[b_ptr], #120]\n" + "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" + + "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n" + "add %[a_ptr], %[a_ptr], #64\n" + + "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" ASM_PREFETCH("[%[b_ptr], #640]") + "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n" + "ins %[b1].d[1], x20\n" + "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n" + "ldr %d[b2], [%[b_ptr], #32]\n" + + "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" + "b.ne 1b\n" + + // Branch here if K=1 or 2. Do the right thing for odd/even at the end. + "4:\n" + + // Start final iteration - branch off to "odd" code before we load a0a. 
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" + "ldr x20, [%[b_ptr], #40]\n" + "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" + "cbnz %w[oddk], 2f\n" + + // Even K continuation + "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" + "ldr %d[a0a], [%[a_ptr], #32]\n" + + "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" + "ins %[b2].d[1], x20\n" + "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" + "ldr x20, [%[a_ptr], #40]\n" + "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" ASM_PREFETCHW("[%[c_ptr]]") + "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" + "ldr %d[a1a], [%[a_ptr], #48]\n" + + "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" + "ins %[a0a].d[1], x20\n" + "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" + "ldr x20, [%[a_ptr], #56]\n" + "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" + "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" + "ldr %d[b0], [%[b_ptr], #48]\n" + + "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" + "ins %[a1a].d[1], x20\n" + "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" + "ldr x20, [%[b_ptr], #56]\n" + "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" ASM_PREFETCHW("[%[c_ptr], #64]") + "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" + + "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" + "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" ASM_PREFETCHW("[%[c_ptr], #128]") + "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" + "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" + "ldr %d[b1], [%[b_ptr], #64]\n" + + "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" + "ins %[b0].d[1], x20\n" + "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" + "ldr x20, [%[b_ptr], #72]\n" + "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" ASM_PREFETCHW("[%[c_ptr], #192]") + "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" + "ldr %d[b2], [%[b_ptr], #80]\n" + + "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n" + "ins %[b1].d[1], x20\n" + "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n" + "ldr x20, [%[b_ptr], #88]\n" + "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n" + "ins %[b2].d[1], x20\n" + + "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]") + "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n" + "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n" + "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" ASM_PREFETCHW("[%[c_ptr], #320]") + "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n" + "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]") + "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" + "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]") + "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n" + "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n" + "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]") + "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n" + "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]") + "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n" + "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n" + "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #640]") + "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n" + "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]") + "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n" + "add %[a_ptr], %[a_ptr], #64\n" + "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n" + "b 3f\n" + + // Odd K continuation + "2:\n" + "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" ASM_PREFETCHW("[%[c_ptr]]") + "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" + "ins %[b2].d[1], x20\n" + "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]") + "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" + "add %[a_ptr], %[a_ptr], #32\n" + "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]") + "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" + "add %[b_ptr], %[b_ptr], #48\n" + "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]") + "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" 
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]") + "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" + "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]") + "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" + "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]") + "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" + "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]") + "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" + "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]") "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]") "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" + ASM_PREFETCHWL2("[%[c_ptr], #640]") "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]") + "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" + + // Common tail + "3:\n" + "str q8, [%[c_ptr]]\n" + "str q16, [%[c_ptr], #16]\n" + "str q24, [%[c_ptr], #32]\n" + "str q9, [%[c_ptr], #48]\n" + "str q17, [%[c_ptr], #64]\n" + "str q25, [%[c_ptr], #80]\n" + "str q10, [%[c_ptr], #96]\n" + "str q18, [%[c_ptr], #112]\n" + "str q26, [%[c_ptr], #128]\n" + "str q11, [%[c_ptr], #144]\n" + "str q19, [%[c_ptr], #160]\n" + "str q27, [%[c_ptr], #176]\n" + "str q12, [%[c_ptr], #192]\n" + "str q20, [%[c_ptr], #208]\n" + "str q28, [%[c_ptr], #224]\n" + "str q13, [%[c_ptr], #240]\n" + "str q21, [%[c_ptr], #256]\n" + "str q29, [%[c_ptr], #272]\n" + "str q14, [%[c_ptr], #288]\n" + "str q22, [%[c_ptr], #304]\n" + "str q30, [%[c_ptr], #320]\n" + "str q15, [%[c_ptr], #336]\n" + "str q23, [%[c_ptr], #352]\n" + "str q31, [%[c_ptr], #368]\n" + "add %[c_ptr], %[c_ptr], #384\n" + : + [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), + [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a), + [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k) + : [oddk] "r"(oddk) + : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"); + } + } +} + +} // namespace arm_gemm + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp new file mode 100644 index 0000000000..42e870e814 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp @@ -0,0 +1,350 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include + +#include "../../asmlib.hpp" + +// Kernel implementation. +// +// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. +// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order. +// Assume that "Cpanel" points to a chunk of C output blocks (each size +// 12x8), the chunks being arranged in a row major fashion. +// +// Note that the intent of this is that either ablocks or bblocks will be 1 +// - this construction allows the output loop to proceed in either order. + +namespace arm_gemm +{ +void a64_sgemm_asimd_12x8_jumps(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K, long int row_jump = 0, long int block_jump = 0) +{ + const float *a_ptr = Apanel; + float *c_ptr = Cpanel; + + for(int yb = 0; yb < ablocks; yb++) + { + const float *a_ptr0 = a_ptr; + const float *b_ptr = Bpanel; + + for(int xb = 0; xb < bblocks; xb++) + { + a_ptr = a_ptr0; + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. + int oddk = (K & 1); + int k = ((K + 1) / 2) - 1; + + register float32x4_t a0 asm("v0"); + register float32x4_t a1 asm("v1"); + register float32x4_t b0 asm("v2"); + register float32x4_t b1 asm("v3"); + register float32x4_t b2 asm("v4"); + register float32x4_t a0a asm("v5"); + register float32x4_t a1a asm("v6"); + + __asm __volatile( + // Initialize result registers, load initial operands, prime prefetches. + "movi v8.4s, #0x0\n" + "ldr %q[a0], [%[a_ptr]]\n" + "movi v9.4s, #0x0\n" + "ldr %q[b0], [%[b_ptr]]\n" + "movi v10.4s, #0x0\n" + "ldr %q[a1], [%[a_ptr], #16]\n" + "movi v11.4s, #0x0\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + "movi v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v15.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #128]") "movi v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi v18.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #192]") "movi v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") "movi v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") "movi v21.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #384]") + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + + // Skip loop if we are doing zero iterations of it. 
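
Per the header comment, each A block is 8*K floats, each B block is 12*K floats, and each output chunk is a dense 96-float 12x8 tile (matching the 64-byte, 96-byte and 384-byte pointer increments in the loop that follows). A scalar model of one tile, assuming the per-K-step interleaving implied by the loads (illustration only, not the kernel's code path):

    // ablock: K steps of 8 interleaved A values; bblock: K steps of 12 interleaved B values.
    void reference_12x8_tile(const float *ablock, const float *bblock, float *ctile, int K)
    {
        for(int row = 0; row < 8; row++)
        {
            for(int col = 0; col < 12; col++)
            {
                float acc = 0.0f;
                for(int k = 0; k < K; k++)
                {
                    acc += ablock[k * 8 + row] * bblock[k * 12 + col];
                }
                ctile[row * 12 + col] = acc; // tile stored row major, 12 outputs per row
            }
        }
    }
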
+ "cbz %w[k], 4f\n" + + // Loop proper + "1:\n" + "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" + "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" + "add %[b_ptr], %[b_ptr], %[row_jump]\n" + "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" + "ldr %q[a0a], [%[a_ptr], #32]\n" + "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" + "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" + "ldr %q[a1a], [%[a_ptr], #48]\n" + "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" + "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" + "ldr %q[b0], [%[b_ptr], #48]\n" + + "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" + "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" ASM_PREFETCH("[%[a_ptr], #320]") + "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" + "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" + "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" + "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" + "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" + "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" + "ldr %q[b1], [%[b_ptr], #64]\n" + + "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" + "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" ASM_PREFETCH("[%[b_ptr], #448]") + "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" + "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" + "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" + "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" + "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" + "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" + "ldr %q[b2], [%[b_ptr], #80]\n" + + "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n" + "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n" + "ldr %q[a0], [%[a_ptr], #64]\n" + "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n" + "add %[b_ptr], %[b_ptr], %[row_jump]\n" + "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" + "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n" + "ldr %q[a1], [%[a_ptr], #80]\n" + "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n" + "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" + "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n" + "ldr %q[b0], [%[b_ptr], #96]\n" + + "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" + "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" ASM_PREFETCH("[%[b_ptr], #512]") + "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n" + "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n" + "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n" + "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" + "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n" + "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" + "ldr %q[b1], [%[b_ptr], #112]\n" + + "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n" + "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n" + "add %[a_ptr], %[a_ptr], #64\n" + "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" + "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" + "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n" + "subs %w[k], %w[k], #1\n" + "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n" + "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n" + "bne 1b\n" + + // Target to use when K is 1 or 2 (i.e. 
zero iterations of main loop) + "4:\n" + + // Branch to alternative tail for odd K + "cbnz %w[oddk], 2f\n" + + // Detached final iteration (even K) + "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" + "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" + "add %[b_ptr], %[b_ptr], %[row_jump]\n" + "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" + "ldr %q[a0a], [%[a_ptr], #32]\n" + "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" + "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" + "ldr %q[a1a], [%[a_ptr], #48]\n" + "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" + "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" + "ldr %q[b0], [%[b_ptr], #48]\n" + + "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" + "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" + "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" + "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" + "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" + "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" + "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" + "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" + "ldr %q[b1], [%[b_ptr], #64]\n" + + "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" + "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" + "add %[a_ptr], %[a_ptr], #64\n" + "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" + "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" + "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" + "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" + "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" + "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" + "ldr %q[b2], [%[b_ptr], #80]\n" + + "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n" + "add %[b_ptr], %[b_ptr], %[block_jump]\n" + "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n" + "add %[b_ptr], %[b_ptr], %[row_jump]\n" + "str q8, [%[c_ptr], #0]\n" + "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" + "str q16, [%[c_ptr], #16]\n" + "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n" + "str q24, [%[c_ptr], #32]\n" + + "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n" + "str q9, [%[c_ptr], #48]\n" + "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n" + "str q17, [%[c_ptr], #64]\n" + "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n" + "str q25, [%[c_ptr], #80]\n" + "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" + "str q10, [%[c_ptr], #96]\n" + + "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" + "str q18, [%[c_ptr], #112]\n" + "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n" + "str q26, [%[c_ptr], #128]\n" + "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n" + "str q11, [%[c_ptr], #144]\n" + + "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n" + "str q19, [%[c_ptr], #160]\n" + "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n" + "str q27, [%[c_ptr], #176]\n" + "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" + "str q12, [%[c_ptr], #192]\n" + + "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n" + "str q20, [%[c_ptr], #208]\n" + "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" + "str q28, [%[c_ptr], #224]\n" + "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n" + "str q13, [%[c_ptr], #240]\n" + + "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" + "str q21, [%[c_ptr], #256]\n" + "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n" + "str q29, [%[c_ptr], #272]\n" + "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n" + "str q14, [%[c_ptr], #288]\n" + + "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n" + "str q22, [%[c_ptr], #304]\n" + "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" + "str q30, [%[c_ptr], #320]\n" + "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n" + "str q15, [%[c_ptr], #336]\n" + + "b 3f\n" + + // Detached final iteration (odd K) + "2:\n" + "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" + "add %[b_ptr], %[b_ptr], %[row_jump]\n" + "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" + "str q8, [%[c_ptr], #0]\n" + "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" + "str q16, 
[%[c_ptr], #16]\n" + "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" + "add %[b_ptr], %[b_ptr], #48\n" + "add %[a_ptr], %[a_ptr], #32\n" + "str q24, [%[c_ptr], #32]\n" + "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" + "str q9, [%[c_ptr], #48]\n" + + "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" + "str q17, [%[c_ptr], #64]\n" + "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" + "str q25, [%[c_ptr], #80]\n" + "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" + "str q10, [%[c_ptr], #96]\n" + + "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" + "str q18, [%[c_ptr], #112]\n" + "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" + "str q26, [%[c_ptr], #128]\n" + "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" + "str q11, [%[c_ptr], #144]\n" + + "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" + "str q19, [%[c_ptr], #160]\n" + "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" + "str q27, [%[c_ptr], #176]\n" + "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" + "str q12, [%[c_ptr], #192]\n" + + "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" + "str q20, [%[c_ptr], #208]\n" + "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" + "str q28, [%[c_ptr], #224]\n" + "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" + "str q13, [%[c_ptr], #240]\n" + + "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" + "str q21, [%[c_ptr], #256]\n" + "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" + "str q29, [%[c_ptr], #272]\n" + "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" + "str q14, [%[c_ptr], #288]\n" + + "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" + "str q22, [%[c_ptr], #304]\n" + "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" + "str q30, [%[c_ptr], #320]\n" + "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" + "str q15, [%[c_ptr], #336]\n" + + // Common tail + "3:\n" + "str q23, [%[c_ptr], #352]\n" + "str q31, [%[c_ptr], #368]\n" + "add %[c_ptr], %[c_ptr], #384\n" + : + [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), + [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a), + [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k) + : [oddk] "r"(oddk), [row_jump] "r"(row_jump), [block_jump] "r"(block_jump) + : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"); + } + } +} + +void a64_sgemm_asimd_12x8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) +{ + a64_sgemm_asimd_12x8_jumps(Apanel, Bpanel, Cpanel, ablocks, bblocks, K, 0, 0); +} + +} // namespace arm_gemm + +#endif \ No newline at end of file diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp new file mode 100644 index 0000000000..eceacc9031 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_sgemm_native_16x4(const float *, int, const float *, int, float *, int, float, int, int, int);
+
+// 16x4 native SGEMM "strategy" class.
+//
+// This describes the characteristics of a family of kernels, in terms of
+// the required interleave properties and the output block size.
+//
+// All kernels in the family must share these characteristics. The actual
+// kernel to be used can be chosen at runtime, based on the CPUInfo
+// structure.
+class sgemm_native_16x4
+{
+public:
+    typedef float operand_type;
+    typedef float result_type;
+
+    typedef void (*kern_type)(const float *, int, const float *, int, float *, int, float, int, int, int);
+
+    /* Kernel blocking parameters */
+    static const int out_width = 16;
+    static const int out_height = 4;
+    static const int k_unroll = 1;
+
+    // Default to the generic kernel
+    kern_type kernel = a64_sgemm_native_16x4;
+
+    sgemm_native_16x4(const CPUInfo *ci)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4/generic.cpp
new file mode 100644
index 0000000000..1b5787ce7c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4/generic.cpp
@@ -0,0 +1,734 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include
+
+#include
+
+namespace arm_gemm
+{
+void a64_sgemm_native_16x4(const float *A, int lda, const float *B, int ldb, float *C, int ldc, float beta, int M, int N, int K)
+{
+    int oddk = (K % 8) ? 1 : 0;
+    int beta0 = (beta == 0.0f) ?
1 : 0; + + /* For now, very naive with no blocking */ + for(int y = 0; y < M; y += 4) + { + for(int x0 = 0; x0 < N; x0 += 16) + { + const float *a_ptr0 = A + (y * lda); + const float *a_ptr1 = a_ptr0 + lda; + const float *a_ptr2 = a_ptr1 + lda; + const float *a_ptr3 = a_ptr2 + lda; + + const float *b_ptr = B + x0; + + float *c_ptr0 = C + (y * ldc) + x0; + float *c_ptr1 = c_ptr0 + ldc; + float *c_ptr2 = c_ptr1 + ldc; + float *c_ptr3 = c_ptr2 + ldc; + + int loops = ((K + 4) / 8) - 1; + + size_t ldbb = ldb * sizeof(float); + + __asm __volatile( + "a0 .req v0\n" + "a1 .req v1\n" + "a2 .req v2\n" + "a3 .req v3\n" + "a0a .req v4\n" + "a1a .req v5\n" + "a2a .req v6\n" + "a3a .req v7\n" + "bb0 .req v8\n" + "bb1 .req v9\n" + "bb2 .req v10\n" + "bb3 .req v11\n" + "b0a .req v12\n" + "b1a .req v13\n" + "b2a .req v14\n" + "b3a .req v15\n" + + "a0q .req q0\n" + "a1q .req q1\n" + "a2q .req q2\n" + "a3q .req q3\n" + "a0aq .req q4\n" + "a1aq .req q5\n" + "a2aq .req q6\n" + "a3aq .req q7\n" + "b0q .req q8\n" + "b1q .req q9\n" + "b2q .req q10\n" + "b3q .req q11\n" + "b0aq .req q12\n" + "b1aq .req q13\n" + "b2aq .req q14\n" + "b3aq .req q15\n" + + "movi v16.4s, #0x0\n" + "ldr a0q, [%[a_ptr0]]\n" + "movi v17.4s, #0x0\n" + "ldr b0q, [%[b_ptr]]\n" + "movi v18.4s, #0x0\n" + "ldr b1q, [%[b_ptr], #16]\n" + "movi v19.4s, #0x0\n" + "ldr b2q, [%[b_ptr], #32]\n" + "movi v20.4s, #0x0\n" + "ldr b3q, [%[b_ptr], #48]\n" + "movi v21.4s, #0x0\n" + "add %[b_ptr], %[b_ptr], %[ldb]\n" + "ldr a1q, [%[a_ptr1]]\n" + "movi v22.4s, #0x0\n" + "ldr a2q, [%[a_ptr2]]\n" + "movi v23.4s, #0x0\n" + "ldr a3q, [%[a_ptr3]]\n" + "movi v24.4s, #0x0\n" + "ldr b0aq, [%[b_ptr]]\n" + "movi v25.4s, #0x0\n" + "ldr b1aq, [%[b_ptr], #16]\n" + "movi v26.4s, #0x0\n" + "ldr b2aq, [%[b_ptr], #32]\n" + "cbz %w[beta0], 5f\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + + // Skip if no complete loops. 
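
Unlike the interleaved kernels, this native kernel walks A, B and C through their real leading dimensions and applies beta itself, with a fast path that never reads C when beta is zero. A scalar reference for the intended result, assuming the row-major layouts implied by the pointer setup above and ignoring the kernel's blocking constraints (illustration only):

    void sgemm_native_ref(const float *A, int lda, const float *B, int ldb,
                          float *C, int ldc, float beta, int M, int N, int K)
    {
        for(int m = 0; m < M; m++)
        {
            for(int n = 0; n < N; n++)
            {
                // beta == 0 must not read C, so the kernel can write uninitialised output.
                float acc = (beta == 0.0f) ? 0.0f : beta * C[m * ldc + n];
                for(int k = 0; k < K; k++)
                {
                    acc += A[m * lda + k] * B[k * ldb + n];
                }
                C[m * ldc + n] = acc;
            }
        }
    }
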
+ "cbz %w[loops], 4f\n" + "b 1f\n" + + // If beta is non-zero, need to load and multiply by beta + "5:\n" + "ld1r {v4.4s}, [%[betaptr]]\n" + "ldr q16, [%[c_ptr0]]\n" + "ldr q17, [%[c_ptr0], #16]\n" + "ldr q18, [%[c_ptr0], #32]\n" + "ldr q19, [%[c_ptr0], #48]\n" + + "ldr q20, [%[c_ptr1]]\n" + "fmul v16.4s, v16.4s, v4.4s\n" + "ldr q21, [%[c_ptr1], #16]\n" + "fmul v17.4s, v17.4s, v4.4s\n" + "ldr q22, [%[c_ptr1], #32]\n" + "fmul v18.4s, v18.4s, v4.4s\n" + "ldr q23, [%[c_ptr1], #48]\n" + "fmul v19.4s, v19.4s, v4.4s\n" + + "ldr q24, [%[c_ptr2]]\n" + "fmul v20.4s, v20.4s, v4.4s\n" + "ldr q25, [%[c_ptr2], #16]\n" + "fmul v21.4s, v21.4s, v4.4s\n" + "ldr q26, [%[c_ptr2], #32]\n" + "fmul v22.4s, v22.4s, v4.4s\n" + "ldr q27, [%[c_ptr2], #48]\n" + "fmul v23.4s, v23.4s, v4.4s\n" + + "ldr q28, [%[c_ptr3]]\n" + "fmul v24.4s, v24.4s, v4.4s\n" + "ldr q29, [%[c_ptr3], #16]\n" + "fmul v25.4s, v25.4s, v4.4s\n" + "ldr q30, [%[c_ptr3], #32]\n" + "fmul v26.4s, v26.4s, v4.4s\n" + "ldr q31, [%[c_ptr3], #48]\n" + "fmul v27.4s, v27.4s, v4.4s\n" + + "fmul v28.4s, v28.4s, v4.4s\n" + "fmul v29.4s, v29.4s, v4.4s\n" + "fmul v30.4s, v30.4s, v4.4s\n" + "fmul v31.4s, v31.4s, v4.4s\n" + + "cbz %w[loops], 4f\n" + + "1:\n" + // Unroll 0 + "fmla v16.4s, bb0.4s, a0.s[0]\n" + "fmla v20.4s, bb0.4s, a1.s[0]\n" + "ldr b3aq, [%[b_ptr], #48]\n" + "fmla v24.4s, bb0.4s, a2.s[0]\n" + "add %[b_ptr], %[b_ptr], %[ldb]\n" + "fmla v28.4s, bb0.4s, a3.s[0]\n" + "ldr b0q, [%[b_ptr]]\n" + + "fmla v17.4s, bb1.4s, a0.s[0]\n" + "fmla v21.4s, bb1.4s, a1.s[0]\n" + "ldr a0aq, [%[a_ptr0], #16]\n" + "fmla v25.4s, bb1.4s, a2.s[0]\n" + "fmla v29.4s, bb1.4s, a3.s[0]\n" + "ldr b1q, [%[b_ptr], #16]\n" + + "fmla v18.4s, bb2.4s, a0.s[0]\n" + "fmla v22.4s, bb2.4s, a1.s[0]\n" + "ldr a1aq, [%[a_ptr1], #16]\n" + "fmla v26.4s, bb2.4s, a2.s[0]\n" + "fmla v30.4s, bb2.4s, a3.s[0]\n" + "ldr b2q, [%[b_ptr], #32]\n" + + "fmla v19.4s, bb3.4s, a0.s[0]\n" + "fmla v23.4s, bb3.4s, a1.s[0]\n" + "ldr a2aq, [%[a_ptr2], #16]\n" + "fmla v27.4s, bb3.4s, a2.s[0]\n" + "fmla v31.4s, bb3.4s, a3.s[0]\n" + "ldr b3q, [%[b_ptr], #48]\n" + + // Unroll 1 + "fmla v16.4s, b0a.4s, a0.s[1]\n" + "add %[b_ptr], %[b_ptr], %[ldb]\n" + "fmla v20.4s, b0a.4s, a1.s[1]\n" + "ldr a3aq, [%[a_ptr3], #16]\n" + "fmla v24.4s, b0a.4s, a2.s[1]\n" + "fmla v28.4s, b0a.4s, a3.s[1]\n" + "ldr b0aq, [%[b_ptr]]\n" + + "fmla v17.4s, b1a.4s, a0.s[1]\n" + "fmla v21.4s, b1a.4s, a1.s[1]\n" + "subs %w[loops], %w[loops], #1\n" + "fmla v25.4s, b1a.4s, a2.s[1]\n" + "fmla v29.4s, b1a.4s, a3.s[1]\n" + "ldr b1aq, [%[b_ptr], #16]\n" + + "fmla v18.4s, b2a.4s, a0.s[1]\n" + "fmla v22.4s, b2a.4s, a1.s[1]\n" + "fmla v26.4s, b2a.4s, a2.s[1]\n" + "fmla v30.4s, b2a.4s, a3.s[1]\n" + "ldr b2aq, [%[b_ptr], #32]\n" + + "fmla v19.4s, b3a.4s, a0.s[1]\n" + "fmla v23.4s, b3a.4s, a1.s[1]\n" + "fmla v27.4s, b3a.4s, a2.s[1]\n" + "fmla v31.4s, b3a.4s, a3.s[1]\n" + "ldr b3aq, [%[b_ptr], #48]\n" + + // Unroll 2 + "fmla v16.4s, bb0.4s, a0.s[2]\n" + "fmla v20.4s, bb0.4s, a1.s[2]\n" + "add %[b_ptr], %[b_ptr], %[ldb]\n" + "fmla v24.4s, bb0.4s, a2.s[2]\n" + "fmla v28.4s, bb0.4s, a3.s[2]\n" + "ldr b0q, [%[b_ptr]]\n" + + "fmla v17.4s, bb1.4s, a0.s[2]\n" + "add %[a_ptr0], %[a_ptr0], #32\n" + "fmla v21.4s, bb1.4s, a1.s[2]\n" + "add %[a_ptr1], %[a_ptr1], #32\n" + "fmla v25.4s, bb1.4s, a2.s[2]\n" + "add %[a_ptr2], %[a_ptr2], #32\n" + "fmla v29.4s, bb1.4s, a3.s[2]\n" + "ldr b1q, [%[b_ptr], #16]\n" + + "fmla v18.4s, bb2.4s, a0.s[2]\n" + "add %[a_ptr3], %[a_ptr3], #32\n" + "fmla v22.4s, bb2.4s, a1.s[2]\n" + "fmla v26.4s, bb2.4s, a2.s[2]\n" + "fmla v30.4s, bb2.4s, 
a3.s[2]\n" + "ldr b2q, [%[b_ptr], #32]\n" + + "fmla v19.4s, bb3.4s, a0.s[2]\n" + "fmla v23.4s, bb3.4s, a1.s[2]\n" + "fmla v27.4s, bb3.4s, a2.s[2]\n" + "fmla v31.4s, bb3.4s, a3.s[2]\n" + "ldr b3q, [%[b_ptr], #48]\n" + + // Unroll 3 + "fmla v16.4s, b0a.4s, a0.s[3]\n" + "fmla v20.4s, b0a.4s, a1.s[3]\n" + "add %[b_ptr], %[b_ptr], %[ldb]\n" + "fmla v24.4s, b0a.4s, a2.s[3]\n" + "fmla v28.4s, b0a.4s, a3.s[3]\n" + "ldr b0aq, [%[b_ptr]]\n" + + "fmla v17.4s, b1a.4s, a0.s[3]\n" + "fmla v21.4s, b1a.4s, a1.s[3]\n" + "fmla v25.4s, b1a.4s, a2.s[3]\n" + "fmla v29.4s, b1a.4s, a3.s[3]\n" + "ldr b1aq, [%[b_ptr], #16]\n" + + "fmla v18.4s, b2a.4s, a0.s[3]\n" + "fmla v22.4s, b2a.4s, a1.s[3]\n" + "fmla v26.4s, b2a.4s, a2.s[3]\n" + "fmla v30.4s, b2a.4s, a3.s[3]\n" + "ldr b2aq, [%[b_ptr], #32]\n" + + "fmla v19.4s, b3a.4s, a0.s[3]\n" + "fmla v23.4s, b3a.4s, a1.s[3]\n" + "ldr a0q, [%[a_ptr0]]\n" + "fmla v27.4s, b3a.4s, a2.s[3]\n" + "fmla v31.4s, b3a.4s, a3.s[3]\n" + "ldr b3aq, [%[b_ptr], #48]\n" + + // Unroll 4 + "fmla v16.4s, bb0.4s, a0a.s[0]\n" + "fmla v20.4s, bb0.4s, a1a.s[0]\n" + "add %[b_ptr], %[b_ptr], %[ldb]\n" + "fmla v24.4s, bb0.4s, a2a.s[0]\n" + "fmla v28.4s, bb0.4s, a3a.s[0]\n" + "ldr b0q, [%[b_ptr]]\n" + + "fmla v17.4s, bb1.4s, a0a.s[0]\n" + "fmla v21.4s, bb1.4s, a1a.s[0]\n" + "ldr a1q, [%[a_ptr1]]\n" + "fmla v25.4s, bb1.4s, a2a.s[0]\n" + "fmla v29.4s, bb1.4s, a3a.s[0]\n" + "ldr b1q, [%[b_ptr], #16]\n" + + "fmla v18.4s, bb2.4s, a0a.s[0]\n" + "fmla v22.4s, bb2.4s, a1a.s[0]\n" + "ldr a2q, [%[a_ptr2]]\n" + "fmla v26.4s, bb2.4s, a2a.s[0]\n" + "fmla v30.4s, bb2.4s, a3a.s[0]\n" + "ldr b2q, [%[b_ptr], #32]\n" + + "fmla v19.4s, bb3.4s, a0a.s[0]\n" + "fmla v23.4s, bb3.4s, a1a.s[0]\n" + "ldr a3q, [%[a_ptr3]]\n" + "fmla v27.4s, bb3.4s, a2a.s[0]\n" + "fmla v31.4s, bb3.4s, a3a.s[0]\n" + "ldr b3q, [%[b_ptr], #48]\n" + + // Unroll 5 + "fmla v16.4s, b0a.4s, a0a.s[1]\n" + "fmla v20.4s, b0a.4s, a1a.s[1]\n" + "add %[b_ptr], %[b_ptr], %[ldb]\n" + "fmla v24.4s, b0a.4s, a2a.s[1]\n" + "fmla v28.4s, b0a.4s, a3a.s[1]\n" + "ldr b0aq, [%[b_ptr]]\n" + + "fmla v17.4s, b1a.4s, a0a.s[1]\n" + "fmla v21.4s, b1a.4s, a1a.s[1]\n" + "fmla v25.4s, b1a.4s, a2a.s[1]\n" + "fmla v29.4s, b1a.4s, a3a.s[1]\n" + "ldr b1aq, [%[b_ptr], #16]\n" + + "fmla v18.4s, b2a.4s, a0a.s[1]\n" + "fmla v22.4s, b2a.4s, a1a.s[1]\n" + "fmla v26.4s, b2a.4s, a2a.s[1]\n" + "fmla v30.4s, b2a.4s, a3a.s[1]\n" + "ldr b2aq, [%[b_ptr], #32]\n" + + "fmla v19.4s, b3a.4s, a0a.s[1]\n" + "fmla v23.4s, b3a.4s, a1a.s[1]\n" + "fmla v27.4s, b3a.4s, a2a.s[1]\n" + "fmla v31.4s, b3a.4s, a3a.s[1]\n" + "ldr b3aq, [%[b_ptr], #48]\n" + + // Unroll 6 + "fmla v16.4s, bb0.4s, a0a.s[2]\n" + "fmla v20.4s, bb0.4s, a1a.s[2]\n" + "add %[b_ptr], %[b_ptr], %[ldb]\n" + "fmla v24.4s, bb0.4s, a2a.s[2]\n" + "fmla v28.4s, bb0.4s, a3a.s[2]\n" + "ldr b0q, [%[b_ptr]]\n" + + "fmla v17.4s, bb1.4s, a0a.s[2]\n" + "fmla v21.4s, bb1.4s, a1a.s[2]\n" + "fmla v25.4s, bb1.4s, a2a.s[2]\n" + "fmla v29.4s, bb1.4s, a3a.s[2]\n" + "ldr b1q, [%[b_ptr], #16]\n" + + "fmla v18.4s, bb2.4s, a0a.s[2]\n" + "fmla v22.4s, bb2.4s, a1a.s[2]\n" + "fmla v26.4s, bb2.4s, a2a.s[2]\n" + "fmla v30.4s, bb2.4s, a3a.s[2]\n" + "ldr b2q, [%[b_ptr], #32]\n" + + "fmla v19.4s, bb3.4s, a0a.s[2]\n" + "fmla v23.4s, bb3.4s, a1a.s[2]\n" + "fmla v27.4s, bb3.4s, a2a.s[2]\n" + "fmla v31.4s, bb3.4s, a3a.s[2]\n" + "ldr b3q, [%[b_ptr], #48]\n" + + // Unroll 7 + "fmla v16.4s, b0a.4s, a0a.s[3]\n" + "fmla v20.4s, b0a.4s, a1a.s[3]\n" + "add %[b_ptr], %[b_ptr], %[ldb]\n" + "fmla v24.4s, b0a.4s, a2a.s[3]\n" + "fmla v28.4s, b0a.4s, a3a.s[3]\n" + "ldr b0aq, [%[b_ptr]]\n" 
+ + "fmla v17.4s, b1a.4s, a0a.s[3]\n" + "fmla v21.4s, b1a.4s, a1a.s[3]\n" + "fmla v25.4s, b1a.4s, a2a.s[3]\n" + "fmla v29.4s, b1a.4s, a3a.s[3]\n" + "ldr b1aq, [%[b_ptr], #16]\n" + + "fmla v18.4s, b2a.4s, a0a.s[3]\n" + "fmla v22.4s, b2a.4s, a1a.s[3]\n" + "fmla v26.4s, b2a.4s, a2a.s[3]\n" + "fmla v30.4s, b2a.4s, a3a.s[3]\n" + "ldr b2aq, [%[b_ptr], #32]\n" + + "fmla v19.4s, b3a.4s, a0a.s[3]\n" + "fmla v23.4s, b3a.4s, a1a.s[3]\n" + "fmla v27.4s, b3a.4s, a2a.s[3]\n" + "fmla v31.4s, b3a.4s, a3a.s[3]\n" + "bne 1b\n" + + // Skip to here + "4:\n" + + // Detached final iteration + // Unroll 0 + "fmla v16.4s, bb0.4s, a0.s[0]\n" + "fmla v20.4s, bb0.4s, a1.s[0]\n" + "ldr b3aq, [%[b_ptr], #48]\n" + "fmla v24.4s, bb0.4s, a2.s[0]\n" + "add %[b_ptr], %[b_ptr], %[ldb]\n" + "fmla v28.4s, bb0.4s, a3.s[0]\n" + "ldr b0q, [%[b_ptr]]\n" + + "fmla v17.4s, bb1.4s, a0.s[0]\n" + "cbnz %w[oddk], 2f\n" // Deal with odd K before we load a0a + "fmla v21.4s, bb1.4s, a1.s[0]\n" + "ldr a0aq, [%[a_ptr0], #16]\n" + "fmla v25.4s, bb1.4s, a2.s[0]\n" + "fmla v29.4s, bb1.4s, a3.s[0]\n" + "ldr b1q, [%[b_ptr], #16]\n" + + "fmla v18.4s, bb2.4s, a0.s[0]\n" + "fmla v22.4s, bb2.4s, a1.s[0]\n" + "ldr a1aq, [%[a_ptr1], #16]\n" + "fmla v26.4s, bb2.4s, a2.s[0]\n" + "fmla v30.4s, bb2.4s, a3.s[0]\n" + "ldr b2q, [%[b_ptr], #32]\n" + + "fmla v19.4s, bb3.4s, a0.s[0]\n" + "fmla v23.4s, bb3.4s, a1.s[0]\n" + "ldr a2aq, [%[a_ptr2], #16]\n" + "fmla v27.4s, bb3.4s, a2.s[0]\n" + "fmla v31.4s, bb3.4s, a3.s[0]\n" + "ldr b3q, [%[b_ptr], #48]\n" + + // Unroll 1 + "fmla v16.4s, b0a.4s, a0.s[1]\n" + "add %[b_ptr], %[b_ptr], %[ldb]\n" + "fmla v20.4s, b0a.4s, a1.s[1]\n" + "ldr a3aq, [%[a_ptr3], #16]\n" + "fmla v24.4s, b0a.4s, a2.s[1]\n" + "fmla v28.4s, b0a.4s, a3.s[1]\n" + "ldr b0aq, [%[b_ptr]]\n" + + "fmla v17.4s, b1a.4s, a0.s[1]\n" + "fmla v21.4s, b1a.4s, a1.s[1]\n" + "subs %w[loops], %w[loops], #1\n" + "fmla v25.4s, b1a.4s, a2.s[1]\n" + "fmla v29.4s, b1a.4s, a3.s[1]\n" + "ldr b1aq, [%[b_ptr], #16]\n" + + "fmla v18.4s, b2a.4s, a0.s[1]\n" + "fmla v22.4s, b2a.4s, a1.s[1]\n" + "fmla v26.4s, b2a.4s, a2.s[1]\n" + "fmla v30.4s, b2a.4s, a3.s[1]\n" + "ldr b2aq, [%[b_ptr], #32]\n" + + "fmla v19.4s, b3a.4s, a0.s[1]\n" + "fmla v23.4s, b3a.4s, a1.s[1]\n" + "fmla v27.4s, b3a.4s, a2.s[1]\n" + "fmla v31.4s, b3a.4s, a3.s[1]\n" + "ldr b3aq, [%[b_ptr], #48]\n" + + // Unroll 2 + "fmla v16.4s, bb0.4s, a0.s[2]\n" + "fmla v20.4s, bb0.4s, a1.s[2]\n" + "add %[b_ptr], %[b_ptr], %[ldb]\n" + "fmla v24.4s, bb0.4s, a2.s[2]\n" + "fmla v28.4s, bb0.4s, a3.s[2]\n" + "ldr b0q, [%[b_ptr]]\n" + + "fmla v17.4s, bb1.4s, a0.s[2]\n" + "fmla v21.4s, bb1.4s, a1.s[2]\n" + "fmla v25.4s, bb1.4s, a2.s[2]\n" + "fmla v29.4s, bb1.4s, a3.s[2]\n" + "ldr b1q, [%[b_ptr], #16]\n" + + "fmla v18.4s, bb2.4s, a0.s[2]\n" + "fmla v22.4s, bb2.4s, a1.s[2]\n" + "fmla v26.4s, bb2.4s, a2.s[2]\n" + "fmla v30.4s, bb2.4s, a3.s[2]\n" + "ldr b2q, [%[b_ptr], #32]\n" + + "fmla v19.4s, bb3.4s, a0.s[2]\n" + "fmla v23.4s, bb3.4s, a1.s[2]\n" + "fmla v27.4s, bb3.4s, a2.s[2]\n" + "fmla v31.4s, bb3.4s, a3.s[2]\n" + "ldr b3q, [%[b_ptr], #48]\n" + + // Unroll 3 + "fmla v16.4s, b0a.4s, a0.s[3]\n" + "fmla v20.4s, b0a.4s, a1.s[3]\n" + "add %[b_ptr], %[b_ptr], %[ldb]\n" + "fmla v24.4s, b0a.4s, a2.s[3]\n" + "fmla v28.4s, b0a.4s, a3.s[3]\n" + "ldr b0aq, [%[b_ptr]]\n" + + "fmla v17.4s, b1a.4s, a0.s[3]\n" + "fmla v21.4s, b1a.4s, a1.s[3]\n" + "ldr a3aq, [%[a_ptr3], #16]\n" + "fmla v25.4s, b1a.4s, a2.s[3]\n" + "fmla v29.4s, b1a.4s, a3.s[3]\n" + "ldr b1aq, [%[b_ptr], #16]\n" + + "fmla v18.4s, b2a.4s, a0.s[3]\n" + "fmla v22.4s, b2a.4s, a1.s[3]\n" + 
"fmla v26.4s, b2a.4s, a2.s[3]\n" + "fmla v30.4s, b2a.4s, a3.s[3]\n" + "ldr b2aq, [%[b_ptr], #32]\n" + + "fmla v19.4s, b3a.4s, a0.s[3]\n" + "fmla v23.4s, b3a.4s, a1.s[3]\n" + "fmla v27.4s, b3a.4s, a2.s[3]\n" + "fmla v31.4s, b3a.4s, a3.s[3]\n" + "ldr b3aq, [%[b_ptr], #48]\n" + + // Unroll 4 + "fmla v16.4s, bb0.4s, a0a.s[0]\n" + "fmla v20.4s, bb0.4s, a1a.s[0]\n" + "add %[b_ptr], %[b_ptr], %[ldb]\n" + "fmla v24.4s, bb0.4s, a2a.s[0]\n" + "fmla v28.4s, bb0.4s, a3a.s[0]\n" + "ldr b0q, [%[b_ptr]]\n" + + "fmla v17.4s, bb1.4s, a0a.s[0]\n" + "fmla v21.4s, bb1.4s, a1a.s[0]\n" + "fmla v25.4s, bb1.4s, a2a.s[0]\n" + "fmla v29.4s, bb1.4s, a3a.s[0]\n" + "ldr b1q, [%[b_ptr], #16]\n" + + "fmla v18.4s, bb2.4s, a0a.s[0]\n" + "fmla v22.4s, bb2.4s, a1a.s[0]\n" + "fmla v26.4s, bb2.4s, a2a.s[0]\n" + "fmla v30.4s, bb2.4s, a3a.s[0]\n" + "ldr b2q, [%[b_ptr], #32]\n" + + "fmla v19.4s, bb3.4s, a0a.s[0]\n" + "fmla v23.4s, bb3.4s, a1a.s[0]\n" + "fmla v27.4s, bb3.4s, a2a.s[0]\n" + "fmla v31.4s, bb3.4s, a3a.s[0]\n" + "ldr b3q, [%[b_ptr], #48]\n" + + // Unroll 5 + "fmla v16.4s, b0a.4s, a0a.s[1]\n" + "fmla v20.4s, b0a.4s, a1a.s[1]\n" + "add %[b_ptr], %[b_ptr], %[ldb]\n" + "fmla v24.4s, b0a.4s, a2a.s[1]\n" + "fmla v28.4s, b0a.4s, a3a.s[1]\n" + "ldr b0aq, [%[b_ptr]]\n" + + "fmla v17.4s, b1a.4s, a0a.s[1]\n" + "fmla v21.4s, b1a.4s, a1a.s[1]\n" + "fmla v25.4s, b1a.4s, a2a.s[1]\n" + "fmla v29.4s, b1a.4s, a3a.s[1]\n" + "ldr b1aq, [%[b_ptr], #16]\n" + + "fmla v18.4s, b2a.4s, a0a.s[1]\n" + "fmla v22.4s, b2a.4s, a1a.s[1]\n" + "fmla v26.4s, b2a.4s, a2a.s[1]\n" + "fmla v30.4s, b2a.4s, a3a.s[1]\n" + "ldr b2aq, [%[b_ptr], #32]\n" + + "fmla v19.4s, b3a.4s, a0a.s[1]\n" + "fmla v23.4s, b3a.4s, a1a.s[1]\n" + "fmla v27.4s, b3a.4s, a2a.s[1]\n" + "fmla v31.4s, b3a.4s, a3a.s[1]\n" + "ldr b3aq, [%[b_ptr], #48]\n" + + // Unroll 6 + "fmla v16.4s, bb0.4s, a0a.s[2]\n" + "fmla v20.4s, bb0.4s, a1a.s[2]\n" + "fmla v24.4s, bb0.4s, a2a.s[2]\n" + "fmla v28.4s, bb0.4s, a3a.s[2]\n" + + "fmla v17.4s, bb1.4s, a0a.s[2]\n" + "fmla v21.4s, bb1.4s, a1a.s[2]\n" + "fmla v25.4s, bb1.4s, a2a.s[2]\n" + "fmla v29.4s, bb1.4s, a3a.s[2]\n" + + "fmla v18.4s, bb2.4s, a0a.s[2]\n" + "fmla v22.4s, bb2.4s, a1a.s[2]\n" + "fmla v26.4s, bb2.4s, a2a.s[2]\n" + "fmla v30.4s, bb2.4s, a3a.s[2]\n" + + "fmla v19.4s, bb3.4s, a0a.s[2]\n" + "fmla v23.4s, bb3.4s, a1a.s[2]\n" + "fmla v27.4s, bb3.4s, a2a.s[2]\n" + "fmla v31.4s, bb3.4s, a3a.s[2]\n" + + // Unroll 7 + "fmla v16.4s, b0a.4s, a0a.s[3]\n" + "fmla v17.4s, b1a.4s, a0a.s[3]\n" + "fmla v18.4s, b2a.4s, a0a.s[3]\n" + "fmla v19.4s, b3a.4s, a0a.s[3]\n" + + "fmla v20.4s, b0a.4s, a1a.s[3]\n" + "str q16, [%[c_ptr0]]\n" + "fmla v21.4s, b1a.4s, a1a.s[3]\n" + "str q17, [%[c_ptr0], #16]\n" + "fmla v22.4s, b2a.4s, a1a.s[3]\n" + "str q18, [%[c_ptr0], #32]\n" + "fmla v23.4s, b3a.4s, a1a.s[3]\n" + "str q19, [%[c_ptr0], #48]\n" + + "fmla v24.4s, b0a.4s, a2a.s[3]\n" + "str q20, [%[c_ptr1]]\n" + "fmla v25.4s, b1a.4s, a2a.s[3]\n" + "str q21, [%[c_ptr1], #16]\n" + "fmla v26.4s, b2a.4s, a2a.s[3]\n" + "str q22, [%[c_ptr1], #32]\n" + "fmla v27.4s, b3a.4s, a2a.s[3]\n" + "str q23, [%[c_ptr1], #48]\n" + + "fmla v28.4s, b0a.4s, a3a.s[3]\n" + "str q24, [%[c_ptr2]]\n" + "fmla v29.4s, b1a.4s, a3a.s[3]\n" + "str q25, [%[c_ptr2], #16]\n" + "fmla v30.4s, b2a.4s, a3a.s[3]\n" + "str q26, [%[c_ptr2], #32]\n" + "fmla v31.4s, b3a.4s, a3a.s[3]\n" + "str q27, [%[c_ptr2], #48]\n" + "b 3f\n" + + // Odd K case: Just do 4 more. 
+ "2:\n" + "fmla v21.4s, bb1.4s, a1.s[0]\n" + "fmla v25.4s, bb1.4s, a2.s[0]\n" + "fmla v29.4s, bb1.4s, a3.s[0]\n" + "ldr b1q, [%[b_ptr], #16]\n" + + "fmla v18.4s, bb2.4s, a0.s[0]\n" + "fmla v22.4s, bb2.4s, a1.s[0]\n" + "fmla v26.4s, bb2.4s, a2.s[0]\n" + "fmla v30.4s, bb2.4s, a3.s[0]\n" + "ldr b2q, [%[b_ptr], #32]\n" + + "fmla v19.4s, bb3.4s, a0.s[0]\n" + "fmla v23.4s, bb3.4s, a1.s[0]\n" + "fmla v27.4s, bb3.4s, a2.s[0]\n" + "fmla v31.4s, bb3.4s, a3.s[0]\n" + "ldr b3q, [%[b_ptr], #48]\n" + + // Unroll 1 + "fmla v16.4s, b0a.4s, a0.s[1]\n" + "add %[b_ptr], %[b_ptr], %[ldb]\n" + "fmla v20.4s, b0a.4s, a1.s[1]\n" + "fmla v24.4s, b0a.4s, a2.s[1]\n" + "fmla v28.4s, b0a.4s, a3.s[1]\n" + "ldr b0aq, [%[b_ptr]]\n" + + "fmla v17.4s, b1a.4s, a0.s[1]\n" + "fmla v21.4s, b1a.4s, a1.s[1]\n" + "subs %w[loops], %w[loops], #1\n" + "fmla v25.4s, b1a.4s, a2.s[1]\n" + "fmla v29.4s, b1a.4s, a3.s[1]\n" + "ldr b1aq, [%[b_ptr], #16]\n" + + "fmla v18.4s, b2a.4s, a0.s[1]\n" + "fmla v22.4s, b2a.4s, a1.s[1]\n" + "fmla v26.4s, b2a.4s, a2.s[1]\n" + "fmla v30.4s, b2a.4s, a3.s[1]\n" + "ldr b2aq, [%[b_ptr], #32]\n" + + "fmla v19.4s, b3a.4s, a0.s[1]\n" + "fmla v23.4s, b3a.4s, a1.s[1]\n" + "fmla v27.4s, b3a.4s, a2.s[1]\n" + "fmla v31.4s, b3a.4s, a3.s[1]\n" + "ldr b3aq, [%[b_ptr], #48]\n" + + // Unroll 2 + "fmla v16.4s, bb0.4s, a0.s[2]\n" + "fmla v20.4s, bb0.4s, a1.s[2]\n" + "fmla v24.4s, bb0.4s, a2.s[2]\n" + "fmla v28.4s, bb0.4s, a3.s[2]\n" + + "fmla v17.4s, bb1.4s, a0.s[2]\n" + "fmla v21.4s, bb1.4s, a1.s[2]\n" + "fmla v25.4s, bb1.4s, a2.s[2]\n" + "fmla v29.4s, bb1.4s, a3.s[2]\n" + + "fmla v18.4s, bb2.4s, a0.s[2]\n" + "fmla v22.4s, bb2.4s, a1.s[2]\n" + "fmla v26.4s, bb2.4s, a2.s[2]\n" + "fmla v30.4s, bb2.4s, a3.s[2]\n" + + "fmla v19.4s, bb3.4s, a0.s[2]\n" + "fmla v23.4s, bb3.4s, a1.s[2]\n" + "fmla v27.4s, bb3.4s, a2.s[2]\n" + "fmla v31.4s, bb3.4s, a3.s[2]\n" + + // Unroll 3 + "fmla v16.4s, b0a.4s, a0.s[3]\n" + "fmla v17.4s, b1a.4s, a0.s[3]\n" + "fmla v18.4s, b2a.4s, a0.s[3]\n" + "fmla v19.4s, b3a.4s, a0.s[3]\n" + + "fmla v20.4s, b0a.4s, a1.s[3]\n" + "str q16, [%[c_ptr0]]\n" + "fmla v21.4s, b1a.4s, a1.s[3]\n" + "str q17, [%[c_ptr0], #16]\n" + "fmla v22.4s, b2a.4s, a1.s[3]\n" + "str q18, [%[c_ptr0], #32]\n" + "fmla v23.4s, b3a.4s, a1.s[3]\n" + "str q19, [%[c_ptr0], #48]\n" + + "fmla v24.4s, b0a.4s, a2.s[3]\n" + "str q20, [%[c_ptr1]]\n" + "fmla v25.4s, b1a.4s, a2.s[3]\n" + "str q21, [%[c_ptr1], #16]\n" + "fmla v26.4s, b2a.4s, a2.s[3]\n" + "str q22, [%[c_ptr1], #32]\n" + "fmla v27.4s, b3a.4s, a2.s[3]\n" + "str q23, [%[c_ptr1], #48]\n" + + "fmla v28.4s, b0a.4s, a3.s[3]\n" + "str q24, [%[c_ptr2]]\n" + "fmla v29.4s, b1a.4s, a3.s[3]\n" + "str q25, [%[c_ptr2], #16]\n" + "fmla v30.4s, b2a.4s, a3.s[3]\n" + "str q26, [%[c_ptr2], #32]\n" + "fmla v31.4s, b3a.4s, a3.s[3]\n" + "str q27, [%[c_ptr2], #48]\n" + + "3:\n" + "str q28, [%[c_ptr3]]\n" + "str q29, [%[c_ptr3], #16]\n" + "str q30, [%[c_ptr3], #32]\n" + "str q31, [%[c_ptr3], #48]\n" + + : [a_ptr0] "+r"(a_ptr0), [a_ptr1] "+r"(a_ptr1), [a_ptr2] "+r"(a_ptr2), [a_ptr3] "+r"(a_ptr3), + [b_ptr] "+r"(b_ptr), [loops] "+r"(loops) + : [ldb] "r"(ldbb), [oddk] "r"(oddk), [beta0] "r"(beta0), [betaptr] "r"(&beta), + [c_ptr0] "r"(c_ptr0), [c_ptr1] "r"(c_ptr1), [c_ptr2] "r"(c_ptr2), [c_ptr3] "r"(c_ptr3) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "cc", "memory"); + } + } +} + +} // namespace arm_gemm + +#endif // __aarch64__ diff 
--git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp new file mode 100644 index 0000000000..c89514f98e --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +namespace arm_gemm +{ +// Actual kernel implementations +void a64_sgemv_pretransposed(const float *, int, const float *, float *, float, int, int); + +// Pretransposed SGEMV strategy class. +class sgemv_pretransposed +{ +public: + typedef float operand_type; + typedef float result_type; + + typedef void (*kern_type)(const float *, int, const float *, float *, float, int, int); + + /* Describes the data layout for matrix (A) input */ + + /* Note that often GEMV is expressed as a GEMM with M=1, i.e. A is the + * (row) vector and B is the matrix, but the standard GEMV arrangement + * is matrix A times (column) vector X. "A_transpose" is expressed in + * terms of this standard arrangement, so if the A matrix is in fact the + * B matrix from a GEMM call, the sense of the transpose needs to be + * reversed. */ + static const int A_interleave = 32; + static const int A_block = 1; + static const bool A_transpose = false; + + /* Kernel blocking parameters */ + static const int out_width = 32; + static const int k_unroll = 1; + + kern_type kernel = a64_sgemv_pretransposed; + + sgemv_pretransposed(const CPUInfo *ci) + { + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp new file mode 100644 index 0000000000..290759822a --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp @@ -0,0 +1,794 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <algorithm>
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm
+{
+void a64_sgemv_pretransposed(const float *A, int lda, const float *X, float *Y, float beta, int M, int N)
+{
+    const bool beta0 = (beta == 0.0f);
+    const bool beta1 = (beta == 1.0f);
+
+    for(int x = 0; x < N; x += 32)
+    {
+        float *y_ptr = Y + x;
+
+        // How many elements are we processing in this loop?
+        int l = std::min(N - x, 32);
+
+        register float32x4_t r0 asm("v24");
+        register float32x4_t r1 asm("v25");
+        register float32x4_t r2 asm("v26");
+        register float32x4_t r3 asm("v27");
+        register float32x4_t r4 asm("v28");
+        register float32x4_t r5 asm("v29");
+        register float32x4_t r6 asm("v30");
+        register float32x4_t r7 asm("v31");
+
+        register float32x4_t x0 asm("v0");
+        register float32x4_t x0a asm("v1");
+
+        const float *x_ptr = X;
+        const float *a_ptr = A + ((x / 32) * lda);
+
+        if(beta0)
+        {
+            r0 = r1 = r2 = r3 = r4 = r5 = r6 = r7 = vdupq_n_f32(0.0f);
+        }
+        else
+        {
+            if(l == 32)
+            {
+                // Fastest path - load all 8 vectors
+                r0 = vld1q_f32(y_ptr);
+                r1 = vld1q_f32(y_ptr + 4);
+                r2 = vld1q_f32(y_ptr + 8);
+                r3 = vld1q_f32(y_ptr + 12);
+                r4 = vld1q_f32(y_ptr + 16);
+                r5 = vld1q_f32(y_ptr + 20);
+                r6 = vld1q_f32(y_ptr + 24);
+                r7 = vld1q_f32(y_ptr + 28);
+            }
+            else
+            {
+                // Slow case - leftovers. Note that we don't care about
+                // out-of-range vectors and lanes as we will throw them away at
+                // the end.
+                int vecs    = l / 4; // How many leftover vectors?
+                int oddbits = l % 4; // And how many odd single values?
+
+                if(oddbits)
+                {
+                    // Load the outstanding odd values into a vector first
+                    float32x4_t oddvec  = vdupq_n_f32(0.0f); // This does not really need to be initialized, but the compiler has a hard time with that.
+                    float      *oddbase = y_ptr + l - oddbits;
+
+                    switch(oddbits)
+                    {
+                        case 3:
+                            oddvec = vld1q_lane_f32(oddbase + 2, oddvec, 2);
+                        // fall through
+                        case 2:
+                            oddvec = vld1q_lane_f32(oddbase + 1, oddvec, 1);
+                        // fall through
+                        case 1:
+                            oddvec = vld1q_lane_f32(oddbase, oddvec, 0);
+                            break;
+
+                        default:
+                            UNREACHABLE("Impossible case in switch.");
+                    }
+
+                    // Now load the whole vectors, putting the oddments in when we run out.
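+                    // (The do { ... } while(0) below is just a breakable block: full vectors
+                    // are loaded while 'vecs' lasts, 'oddvec' is dropped into the next slot,
+                    // and any registers beyond that are left alone - they are never stored
+                    // back on the output side.)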
+ do + { + if(vecs == 0) + { + r0 = oddvec; + break; + } + + r0 = vld1q_f32(y_ptr); + if(--vecs == 0) + { + r1 = oddvec; + break; + } + + r1 = vld1q_f32(y_ptr + 4); + if(--vecs == 0) + { + r2 = oddvec; + break; + } + + r2 = vld1q_f32(y_ptr + 8); + if(--vecs == 0) + { + r3 = oddvec; + break; + } + + r3 = vld1q_f32(y_ptr + 12); + if(--vecs == 0) + { + r4 = oddvec; + break; + } + + r4 = vld1q_f32(y_ptr + 16); + if(--vecs == 0) + { + r5 = oddvec; + break; + } + + r5 = vld1q_f32(y_ptr + 20); + if(--vecs == 0) + { + r6 = oddvec; + break; + } + + r6 = vld1q_f32(y_ptr + 24); + r7 = oddvec; + } + while(0); + } + else + { + // Slightly less slow path - just load the whole vectors + do + { + // It can't be the case that oddbits==0 AND vecs==0 or we wouldn't be here. + if(vecs == 0) + { + UNREACHABLE("Impossible lack of work to do"); + } + + r0 = vld1q_f32(y_ptr); + if(--vecs == 0) + { + break; + } + + r1 = vld1q_f32(y_ptr + 4); + if(--vecs == 0) + { + break; + } + + r2 = vld1q_f32(y_ptr + 8); + if(--vecs == 0) + { + break; + } + + r3 = vld1q_f32(y_ptr + 12); + if(--vecs == 0) + { + break; + } + + r4 = vld1q_f32(y_ptr + 16); + if(--vecs == 0) + { + break; + } + + r5 = vld1q_f32(y_ptr + 20); + if(--vecs == 0) + { + break; + } + + r6 = vld1q_f32(y_ptr + 24); + } + while(0); + } + } + + if(!beta1) + { + const float32x4_t vb = vdupq_n_f32(beta); + + r0 = vmulq_f32(r0, vb); + r1 = vmulq_f32(r1, vb); + r2 = vmulq_f32(r2, vb); + r3 = vmulq_f32(r3, vb); + r4 = vmulq_f32(r4, vb); + r5 = vmulq_f32(r5, vb); + r6 = vmulq_f32(r6, vb); + r7 = vmulq_f32(r7, vb); + } + } + + if(M >= 8) + { + int k = (M / 8) - 1; + x0 = vld1q_f32(x_ptr); + + __asm __volatile( + "ldr q2, [%[a_ptr], #0]\n" + "ldr q3, [%[a_ptr], #16]\n" + "ldr q4, [%[a_ptr], #32]\n" + "ldr q5, [%[a_ptr], #48]\n" + "ldr q6, [%[a_ptr], #64]\n" + "ldr q7, [%[a_ptr], #80]\n" + "ldr q8, [%[a_ptr], #96]\n" + "ldr q9, [%[a_ptr], #112]\n" + "ldr q10, [%[a_ptr], #128]\n" + "ldr q11, [%[a_ptr], #144]\n" + "ldr q12, [%[a_ptr], #160]\n" + "ldr q13, [%[a_ptr], #176]\n" + "ldr q14, [%[a_ptr], #192]\n" + "ldr q15, [%[a_ptr], #208]\n" + "ldr q16, [%[a_ptr], #224]\n" + "ldr q17, [%[a_ptr], #240]\n" + "ldr q18, [%[a_ptr], #256]\n" + "ldr q19, [%[a_ptr], #272]\n" + "ldr q20, [%[a_ptr], #288]\n" + "ldr q21, [%[a_ptr], #304]\n" + "ldr q22, [%[a_ptr], #320]\n" + "ldr q23, [%[a_ptr], #336]\n" ASM_PREFETCH("[%[a_ptr], #384]") + ASM_PREFETCH("[%[a_ptr], #448]") + ASM_PREFETCH("[%[a_ptr], #512]") + ASM_PREFETCH("[%[a_ptr], #576]") + ASM_PREFETCH("[%[a_ptr], #640]") + ASM_PREFETCH("[%[a_ptr], #704]") + ASM_PREFETCH("[%[a_ptr], #768]") + ASM_PREFETCH("[%[a_ptr], #832]") + ASM_PREFETCH("[%[a_ptr], #896]") + ASM_PREFETCH("[%[a_ptr], #960]") + ASM_PREFETCH("[%[a_ptr], #1024]") + ASM_PREFETCH("[%[a_ptr], #1088]") + ASM_PREFETCH("[%[a_ptr], #1152]") + ASM_PREFETCH("[%[a_ptr], #1216]") + ASM_PREFETCH("[%[a_ptr], #1280]") + ASM_PREFETCH("[%[a_ptr], #1344]") + ASM_PREFETCH("[%[a_ptr], #1408]") + ASM_PREFETCH("[%[a_ptr], #1472]") + ASM_PREFETCH("[%[a_ptr], #1536]") + ASM_PREFETCH("[%[a_ptr], #1600]") + ASM_PREFETCH("[%[a_ptr], #1664]") + ASM_PREFETCH("[%[a_ptr], #1728]") + ASM_PREFETCH("[%[a_ptr], #1792]") + ASM_PREFETCH("[%[a_ptr], #1856]") + ASM_PREFETCH("[%[a_ptr], #1920]") + ASM_PREFETCH("[%[a_ptr], #1984]") + "add %[a_ptr], %[a_ptr], #352\n" + + "cbz %w[k], 2f\n" + + "1:\n" + // Unroll 0 + "fmla %[r0].4s, v2.4s, %[x0].s[0]\n" + "ldr %q[x0a], [%[x_ptr], #16]\n" + "fmla %[r1].4s, v3.4s, %[x0].s[0]\n" + "ldr q3, [%[a_ptr], #0]\n" + "subs %w[k], %w[k], #1\n" + "fmla %[r2].4s, v4.4s, 
%[x0].s[0]\n" + "ldr q4, [%[a_ptr], #16]\n" + "fmla %[r3].4s, v5.4s, %[x0].s[0]\n" + "ldr q5, [%[a_ptr], #32]\n" + "add %[x_ptr], %[x_ptr], #32\n" ASM_PREFETCH("[%[a_ptr], #1664]") + "fmla %[r4].4s, v6.4s, %[x0].s[0]\n" + "ldr q6, [%[a_ptr], #48]\n" + "fmla %[r5].4s, v7.4s, %[x0].s[0]\n" + "ldr q7, [%[a_ptr], #64]\n" + "fmla %[r6].4s, v8.4s, %[x0].s[0]\n" + "ldr q8, [%[a_ptr], #80]\n" + "fmla %[r7].4s, v9.4s, %[x0].s[0]\n" + "ldr q9, [%[a_ptr], #96]\n" ASM_PREFETCH("[%[a_ptr], #1728]") + + // Unroll 1 + "fmla %[r0].4s, v10.4s, %[x0].s[1]\n" + "ldr q10, [%[a_ptr], #112]\n" + "fmla %[r1].4s, v11.4s, %[x0].s[1]\n" + "ldr q11, [%[a_ptr], #128]\n" + "fmla %[r2].4s, v12.4s, %[x0].s[1]\n" + "ldr q12, [%[a_ptr], #144]\n" + "fmla %[r3].4s, v13.4s, %[x0].s[1]\n" + "ldr q13, [%[a_ptr], #160]\n" ASM_PREFETCH("[%[a_ptr], #1792]") + "fmla %[r4].4s, v14.4s, %[x0].s[1]\n" + "ldr q14, [%[a_ptr], #176]\n" + "fmla %[r5].4s, v15.4s, %[x0].s[1]\n" + "ldr q15, [%[a_ptr], #192]\n" + "fmla %[r6].4s, v16.4s, %[x0].s[1]\n" + "ldr q16, [%[a_ptr], #208]\n" + "fmla %[r7].4s, v17.4s, %[x0].s[1]\n" + "ldr q17, [%[a_ptr], #224]\n" ASM_PREFETCH("[%[a_ptr], #1856]") + + // Unroll 2 + "fmla %[r0].4s, v18.4s, %[x0].s[2]\n" + "ldr q18, [%[a_ptr], #240]\n" + "fmla %[r1].4s, v19.4s, %[x0].s[2]\n" + "ldr q19, [%[a_ptr], #256]\n" + "fmla %[r2].4s, v20.4s, %[x0].s[2]\n" + "ldr q20, [%[a_ptr], #272]\n" + "fmla %[r3].4s, v21.4s, %[x0].s[2]\n" + "ldr q21, [%[a_ptr], #288]\n" ASM_PREFETCH("[%[a_ptr], #1920]") + "fmla %[r4].4s, v22.4s, %[x0].s[2]\n" + "ldr q22, [%[a_ptr], #304]\n" + "fmla %[r5].4s, v23.4s, %[x0].s[2]\n" + "ldr q23, [%[a_ptr], #320]\n" + "fmla %[r6].4s, v3.4s, %[x0].s[2]\n" + "ldr q2, [%[a_ptr], #336]\n" + "ldr q3, [%[a_ptr], #352]\n" + "fmla %[r7].4s, v4.4s, %[x0].s[2]\n" + "ldr q4, [%[a_ptr], #368]\n" ASM_PREFETCH("[%[a_ptr], #1984]") + + // Unroll 3 + "fmla %[r0].4s, v5.4s, %[x0].s[3]\n" + "ldr q5, [%[a_ptr], #384]\n" + "fmla %[r1].4s, v6.4s, %[x0].s[3]\n" + "ldr q6, [%[a_ptr], #400]\n" + "fmla %[r2].4s, v7.4s, %[x0].s[3]\n" + "ldr q7, [%[a_ptr], #416]\n" + "fmla %[r3].4s, v8.4s, %[x0].s[3]\n" ASM_PREFETCH("[%[a_ptr], #2048]") + "ldr q8, [%[a_ptr], #432]\n" + "fmla %[r4].4s, v9.4s, %[x0].s[3]\n" + "ldr q9, [%[a_ptr], #448]\n" + "fmla %[r5].4s, v10.4s, %[x0].s[3]\n" + "ldr q10, [%[a_ptr], #464]\n" + "fmla %[r6].4s, v11.4s, %[x0].s[3]\n" + "ldr q11, [%[a_ptr], #480]\n" + "fmla %[r7].4s, v12.4s, %[x0].s[3]\n" + "ldr q12, [%[a_ptr], #496]\n" ASM_PREFETCH("[%[a_ptr], #2112]") + + // Unroll 4 + "fmla %[r0].4s, v13.4s, %[x0a].s[0]\n" + "ldr %q[x0], [%[x_ptr]]\n" + "fmla %[r1].4s, v14.4s, %[x0a].s[0]\n" + "ldr q14, [%[a_ptr], #512]\n" + "fmla %[r2].4s, v15.4s, %[x0a].s[0]\n" + "ldr q15, [%[a_ptr], #528]\n" + "fmla %[r3].4s, v16.4s, %[x0a].s[0]\n" ASM_PREFETCH("[%[a_ptr], #2176]") + "ldr q16, [%[a_ptr], #544]\n" + "fmla %[r4].4s, v17.4s, %[x0a].s[0]\n" + "ldr q17, [%[a_ptr], #560]\n" + "fmla %[r5].4s, v18.4s, %[x0a].s[0]\n" + "ldr q18, [%[a_ptr], #576]\n" + "fmla %[r6].4s, v19.4s, %[x0a].s[0]\n" + "ldr q19, [%[a_ptr], #592]\n" + "fmla %[r7].4s, v20.4s, %[x0a].s[0]\n" + "ldr q20, [%[a_ptr], #608]\n" ASM_PREFETCH("[%[a_ptr], #2240]") + + // Unroll 5 + "fmla %[r0].4s, v21.4s, %[x0a].s[1]\n" + "ldr q21, [%[a_ptr], #624]\n" + "fmla %[r1].4s, v22.4s, %[x0a].s[1]\n" + "ldr q22, [%[a_ptr], #640]\n" + "fmla %[r2].4s, v23.4s, %[x0a].s[1]\n" + "ldr q23, [%[a_ptr], #656]\n" + "fmla %[r3].4s, v2.4s, %[x0a].s[1]\n" + "ldr q2, [%[a_ptr], #672]\n" ASM_PREFETCH("[%[a_ptr], #2304]") + "fmla %[r4].4s, v3.4s, %[x0a].s[1]\n" + "ldr q3, [%[a_ptr], 
#688]\n" + "fmla %[r5].4s, v4.4s, %[x0a].s[1]\n" + "ldr q4, [%[a_ptr], #704]\n" + "fmla %[r6].4s, v5.4s, %[x0a].s[1]\n" + "ldr q5, [%[a_ptr], #720]\n" + "fmla %[r7].4s, v6.4s, %[x0a].s[1]\n" + "ldr q6, [%[a_ptr], #736]\n" ASM_PREFETCH("[%[a_ptr], #2368]") + + // Unroll 6 + "fmla %[r0].4s, v7.4s, %[x0a].s[2]\n" + "ldr q7, [%[a_ptr], #752]\n" + "fmla %[r1].4s, v8.4s, %[x0a].s[2]\n" + "ldr q8, [%[a_ptr], #768]\n" + "fmla %[r2].4s, v9.4s, %[x0a].s[2]\n" + "ldr q9, [%[a_ptr], #784]\n" + "fmla %[r3].4s, v10.4s, %[x0a].s[2]\n" + "ldr q10, [%[a_ptr], #800]\n" ASM_PREFETCH("[%[a_ptr], #2432]") + "fmla %[r4].4s, v11.4s, %[x0a].s[2]\n" + "ldr q11, [%[a_ptr], #816]\n" + "fmla %[r5].4s, v12.4s, %[x0a].s[2]\n" + "ldr q12, [%[a_ptr], #832]\n" + "fmla %[r6].4s, v14.4s, %[x0a].s[2]\n" + "ldr q13, [%[a_ptr], #848]\n" + "ldr q14, [%[a_ptr], #864]\n" + "fmla %[r7].4s, v15.4s, %[x0a].s[2]\n" + "ldr q15, [%[a_ptr], #880]\n" ASM_PREFETCH("[%[a_ptr], #2496]") + + // Unroll 7 + "fmla %[r0].4s, v16.4s, %[x0a].s[3]\n" + "ldr q16, [%[a_ptr], #896]\n" + "fmla %[r1].4s, v17.4s, %[x0a].s[3]\n" + "ldr q17, [%[a_ptr], #912]\n" + "fmla %[r2].4s, v18.4s, %[x0a].s[3]\n" + "ldr q18, [%[a_ptr], #928]\n" + "fmla %[r3].4s, v19.4s, %[x0a].s[3]\n" ASM_PREFETCH("[%[a_ptr], #2560]") + "ldr q19, [%[a_ptr], #944]\n" + "fmla %[r4].4s, v20.4s, %[x0a].s[3]\n" + "ldr q20, [%[a_ptr], #960]\n" + "fmla %[r5].4s, v21.4s, %[x0a].s[3]\n" + "ldr q21, [%[a_ptr], #976]\n" + "add %[a_ptr], %[a_ptr], #1024\n" + "fmla %[r6].4s, v22.4s, %[x0a].s[3]\n" + "ldr q22, [%[a_ptr], #-32]\n" + "fmla %[r7].4s, v23.4s, %[x0a].s[3]\n" + "ldr q23, [%[a_ptr], #-16]\n" ASM_PREFETCH("[%[a_ptr], #1600]") + "bne 1b\n" + + // Detached final iteration + "2:\n" + + // Unroll 0 + "fmla %[r0].4s, v2.4s, %[x0].s[0]\n" + "ldr %q[x0a], [%[x_ptr], #16]\n" + "fmla %[r1].4s, v3.4s, %[x0].s[0]\n" + "ldr q3, [%[a_ptr], #0]\n" + "subs %w[k], %w[k], #1\n" + "fmla %[r2].4s, v4.4s, %[x0].s[0]\n" + "ldr q4, [%[a_ptr], #16]\n" + "fmla %[r3].4s, v5.4s, %[x0].s[0]\n" + "ldr q5, [%[a_ptr], #32]\n" + "add %[x_ptr], %[x_ptr], #32\n" + "fmla %[r4].4s, v6.4s, %[x0].s[0]\n" + "ldr q6, [%[a_ptr], #48]\n" + "fmla %[r5].4s, v7.4s, %[x0].s[0]\n" + "ldr q7, [%[a_ptr], #64]\n" + "fmla %[r6].4s, v8.4s, %[x0].s[0]\n" + "ldr q8, [%[a_ptr], #80]\n" + "fmla %[r7].4s, v9.4s, %[x0].s[0]\n" + "ldr q9, [%[a_ptr], #96]\n" + + // Unroll 1 + "fmla %[r0].4s, v10.4s, %[x0].s[1]\n" + "ldr q10, [%[a_ptr], #112]\n" + "fmla %[r1].4s, v11.4s, %[x0].s[1]\n" + "ldr q11, [%[a_ptr], #128]\n" + "fmla %[r2].4s, v12.4s, %[x0].s[1]\n" + "ldr q12, [%[a_ptr], #144]\n" + "fmla %[r3].4s, v13.4s, %[x0].s[1]\n" + "ldr q13, [%[a_ptr], #160]\n" + "fmla %[r4].4s, v14.4s, %[x0].s[1]\n" + "ldr q14, [%[a_ptr], #176]\n" + "fmla %[r5].4s, v15.4s, %[x0].s[1]\n" + "ldr q15, [%[a_ptr], #192]\n" + "fmla %[r6].4s, v16.4s, %[x0].s[1]\n" + "ldr q16, [%[a_ptr], #208]\n" + "fmla %[r7].4s, v17.4s, %[x0].s[1]\n" + "ldr q17, [%[a_ptr], #224]\n" + + // Unroll 2 + "fmla %[r0].4s, v18.4s, %[x0].s[2]\n" + "ldr q18, [%[a_ptr], #240]\n" + "fmla %[r1].4s, v19.4s, %[x0].s[2]\n" + "ldr q19, [%[a_ptr], #256]\n" + "fmla %[r2].4s, v20.4s, %[x0].s[2]\n" + "ldr q20, [%[a_ptr], #272]\n" + "fmla %[r3].4s, v21.4s, %[x0].s[2]\n" + "ldr q21, [%[a_ptr], #288]\n" + "fmla %[r4].4s, v22.4s, %[x0].s[2]\n" + "ldr q22, [%[a_ptr], #304]\n" + "fmla %[r5].4s, v23.4s, %[x0].s[2]\n" + "ldr q23, [%[a_ptr], #320]\n" + "fmla %[r6].4s, v3.4s, %[x0].s[2]\n" + "ldr q2, [%[a_ptr], #336]\n" + "ldr q3, [%[a_ptr], #352]\n" + "fmla %[r7].4s, v4.4s, %[x0].s[2]\n" + "ldr q4, [%[a_ptr], #368]\n" + 
+ // Unroll 3 + "fmla %[r0].4s, v5.4s, %[x0].s[3]\n" + "ldr q5, [%[a_ptr], #384]\n" + "fmla %[r1].4s, v6.4s, %[x0].s[3]\n" + "ldr q6, [%[a_ptr], #400]\n" + "fmla %[r2].4s, v7.4s, %[x0].s[3]\n" + "ldr q7, [%[a_ptr], #416]\n" + "fmla %[r3].4s, v8.4s, %[x0].s[3]\n" + "ldr q8, [%[a_ptr], #432]\n" + "fmla %[r4].4s, v9.4s, %[x0].s[3]\n" + "ldr q9, [%[a_ptr], #448]\n" + "fmla %[r5].4s, v10.4s, %[x0].s[3]\n" + "ldr q10, [%[a_ptr], #464]\n" + "fmla %[r6].4s, v11.4s, %[x0].s[3]\n" + "ldr q11, [%[a_ptr], #480]\n" + "fmla %[r7].4s, v12.4s, %[x0].s[3]\n" + "ldr q12, [%[a_ptr], #496]\n" + + // Unroll 4 + "fmla %[r0].4s, v13.4s, %[x0a].s[0]\n" + "fmla %[r1].4s, v14.4s, %[x0a].s[0]\n" + "ldr q14, [%[a_ptr], #512]\n" + "fmla %[r2].4s, v15.4s, %[x0a].s[0]\n" + "ldr q15, [%[a_ptr], #528]\n" + "fmla %[r3].4s, v16.4s, %[x0a].s[0]\n" + "ldr q16, [%[a_ptr], #544]\n" + "fmla %[r4].4s, v17.4s, %[x0a].s[0]\n" + "ldr q17, [%[a_ptr], #560]\n" + "fmla %[r5].4s, v18.4s, %[x0a].s[0]\n" + "ldr q18, [%[a_ptr], #576]\n" + "fmla %[r6].4s, v19.4s, %[x0a].s[0]\n" + "ldr q19, [%[a_ptr], #592]\n" + "fmla %[r7].4s, v20.4s, %[x0a].s[0]\n" + "ldr q20, [%[a_ptr], #608]\n" + + // Unroll 5 + "fmla %[r0].4s, v21.4s, %[x0a].s[1]\n" + "ldr q21, [%[a_ptr], #624]\n" + "fmla %[r1].4s, v22.4s, %[x0a].s[1]\n" + "ldr q22, [%[a_ptr], #640]\n" + "fmla %[r2].4s, v23.4s, %[x0a].s[1]\n" + "ldr q23, [%[a_ptr], #656]\n" + "fmla %[r3].4s, v2.4s, %[x0a].s[1]\n" + "add %[a_ptr], %[a_ptr], #672\n" + "fmla %[r4].4s, v3.4s, %[x0a].s[1]\n" + "fmla %[r5].4s, v4.4s, %[x0a].s[1]\n" + "fmla %[r6].4s, v5.4s, %[x0a].s[1]\n" + "fmla %[r7].4s, v6.4s, %[x0a].s[1]\n" + + // Unroll 6 + "fmla %[r0].4s, v7.4s, %[x0a].s[2]\n" + "fmla %[r1].4s, v8.4s, %[x0a].s[2]\n" + "fmla %[r2].4s, v9.4s, %[x0a].s[2]\n" + "fmla %[r3].4s, v10.4s, %[x0a].s[2]\n" + "fmla %[r4].4s, v11.4s, %[x0a].s[2]\n" + "fmla %[r5].4s, v12.4s, %[x0a].s[2]\n" + "fmla %[r6].4s, v14.4s, %[x0a].s[2]\n" + "fmla %[r7].4s, v15.4s, %[x0a].s[2]\n" + + // Unroll 7 + "fmla %[r0].4s, v16.4s, %[x0a].s[3]\n" + "fmla %[r1].4s, v17.4s, %[x0a].s[3]\n" + "fmla %[r2].4s, v18.4s, %[x0a].s[3]\n" + "fmla %[r3].4s, v19.4s, %[x0a].s[3]\n" + "fmla %[r4].4s, v20.4s, %[x0a].s[3]\n" + "fmla %[r5].4s, v21.4s, %[x0a].s[3]\n" + "fmla %[r6].4s, v22.4s, %[x0a].s[3]\n" + "fmla %[r7].4s, v23.4s, %[x0a].s[3]\n" + : + [a_ptr] "+r"(a_ptr), [x_ptr] "+r"(x_ptr), + [x0] "+w"(x0), [x0a] "+w"(x0a), [k] "+r"(k), + [r0] "+w"(r0), [r1] "+w"(r1), [r2] "+w"(r2), [r3] "+w"(r3), + [r4] "+w"(r4), [r5] "+w"(r5), [r6] "+w"(r6), [r7] "+w"(r7) + : + : "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "x20", "x21", "cc", "memory"); + } + + // Deal with ragged M + if(M % 8) + { + int l = (M % 8) - 1; + + __asm __volatile( + "ldr q2, [%[a_ptr], #0]\n" + "ldr q3, [%[a_ptr], #16]\n" + "ldr q4, [%[a_ptr], #32]\n" + "ldr q5, [%[a_ptr], #48]\n" + "ldr q6, [%[a_ptr], #64]\n" + "ldr q7, [%[a_ptr], #80]\n" + "ldr q8, [%[a_ptr], #96]\n" + "ldr q9, [%[a_ptr], #112]\n" + "ldr %s[x0], [%[x_ptr]]\n" + "add %[a_ptr], %[a_ptr], #128\n" + "add %[x_ptr], %[x_ptr], #4\n" + + "cbz %w[l], 2f\n" + + "1:\n" + "fmla %[r0].4s, v2.4s, %[x0].s[0]\n" + "ldr q2, [%[a_ptr], #0]\n" + "subs %w[l], %w[l], #1\n" + "fmla %[r1].4s, v3.4s, %[x0].s[0]\n" + "ldr q3, [%[a_ptr], #16]\n" + "fmla %[r2].4s, v4.4s, %[x0].s[0]\n" + "ldr q4, [%[a_ptr], #32]\n" + "fmla %[r3].4s, v5.4s, %[x0].s[0]\n" + "ldr q5, [%[a_ptr], #48]\n" + "fmla %[r4].4s, v6.4s, %[x0].s[0]\n" + "ldr q6, [%[a_ptr], #64]\n" + "fmla 
%[r5].4s, v7.4s, %[x0].s[0]\n" + "ldr q7, [%[a_ptr], #80]\n" + "fmla %[r6].4s, v8.4s, %[x0].s[0]\n" + "ldr q8, [%[a_ptr], #96]\n" + "fmla %[r7].4s, v9.4s, %[x0].s[0]\n" + "ldr q9, [%[a_ptr], #112]\n" + "ldr %s[x0], [%[x_ptr]]\n" + "add %[a_ptr], %[a_ptr], #128\n" + "add %[x_ptr], %[x_ptr], #4\n" + "bne 1b\n" + + "2:\n" + + "fmla %[r0].4s, v2.4s, %[x0].s[0]\n" + "fmla %[r1].4s, v3.4s, %[x0].s[0]\n" + "fmla %[r2].4s, v4.4s, %[x0].s[0]\n" + "fmla %[r3].4s, v5.4s, %[x0].s[0]\n" + "fmla %[r4].4s, v6.4s, %[x0].s[0]\n" + "fmla %[r5].4s, v7.4s, %[x0].s[0]\n" + "fmla %[r6].4s, v8.4s, %[x0].s[0]\n" + "fmla %[r7].4s, v9.4s, %[x0].s[0]\n" + : + [a_ptr] "+r"(a_ptr), [x_ptr] "+r"(x_ptr), + [x0] "+w"(x0), [l] "+r"(l), + [r0] "+w"(r0), [r1] "+w"(r1), [r2] "+w"(r2), [r3] "+w"(r3), + [r4] "+w"(r4), [r5] "+w"(r5), [r6] "+w"(r6), [r7] "+w"(r7) + : + : "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "cc", "memory"); + } + + if(l == 32) + { + // Fast path + vst1q_f32(y_ptr, r0); + vst1q_f32(y_ptr + 4, r1); + vst1q_f32(y_ptr + 8, r2); + vst1q_f32(y_ptr + 12, r3); + vst1q_f32(y_ptr + 16, r4); + vst1q_f32(y_ptr + 20, r5); + vst1q_f32(y_ptr + 24, r6); + vst1q_f32(y_ptr + 28, r7); + } + else + { + int vecs = l / 4; + int oddbits = l % 4; + + if(oddbits) + { + // As above - slowest path deals with vectors plus odd bits + float32x4_t oddvec; + + do + { + if(vecs == 0) + { + oddvec = r0; + break; + } + + vst1q_f32(y_ptr, r0); + if(--vecs == 0) + { + oddvec = r1; + break; + } + + vst1q_f32(y_ptr + 4, r1); + if(--vecs == 0) + { + oddvec = r2; + break; + } + + vst1q_f32(y_ptr + 8, r2); + if(--vecs == 0) + { + oddvec = r3; + break; + } + + vst1q_f32(y_ptr + 12, r3); + if(--vecs == 0) + { + oddvec = r4; + break; + } + + vst1q_f32(y_ptr + 16, r4); + if(--vecs == 0) + { + oddvec = r5; + break; + } + + vst1q_f32(y_ptr + 20, r5); + if(--vecs == 0) + { + oddvec = r6; + break; + } + + vst1q_f32(y_ptr + 24, r6); + oddvec = r7; + } + while(0); + + float *oddbase = y_ptr + l - oddbits; + + switch(oddbits) + { + case 3: + vst1q_lane_f32(oddbase + 2, oddvec, 2); + // fall through + case 2: + vst1q_lane_f32(oddbase + 1, oddvec, 1); + // fall through + case 1: + vst1q_lane_f32(oddbase, oddvec, 0); + break; + + default: + // oddbits must be 1, 2 or 3. + UNREACHABLE("Impossible case in switch."); + } + } + else + { + // As above - medium path deals with vectors only + do + { + if(vecs == 0) + { + UNREACHABLE("vecs and oddbits can't both be 0"); + } + + vst1q_f32(y_ptr, r0); + if(--vecs == 0) + { + break; + } + + vst1q_f32(y_ptr + 4, r1); + if(--vecs == 0) + { + break; + } + + vst1q_f32(y_ptr + 8, r2); + if(--vecs == 0) + { + break; + } + + vst1q_f32(y_ptr + 12, r3); + if(--vecs == 0) + { + break; + } + + vst1q_f32(y_ptr + 16, r4); + if(--vecs == 0) + { + break; + } + + vst1q_f32(y_ptr + 20, r5); + if(--vecs == 0) + { + break; + } + + vst1q_f32(y_ptr + 24, r6); + } + while(0); + } + } + } +} + +} // namespace arm_gemm + +#endif // aarch64 diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp new file mode 100644 index 0000000000..5b9bd72c89 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +namespace arm_gemm +{ +// Actual kernel implementations +void a64_sgemv_trans(const float *, const float *, float *, float, int, int, int); + +// Transposed SGEMV strategy class. +class sgemv_trans +{ +public: + typedef float operand_type; + typedef float result_type; + + typedef void (*kern_type)(const float *, const float *, float *, float, int, int, int); + + /* Kernel blocking parameters */ + static const int out_width = 96; + static const int k_unroll = 1; + + kern_type kernel = a64_sgemv_trans; + + sgemv_trans(const CPUInfo *ci) + { + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp new file mode 100644 index 0000000000..3309baff3a --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp @@ -0,0 +1,913 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include <cstddef>
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+// Kernel implementation - transposed GEMV
+//
+// The kernel will process "M" rows of A (= steps of dot product) and "N"
+// columns (= dot products total)
+//
+// General plan is to do as many columns simultaneously as possible - a
+// reasonable limit is half the NEON regfile = 64 total accumulators.
+//
+// It's possible that messing around with sub-blocking M and N can yield
+// higher performance, but that's left to the outer loop. In this kernel we
+// process all of M at the same time.
+
+// How far ahead to prefetch for the first and subsequent prefetches.
+// These values work for A72 on JunoR2...
+
+#define FIRST_PFD 9
+#define PFD 6
+
+namespace arm_gemm
+{
+void a64_sgemv_trans(const float *Astart, const float *Xstart, float *Ystart, float alpha, int lda, int M, int N)
+{
+    const float *a_ptr_base = Astart;
+    float *y_ptr = Ystart;
+
+    register const float32x4_t va asm("v1") = vdupq_n_f32(alpha);
+
+    int firstpfd = FIRST_PFD;
+    if(firstpfd > M)
+    {
+        firstpfd = (M - 1);
+    }
+
+    int pfd = PFD;
+    if(pfd > M)
+    {
+        pfd = (M - 1);
+    }
+
+    ptrdiff_t jump = lda * sizeof(int);
+
+    for(; N >= 96; N -= 96)
+    {
+        int k = M - 1;
+
+        const float *a_ptr       = a_ptr_base;
+        const float *x_ptr       = Xstart;
+        const float *pf_ptr      = a_ptr;
+        const float *firstpf_ptr = a_ptr;
+        const float *pf_limit    = a_ptr + (M * lda);
+
+        for(int i = 0; i < firstpfd; i++)
+        {
+            prefetch_1x(firstpf_ptr);
+            firstpf_ptr += lda;
+        }
+
+        for(int i = 0; i < pfd; i++)
+        {
+            prefetch_5x(pf_ptr + 16);
+            pf_ptr += lda;
+        }
+
+        a_ptr_base += 96;
+
+        __asm __volatile(
+            "movi v8.4s,#0x0\n"
+            "ldr w0, [%[x_ptr]]\n"
+            "movi v9.4s,#0x0\n"
+            "ldr q2, [%[a_ptr], #0]\n"
+            "movi v10.4s,#0x0\n"
+            "ldr q3, [%[a_ptr], #0x10]\n"
+            "movi v11.4s,#0x0\n"
+            "ldr q4, [%[a_ptr], #0x20]\n"
+            "movi v12.4s,#0x0\n"
+            "ldr q5, [%[a_ptr], #0x30]\n"
+            "movi v13.4s,#0x0\n"
+            "ldr q6, [%[a_ptr], #0x40]\n"
+            "movi v14.4s,#0x0\n"
+            "ldr q7, [%[a_ptr], #0x50]\n"
+            "movi v15.4s,#0x0\n" ASM_PREFETCH("[%[firstpf_ptr]]")
+            "movi v16.4s, #0x0\n"
+            "movi v17.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #64]")
+            "movi v18.4s, #0x0\n"
+            "movi v19.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #128]")
+            "movi v20.4s, #0x0\n"
+            "movi v21.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #192]")
+            "movi v22.4s, #0x0\n"
+            "movi v23.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #256]")
+            "movi v24.4s, #0x0\n"
+            "movi v25.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #320]")
+            "movi v26.4s, #0x0\n"
+            "movi v27.4s, #0x0\n"
+            "add %[pf_ptr], %[pf_ptr], %[jump]\n"
+            "movi v28.4s, #0x0\n"
+            "add %[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
+            "movi v29.4s, #0x0\n"
+            "movi v30.4s, #0x0\n"
+            "movi v31.4s, #0x0\n"
+
+            // Skip everything if there are no iterations of the main loop to do.
+            "cbz %w[k], 10f\n"
+
+            // Loop with all prefetches. Exit this loop when firstpf_ptr
+            // hits pf_limit.
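+            // (The "firstpf" pointer runs FIRST_PFD rows ahead and touches the first cache
+            // line of each row; the "pf" pointer runs PFD rows ahead and covers the rest of
+            // the 96-float row. Both advance by %[jump] - one row of A - per iteration.)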
+ "1:\n" + "dup v0.4s, w0\n" + "ldr w0, [%[x_ptr], #4]\n" + "add %[x_ptr], %[x_ptr], #0x4\n" + "fmla v8.4s, v2.4s, v0.4s\n" + "ldr q2, [%[a_ptr], #0x60]\n" + "fmla v9.4s, v3.4s, v0.4s\n" + "ldr q3, [%[a_ptr], #0x70]\n" ASM_PREFETCH("[%[firstpf_ptr]]") + "fmla v10.4s, v4.4s, v0.4s\n" + "ldr q4, [%[a_ptr], #0x80]\n" + "add %[firstpf_ptr], %[firstpf_ptr], %[jump]\n" + "fmla v11.4s, v5.4s, v0.4s\n" + "ldr q5, [%[a_ptr], #0x90]\n" + "sub %w[k], %w[k], #1\n" ASM_PREFETCH("[%[x_ptr], #128]") + "fmla v12.4s, v6.4s, v0.4s\n" + "ldr q6, [%[a_ptr], #0xa0]\n" + "fmla v13.4s, v7.4s, v0.4s\n" + "ldr q7, [%[a_ptr], #0xb0]\n" ASM_PREFETCH("[%[pf_ptr], #0x40]") + "fmla v14.4s, v2.4s, v0.4s\n" + "ldr q2, [%[a_ptr], #0xc0]\n" + "fmla v15.4s, v3.4s, v0.4s\n" + "ldr q3, [%[a_ptr], #0xd0]\n" + "fmla v16.4s, v4.4s, v0.4s\n" + "ldr q4, [%[a_ptr], #0xe0]\n" + "fmla v17.4s, v5.4s, v0.4s\n" + "ldr q5, [%[a_ptr], #0xf0]\n" ASM_PREFETCH("[%[pf_ptr], #0x80]") + "fmla v18.4s, v6.4s, v0.4s\n" + "ldr q6, [%[a_ptr], #0x100]\n" + "fmla v19.4s, v7.4s, v0.4s\n" + "ldr q7, [%[a_ptr], #0x110]\n" + "fmla v20.4s, v2.4s, v0.4s\n" + "ldr q2, [%[a_ptr], #0x120]\n" + "fmla v21.4s, v3.4s, v0.4s\n" + "ldr q3, [%[a_ptr], #0x130]\n" ASM_PREFETCH("[%[pf_ptr], #0xc0]") + "fmla v22.4s, v4.4s, v0.4s\n" + "ldr q4, [%[a_ptr], #0x140]\n" + "fmla v23.4s, v5.4s, v0.4s\n" + "ldr q5, [%[a_ptr], #0x150]\n" + "fmla v24.4s, v6.4s, v0.4s\n" + "ldr q6, [%[a_ptr], #0x160]\n" + "fmla v25.4s, v7.4s, v0.4s\n" + "ldr q7, [%[a_ptr], #0x170]\n" ASM_PREFETCH("[%[pf_ptr], #0x100]") + "add %[a_ptr], %[a_ptr], %[jump]\n" + "fmla v26.4s, v2.4s, v0.4s\n" + "ldr q2, [%[a_ptr], #0x00]\n" + "fmla v27.4s, v3.4s, v0.4s\n" + "ldr q3, [%[a_ptr], #0x10]\n" + "fmla v28.4s, v4.4s, v0.4s\n" + "ldr q4, [%[a_ptr], #0x20]\n" + "fmla v29.4s, v5.4s, v0.4s\n" + "ldr q5, [%[a_ptr], #0x30]\n" ASM_PREFETCH("[%[pf_ptr], #0x140]") + "fmla v30.4s, v6.4s, v0.4s\n" + "add %[pf_ptr], %[pf_ptr], %[jump]\n" + "ldr q6, [%[a_ptr], #0x40]\n" + "fmla v31.4s, v7.4s, v0.4s\n" + "cmp %[firstpf_ptr], %[pf_limit]\n" + "ldr q7, [%[a_ptr], #0x50]\n" + "blt 1b\n" + + // Check that there are still "main" prefetches to do. + "cmp %[pf_ptr], %[pf_limit]\n" + "bge 9f\n" + + // Just the main prefetches, exit this loop when pf_ptr hits pf_limit. 
+ "8:\n" + "dup v0.4s, w0\n" + "ldr w0, [%[x_ptr], #4]\n" + "add %[x_ptr], %[x_ptr], #0x4\n" + "fmla v8.4s, v2.4s, v0.4s\n" + "ldr q2, [%[a_ptr], #0x60]\n" + "fmla v9.4s, v3.4s, v0.4s\n" + "ldr q3, [%[a_ptr], #0x70]\n" + "fmla v10.4s, v4.4s, v0.4s\n" + "ldr q4, [%[a_ptr], #0x80]\n" + "fmla v11.4s, v5.4s, v0.4s\n" + "ldr q5, [%[a_ptr], #0x90]\n" + "sub %w[k], %w[k], #1\n" ASM_PREFETCH("[%[x_ptr], #128]") + "fmla v12.4s, v6.4s, v0.4s\n" + "ldr q6, [%[a_ptr], #0xa0]\n" + "fmla v13.4s, v7.4s, v0.4s\n" + "ldr q7, [%[a_ptr], #0xb0]\n" ASM_PREFETCH("[%[pf_ptr], #0x40]") + "fmla v14.4s, v2.4s, v0.4s\n" + "ldr q2, [%[a_ptr], #0xc0]\n" + "fmla v15.4s, v3.4s, v0.4s\n" + "ldr q3, [%[a_ptr], #0xd0]\n" + "fmla v16.4s, v4.4s, v0.4s\n" + "ldr q4, [%[a_ptr], #0xe0]\n" + "fmla v17.4s, v5.4s, v0.4s\n" + "ldr q5, [%[a_ptr], #0xf0]\n" ASM_PREFETCH("[%[pf_ptr], #0x80]") + "fmla v18.4s, v6.4s, v0.4s\n" + "ldr q6, [%[a_ptr], #0x100]\n" + "fmla v19.4s, v7.4s, v0.4s\n" + "ldr q7, [%[a_ptr], #0x110]\n" + "fmla v20.4s, v2.4s, v0.4s\n" + "ldr q2, [%[a_ptr], #0x120]\n" + "fmla v21.4s, v3.4s, v0.4s\n" + "ldr q3, [%[a_ptr], #0x130]\n" ASM_PREFETCH("[%[pf_ptr], #0xc0]") + "fmla v22.4s, v4.4s, v0.4s\n" + "ldr q4, [%[a_ptr], #0x140]\n" + "fmla v23.4s, v5.4s, v0.4s\n" + "ldr q5, [%[a_ptr], #0x150]\n" + "fmla v24.4s, v6.4s, v0.4s\n" + "ldr q6, [%[a_ptr], #0x160]\n" + "fmla v25.4s, v7.4s, v0.4s\n" + "ldr q7, [%[a_ptr], #0x170]\n" ASM_PREFETCH("[%[pf_ptr], #0x100]") + "add %[a_ptr], %[a_ptr], %[jump]\n" + "fmla v26.4s, v2.4s, v0.4s\n" + "ldr q2, [%[a_ptr], #0x00]\n" + "fmla v27.4s, v3.4s, v0.4s\n" + "ldr q3, [%[a_ptr], #0x10]\n" + "fmla v28.4s, v4.4s, v0.4s\n" + "ldr q4, [%[a_ptr], #0x20]\n" + "fmla v29.4s, v5.4s, v0.4s\n" + "ldr q5, [%[a_ptr], #0x30]\n" ASM_PREFETCH("[%[pf_ptr], #0x140]") + "fmla v30.4s, v6.4s, v0.4s\n" + "add %[pf_ptr], %[pf_ptr], %[jump]\n" + "ldr q6, [%[a_ptr], #0x40]\n" + "fmla v31.4s, v7.4s, v0.4s\n" + "cmp %[pf_ptr], %[pf_limit]\n" + "ldr q7, [%[a_ptr], #0x50]\n" + "blt 8b\n" + + // Check that there is still work to do. + "9:\n" + "cmp %w[k], #0\n" + "beq 10f\n" + + // Loop without prefetches, exit when k hits 0. 
+ "2:\n" + "dup v0.4s, w0\n" + "ldr w0, [%[x_ptr], #4]\n" + "add %[x_ptr], %[x_ptr], #0x4\n" + "fmla v8.4s, v2.4s, v0.4s\n" + "ldr q2, [%[a_ptr], #0x60]\n" + "fmla v9.4s, v3.4s, v0.4s\n" + "ldr q3, [%[a_ptr], #0x70]\n" + "fmla v10.4s, v4.4s, v0.4s\n" + "ldr q4, [%[a_ptr], #0x80]\n" + "fmla v11.4s, v5.4s, v0.4s\n" + "ldr q5, [%[a_ptr], #0x90]\n" + "subs %w[k], %w[k], #1\n" + "fmla v12.4s, v6.4s, v0.4s\n" + "ldr q6, [%[a_ptr], #0xa0]\n" + "fmla v13.4s, v7.4s, v0.4s\n" + "ldr q7, [%[a_ptr], #0xb0]\n" + "fmla v14.4s, v2.4s, v0.4s\n" + "ldr q2, [%[a_ptr], #0xc0]\n" + "fmla v15.4s, v3.4s, v0.4s\n" + "ldr q3, [%[a_ptr], #0xd0]\n" + "fmla v16.4s, v4.4s, v0.4s\n" + "ldr q4, [%[a_ptr], #0xe0]\n" + "fmla v17.4s, v5.4s, v0.4s\n" + "ldr q5, [%[a_ptr], #0xf0]\n" + "fmla v18.4s, v6.4s, v0.4s\n" + "ldr q6, [%[a_ptr], #0x100]\n" + "fmla v19.4s, v7.4s, v0.4s\n" + "ldr q7, [%[a_ptr], #0x110]\n" + "fmla v20.4s, v2.4s, v0.4s\n" + "ldr q2, [%[a_ptr], #0x120]\n" + "fmla v21.4s, v3.4s, v0.4s\n" + "ldr q3, [%[a_ptr], #0x130]\n" + "fmla v22.4s, v4.4s, v0.4s\n" + "ldr q4, [%[a_ptr], #0x140]\n" + "fmla v23.4s, v5.4s, v0.4s\n" + "ldr q5, [%[a_ptr], #0x150]\n" + "fmla v24.4s, v6.4s, v0.4s\n" + "ldr q6, [%[a_ptr], #0x160]\n" + "fmla v25.4s, v7.4s, v0.4s\n" + "ldr q7, [%[a_ptr], #0x170]\n" + "add %[a_ptr], %[a_ptr], %[jump]\n" + "fmla v26.4s, v2.4s, v0.4s\n" + "ldr q2, [%[a_ptr], #0x00]\n" + "fmla v27.4s, v3.4s, v0.4s\n" + "ldr q3, [%[a_ptr], #0x10]\n" + "fmla v28.4s, v4.4s, v0.4s\n" + "ldr q4, [%[a_ptr], #0x20]\n" + "fmla v29.4s, v5.4s, v0.4s\n" + "ldr q5, [%[a_ptr], #0x30]\n" + "fmla v30.4s, v6.4s, v0.4s\n" + "ldr q6, [%[a_ptr], #0x40]\n" + "fmla v31.4s, v7.4s, v0.4s\n" + "ldr q7, [%[a_ptr], #0x50]\n" + "bne 2b\n" + + "10:\n" + + // Final iteration + "dup v0.4s, w0\n" + "fmla v8.4s, v2.4s, v0.4s\n" + "ldr q2, [%[a_ptr], #0x60]\n" + "fmla v9.4s, v3.4s, v0.4s\n" + "ldr q3, [%[a_ptr], #0x70]\n" + "fmla v10.4s, v4.4s, v0.4s\n" + "ldr q4, [%[a_ptr], #0x80]\n" + "fmla v11.4s, v5.4s, v0.4s\n" + "ldr q5, [%[a_ptr], #0x90]\n" + "fmla v12.4s, v6.4s, v0.4s\n" + "ldr q6, [%[a_ptr], #0xa0]\n" + "fmla v13.4s, v7.4s, v0.4s\n" + "ldr q7, [%[a_ptr], #0xb0]\n" + "fmla v14.4s, v2.4s, v0.4s\n" + "ldr q2, [%[a_ptr], #0xc0]\n" + "fmla v15.4s, v3.4s, v0.4s\n" + "ldr q3, [%[a_ptr], #0xd0]\n" + "fmla v16.4s, v4.4s, v0.4s\n" + "ldr q4, [%[a_ptr], #0xe0]\n" + "fmla v17.4s, v5.4s, v0.4s\n" + "ldr q5, [%[a_ptr], #0xf0]\n" + "fmla v18.4s, v6.4s, v0.4s\n" + "ldr q6, [%[a_ptr], #0x100]\n" + "fmla v19.4s, v7.4s, v0.4s\n" + "ldr q7, [%[a_ptr], #0x110]\n" + "fmla v20.4s, v2.4s, v0.4s\n" + "ldr q2, [%[a_ptr], #0x120]\n" + "fmla v21.4s, v3.4s, v0.4s\n" + "ldr q3, [%[a_ptr], #0x130]\n" + "fmla v22.4s, v4.4s, v0.4s\n" + "ldr q4, [%[a_ptr], #0x140]\n" + "fmla v23.4s, v5.4s, v0.4s\n" + "ldr q5, [%[a_ptr], #0x150]\n" + "fmla v24.4s, v6.4s, v0.4s\n" + "ldr q6, [%[a_ptr], #0x160]\n" + "fmla v25.4s, v7.4s, v0.4s\n" + "ldr q7, [%[a_ptr], #0x170]\n" + "fmla v26.4s, v2.4s, v0.4s\n" + "ldr q2, [%[y_ptr]]\n" + "fmla v27.4s, v3.4s, v0.4s\n" + "ldr q3, [%[y_ptr], #0x10]\n" + "fmla v28.4s, v4.4s, v0.4s\n" + "ldr q4, [%[y_ptr], #0x20]\n" + "fmla v29.4s, v5.4s, v0.4s\n" + "ldr q5, [%[y_ptr], #0x30]\n" + "fmla v30.4s, v6.4s, v0.4s\n" + "ldr q6, [%[y_ptr], #0x40]\n" + "fmla v31.4s, v7.4s, v0.4s\n" + "ldr q7, [%[y_ptr], #0x50]\n" + + "fmla v2.4s, v8.4s, %[va].4s\n" + "ldr q8, [%[y_ptr], #0x60]\n" + "fmla v3.4s, v9.4s, %[va].4s\n" + "ldr q9, [%[y_ptr], #0x70]\n" + "fmla v4.4s, v10.4s, %[va].4s\n" + "ldr q10, [%[y_ptr], #0x80]\n" + "fmla v5.4s, v11.4s, %[va].4s\n" + "ldr q11, 
[%[y_ptr], #0x90]\n" + "fmla v6.4s, v12.4s, %[va].4s\n" + "ldr q12, [%[y_ptr], #0xa0]\n" + "str q2, [%[y_ptr], #0x00]\n" + "fmla v7.4s, v13.4s, %[va].4s\n" + "ldr q13, [%[y_ptr], #0xb0]\n" + "str q3, [%[y_ptr], #0x10]\n" + "fmla v8.4s, v14.4s, %[va].4s\n" + "ldr q14, [%[y_ptr], #0xc0]\n" + "str q4, [%[y_ptr], #0x20]\n" + "fmla v9.4s, v15.4s, %[va].4s\n" + "ldr q15, [%[y_ptr], #0xd0]\n" + "str q5, [%[y_ptr], #0x30]\n" + "fmla v10.4s, v16.4s, %[va].4s\n" + "ldr q16, [%[y_ptr], #0xe0]\n" + "str q6, [%[y_ptr], #0x40]\n" + "fmla v11.4s, v17.4s, %[va].4s\n" + "ldr q17, [%[y_ptr], #0xf0]\n" + "str q7, [%[y_ptr], #0x50]\n" + "fmla v12.4s, v18.4s, %[va].4s\n" + "ldr q18, [%[y_ptr], #0x100]\n" + "str q8, [%[y_ptr], #0x60]\n" + "fmla v13.4s, v19.4s, %[va].4s\n" + "ldr q19, [%[y_ptr], #0x110]\n" + "str q9, [%[y_ptr], #0x70]\n" + "fmla v14.4s, v20.4s, %[va].4s\n" + "ldr q20, [%[y_ptr], #0x120]\n" + "str q10, [%[y_ptr], #0x80]\n" + "fmla v15.4s, v21.4s, %[va].4s\n" + "ldr q21, [%[y_ptr], #0x130]\n" + "str q11, [%[y_ptr], #0x90]\n" + "fmla v16.4s, v22.4s, %[va].4s\n" + "ldr q22, [%[y_ptr], #0x140]\n" + "str q12, [%[y_ptr], #0xa0]\n" + "fmla v17.4s, v23.4s, %[va].4s\n" + "ldr q23, [%[y_ptr], #0x150]\n" + "str q13, [%[y_ptr], #0xb0]\n" + "fmla v18.4s, v24.4s, %[va].4s\n" + "ldr q24, [%[y_ptr], #0x160]\n" + "str q14, [%[y_ptr], #0xc0]\n" + "fmla v19.4s, v25.4s, %[va].4s\n" + "ldr q25, [%[y_ptr], #0x170]\n" + "str q15, [%[y_ptr], #0xd0]\n" + "fmla v20.4s, v26.4s, %[va].4s\n" + "str q16, [%[y_ptr], #0xe0]\n" + "fmla v21.4s, v27.4s, %[va].4s\n" + "str q17, [%[y_ptr], #0xf0]\n" + "fmla v22.4s, v28.4s, %[va].4s\n" + "str q18, [%[y_ptr], #0x100]\n" + "fmla v23.4s, v29.4s, %[va].4s\n" + "str q19, [%[y_ptr], #0x110]\n" + "fmla v24.4s, v30.4s, %[va].4s\n" + "str q20, [%[y_ptr], #0x120]\n" + "fmla v25.4s, v31.4s, %[va].4s\n" + "str q21, [%[y_ptr], #0x130]\n" + + "stp q22, q23, [%[y_ptr], #0x140]\n" + "stp q24, q25, [%[y_ptr], #0x160]\n" + "add %[y_ptr], %[y_ptr], #0x180\n" + + : [a_ptr] "+r"(a_ptr), [x_ptr] "+r"(x_ptr), [y_ptr] "+r"(y_ptr), [k] "+r"(k), [pf_ptr] "+r"(pf_ptr), [firstpf_ptr] "+r"(firstpf_ptr) + : [jump] "r"(jump), [va] "w"(va), [pf_limit] "r"(pf_limit) + : "w0", "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31", "cc"); + } + + if(N > 0) + { + // Handle N tail - up to 95 stragglers. + // This is 0-23 vectors, plus optionally an 64-bit vector and/or a + // single value for the remainder. + + // Independent pointers into the matrix for the odd 2 and odd 1. + // Double up as flag to indicate whether they are needed. + const float *odd2_aptr = NULL; + const float *odd1_aptr = NULL; + + // Figure out how much work we need to do. + int numvecs = N / 4; + int rem = N % 4; + int k = M; + + // Set up pointers for the odd 2/1 if needed. + if(rem >= 2) + { + odd2_aptr = a_ptr_base + (numvecs * 4); + } + + if(rem & 1) + { + odd1_aptr = a_ptr_base + (numvecs * 4) + (odd2_aptr == NULL ? 0 : 2); + } + + const float *a_ptr = a_ptr_base; + const float *firstpf_ptr = a_ptr_base; + const float *pf_ptr = a_ptr_base; + const float *pf_limit = a_ptr + (M * lda); + + const float *x_ptr = Xstart; + int vecs = 0; // Working variable to count how many vectors to work on. + int dopf = 1; // Track whether we are doing prefetches. + + // Figure out how many cache lines we need to prefetch each time. 
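+        // (Assuming 64-byte cache lines, i.e. 16 floats per line - hence the (N + 15) / 16
+        // below; with at most 95 leftover columns that is at most 6 lines per row.)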
+ int numpfs = (N + 15) / 16; + + // Do initial prefetches + for(int i = 0; i < firstpfd + 1; i++) + { + prefetch_1x(firstpf_ptr); + firstpf_ptr += lda; + } + + // Do "main" prefetches - adapt number to the number we actually need. + if(numpfs > 1) + { + for(int i = 0; i < pfd + 1; i++) + { + switch(numpfs) + { + case 2: + prefetch_1x(pf_ptr + 16); + break; + + case 3: + prefetch_2x(pf_ptr + 16); + break; + + case 4: + prefetch_3x(pf_ptr + 16); + break; + + case 5: + prefetch_4x(pf_ptr + 16); + break; + + case 6: + prefetch_5x(pf_ptr + 16); + break; + + default: + UNREACHABLE("Impossible."); + } + pf_ptr += lda; + } + } + else + { + // Just disable additional prefetches + dopf = 0; + } + + // Do the real work + __asm __volatile( + // Initialize all the vectors - not worth skipping this if only + // some are needed. + "movi v8.4s,#0x0\n" + "ldr w0, [%[x_ptr]]\n" + "movi v9.4s,#0x0\n" + "movi v10.4s,#0x0\n" + "movi v11.4s,#0x0\n" + "movi v12.4s,#0x0\n" + "movi v13.4s,#0x0\n" + "movi v14.4s,#0x0\n" + "movi v15.4s,#0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v6.2s, #0x0\n" + "movi v5.2s, #0x0\n" + + "1:\n" ASM_PREFETCH("[%[firstpf_ptr]]\n") + "11:\n" + "dup v0.4s, w0\n" + "ldr w0, [%[x_ptr], #4]\n" + "add %[x_ptr], %[x_ptr], #4\n" + + "cbz %w[numvecs], 2f\n" + "mov %w[vecs], %w[numvecs]\n" + + // Vector 0 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7,[%[a_ptr], #0x00]\n" + "fmla v8.4s, v7.4s, v0.4s\n" + "beq 2f\n" + // Vector 1 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7,[%[a_ptr], #0x10]\n" + "fmla v9.4s, v7.4s, v0.4s\n" + "beq 2f\n" + // Vector 2 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7,[%[a_ptr], #0x20]\n" + "fmla v10.4s, v7.4s, v0.4s\n" + "beq 2f\n" + // Vector 3 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7,[%[a_ptr], #0x30]\n" + "fmla v11.4s, v7.4s, v0.4s\n" + // Prefetch + "cbz %w[dopf], 3f\n" ASM_PREFETCH("[%[pf_ptr], #0x40]") + "3:\n" + "beq 2f\n" + + // Vector 4 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7,[%[a_ptr], #0x40]\n" + "fmla v12.4s, v7.4s, v0.4s\n" + "beq 2f\n" + // Vector 5 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7,[%[a_ptr], #0x50]\n" + "fmla v13.4s, v7.4s, v0.4s\n" + "beq 2f\n" + // Vector 6 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7,[%[a_ptr], #0x60]\n" + "fmla v14.4s, v7.4s, v0.4s\n" + "beq 2f\n" + // Vector 7 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7,[%[a_ptr], #0x70]\n" + "fmla v15.4s, v7.4s, v0.4s\n" + // Prefetch + "cbz %w[dopf], 4f\n" ASM_PREFETCH("[%[pf_ptr], #0x80]") + "4:\n" + "beq 2f\n" + + // Vector 8 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7,[%[a_ptr], #0x80]\n" + "fmla v16.4s, v7.4s, v0.4s\n" + "beq 2f\n" + // Vector 9 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7,[%[a_ptr], #0x90]\n" + "fmla v17.4s, v7.4s, v0.4s\n" + "beq 2f\n" + // Vector 10 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7,[%[a_ptr], #0xa0]\n" + "fmla v18.4s, v7.4s, v0.4s\n" + "beq 2f\n" + // Vector 11 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7,[%[a_ptr], #0xb0]\n" + "fmla v19.4s, v7.4s, v0.4s\n" + // Prefetch + "cbz %w[dopf], 5f\n" ASM_PREFETCH("[%[pf_ptr], #0xc0]") + "5:\n" + "beq 2f\n" + + // Vector 12 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7,[%[a_ptr], #0xc0]\n" + "fmla v20.4s, v7.4s, v0.4s\n" + "beq 2f\n" + // Vector 13 + "subs %w[vecs], %w[vecs], #1\n" + 
"ldr q7,[%[a_ptr], #0xd0]\n" + "fmla v21.4s, v7.4s, v0.4s\n" + "beq 2f\n" + // Vector 14 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7,[%[a_ptr], #0xe0]\n" + "fmla v22.4s, v7.4s, v0.4s\n" + "beq 2f\n" + // Vector 15 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7,[%[a_ptr], #0xf0]\n" + "fmla v23.4s, v7.4s, v0.4s\n" + // Prefetch + "cbz %w[dopf], 6f\n" ASM_PREFETCH("[%[pf_ptr], #0x100]") + "6:\n" + "beq 2f\n" + + // Vector 16 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7,[%[a_ptr], #0x100]\n" + "fmla v24.4s, v7.4s, v0.4s\n" + "beq 2f\n" + // Vector 17 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7,[%[a_ptr], #0x110]\n" + "fmla v25.4s, v7.4s, v0.4s\n" + "beq 2f\n" + // Vector 18 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7,[%[a_ptr], #0x120]\n" + "fmla v26.4s, v7.4s, v0.4s\n" + "beq 2f\n" + // Vector 19 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7,[%[a_ptr], #0x130]\n" + "fmla v27.4s, v7.4s, v0.4s\n" + // Prefetch + "cbz %w[dopf], 7f\n" ASM_PREFETCH("[%[pf_ptr], #0x140]") + "7:\n" + "beq 2f\n" + + // Vector 20 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7,[%[a_ptr], #0x140]\n" + "fmla v28.4s, v7.4s, v0.4s\n" + "beq 2f\n" + // Vector 21 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7,[%[a_ptr], #0x150]\n" + "fmla v29.4s, v7.4s, v0.4s\n" + "beq 2f\n" + // Vector 22 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7,[%[a_ptr], #0x160]\n" + "fmla v30.4s, v7.4s, v0.4s\n" + + "2:\n" + "add %[a_ptr], %[a_ptr], %[jump]\n" + + // Do the odd 2-vector, if needed + "cbz %[odd2_aptr], 8f\n" + "ldr d7, [%[odd2_aptr]]\n" + "fmla v6.2s, v7.2s, v0.2s\n" + "add %[odd2_aptr], %[odd2_aptr], %[jump]\n" + + "8:\n" + // Do the odd 1-vector, if needed + "cbz %[odd1_aptr], 9f\n" + "ldr s7, [%[odd1_aptr]]\n" + "fmla v5.2s, v7.2s, v0.2s\n" + "add %[odd1_aptr], %[odd1_aptr], %[jump]\n" + + // Get out if needed. + "9:\n" + "subs %w[k], %w[k], #1\n" + "beq 10f\n" + + // Update the "main" prefetch pointer, if it strays beyond the limit turn off "dopf" + "add %[pf_ptr], %[pf_ptr], %[jump]\n" + "cmp %[pf_ptr], %[pf_limit]\n" + "csel %w[dopf], %w[dopf], WZR, LT\n" + + // Update the "leading" prefetch pointer, don't do the first + // instruction of the loop if it's over the limit. 
+ "add %[firstpf_ptr], %[firstpf_ptr], %[jump]\n" + "cmp %[firstpf_ptr], %[pf_limit]\n" + "blt 1b\n" + "b 11b\n" + + // Now write out the outputs + "10:\n" + "cbz %w[numvecs], 12f\n" + "mov %w[vecs], %w[numvecs]\n" + + // Vector 0 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7, [%[y_ptr]]\n" + "fmla v7.4s, v8.4s, %[va].4s\n" + "str q7, [%[y_ptr]], #0x10\n" + "beq 12f\n" + // Vector 1 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7, [%[y_ptr]]\n" + "fmla v7.4s, v9.4s, %[va].4s\n" + "str q7, [%[y_ptr]], #0x10\n" + "beq 12f\n" + // Vector 2 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7, [%[y_ptr]]\n" + "fmla v7.4s, v10.4s, %[va].4s\n" + "str q7, [%[y_ptr]], #0x10\n" + "beq 12f\n" + // Vector 3 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7, [%[y_ptr]]\n" + "fmla v7.4s, v11.4s, %[va].4s\n" + "str q7, [%[y_ptr]], #0x10\n" + "beq 12f\n" + // Vector 4 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7, [%[y_ptr]]\n" + "fmla v7.4s, v12.4s, %[va].4s\n" + "str q7, [%[y_ptr]], #0x10\n" + "beq 12f\n" + // Vector 5 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7, [%[y_ptr]]\n" + "fmla v7.4s, v13.4s, %[va].4s\n" + "str q7, [%[y_ptr]], #0x10\n" + "beq 12f\n" + // Vector 6 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7, [%[y_ptr]]\n" + "fmla v7.4s, v14.4s, %[va].4s\n" + "str q7, [%[y_ptr]], #0x10\n" + "beq 12f\n" + // Vector 7 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7, [%[y_ptr]]\n" + "fmla v7.4s, v15.4s, %[va].4s\n" + "str q7, [%[y_ptr]], #0x10\n" + "beq 12f\n" + // Vector 8 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7, [%[y_ptr]]\n" + "fmla v7.4s, v16.4s, %[va].4s\n" + "str q7, [%[y_ptr]], #0x10\n" + "beq 12f\n" + // Vector 9 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7, [%[y_ptr]]\n" + "fmla v7.4s, v17.4s, %[va].4s\n" + "str q7, [%[y_ptr]], #0x10\n" + "beq 12f\n" + // Vector 10 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7, [%[y_ptr]]\n" + "fmla v7.4s, v18.4s, %[va].4s\n" + "str q7, [%[y_ptr]], #0x10\n" + "beq 12f\n" + // Vector 11 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7, [%[y_ptr]]\n" + "fmla v7.4s, v19.4s, %[va].4s\n" + "str q7, [%[y_ptr]], #0x10\n" + "beq 12f\n" + // Vector 12 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7, [%[y_ptr]]\n" + "fmla v7.4s, v20.4s, %[va].4s\n" + "str q7, [%[y_ptr]], #0x10\n" + "beq 12f\n" + // Vector 13 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7, [%[y_ptr]]\n" + "fmla v7.4s, v21.4s, %[va].4s\n" + "str q7, [%[y_ptr]], #0x10\n" + "beq 12f\n" + // Vector 14 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7, [%[y_ptr]]\n" + "fmla v7.4s, v22.4s, %[va].4s\n" + "str q7, [%[y_ptr]], #0x10\n" + "beq 12f\n" + // Vector 15 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7, [%[y_ptr]]\n" + "fmla v7.4s, v23.4s, %[va].4s\n" + "str q7, [%[y_ptr]], #0x10\n" + "beq 12f\n" + // Vector 16 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7, [%[y_ptr]]\n" + "fmla v7.4s, v24.4s, %[va].4s\n" + "str q7, [%[y_ptr]], #0x10\n" + "beq 12f\n" + // Vector 17 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7, [%[y_ptr]]\n" + "fmla v7.4s, v25.4s, %[va].4s\n" + "str q7, [%[y_ptr]], #0x10\n" + "beq 12f\n" + // Vector 18 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7, [%[y_ptr]]\n" + "fmla v7.4s, v26.4s, %[va].4s\n" + "str q7, [%[y_ptr]], #0x10\n" + "beq 12f\n" + // Vector 19 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7, [%[y_ptr]]\n" + "fmla v7.4s, v27.4s, %[va].4s\n" + "str q7, [%[y_ptr]], #0x10\n" + "beq 12f\n" + // Vector 20 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7, [%[y_ptr]]\n" + "fmla v7.4s, v28.4s, %[va].4s\n" + "str q7, [%[y_ptr]], #0x10\n" + "beq 12f\n" + // Vector 21 + "subs %w[vecs], %w[vecs], #1\n" + "ldr q7, [%[y_ptr]]\n" + "fmla v7.4s, 
v29.4s, %[va].4s\n"
+            "str q7, [%[y_ptr]], #0x10\n"
+            "beq 12f\n"
+            // Vector 22
+            "subs %w[vecs], %w[vecs], #1\n"
+            "ldr q7, [%[y_ptr]]\n"
+            "fmla v7.4s, v30.4s, %[va].4s\n"
+            "str q7, [%[y_ptr]], #0x10\n"
+
+            // Odd 2
+            "12:\n"
+            "cbz %[odd2_aptr], 13f\n"
+            "ldr d7, [%[y_ptr]]\n"
+            "fmla v7.2s, v6.2s, %[va].2s\n"
+            "str d7, [%[y_ptr]], #0x8\n"
+
+            // Odd 1
+            "13:\n"
+            "cbz %[odd1_aptr], 14f\n"
+            "ldr s7, [%[y_ptr]]\n"
+            "fmla v7.2s, v5.2s, %[va].2s\n"
+            "str s7, [%[y_ptr]]\n"
+
+            "14:\n"
+            : [a_ptr] "+r"(a_ptr), [x_ptr] "+r"(x_ptr), [y_ptr] "+r"(y_ptr), [k] "+r"(k),
+            [pf_ptr] "+r"(pf_ptr), [firstpf_ptr] "+r"(firstpf_ptr),
+            [odd1_aptr] "+r"(odd1_aptr), [odd2_aptr] "+r"(odd2_aptr),
+            [dopf] "+r"(dopf), [vecs] "+r"(vecs)
+            : [jump] "r"(jump), [va] "w"(va), [pf_limit] "r"(pf_limit), [numvecs] "r"(numvecs)
+            : "w0", "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
+            "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
+            "v27", "v28", "v29", "v30", "v31", "cc");
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/mergeresults.hpp b/src/core/NEON/kernels/arm_gemm/mergeresults.hpp
new file mode 100644
index 0000000000..2ab01d680c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/mergeresults.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+/* As some of the merges need these headers, but are all included in the
+ * arm_gemm namespace, put these headers here. */
+#include <arm_neon.h>
+
+#include "asmlib.hpp"
+#include "utils.hpp"
+
+namespace arm_gemm
+{
+template <unsigned int width, unsigned int height, typename Tin, typename Tout>
+inline void MergeResults(Tout *out, const Tin *in, int ldc, int y0, int ymax, int x0, int xmax, const Tout alpha, const Tout beta)
+{
+    int full_y_blocks = (ymax - y0) / height;
+    int y_remainder   = (ymax - y0) % height;
+    int y_blocks      = full_y_blocks + (y_remainder ? 1 : 0);
+
+    int full_x_blocks = (xmax - x0) / width;
+    int x_remainder   = (xmax - x0) % width;
+    int x_blocks      = full_x_blocks + (x_remainder ? 1 : 0);
+
+    for(int y_block = 0; y_block < y_blocks; y_block++)
+    {
+        int ybase = y0 + (y_block * height);
+
+        int fill_rows = (y_block < full_y_blocks) ? height : y_remainder;
+
+        for(int x_block = 0; x_block < x_blocks; x_block++)
+        {
+            int xbase = x0 + (x_block * width);
+
+            int fill_cols = (x_block < full_x_blocks) ?
width : x_remainder;
+
+            for(int row = 0; row < fill_rows; row++)
+            {
+                for(int col = 0; col < fill_cols; col++)
+                {
+                    Tout &p = out[(ybase + row) * ldc + xbase + col];
+
+                    p = (p * alpha) + (beta * in[row * width + col]);
+                }
+            }
+
+            in += (width * height);
+        }
+    }
+}
+
+#include "merges/list.hpp"
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp b/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp
new file mode 100644
index 0000000000..b44e56499f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __arm__
+
+#include <arm_neon.h>
+
+template <>
+inline void MergeResults<8, 6>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta)
+{
+    const float *inptr = in;
+    prefetch_6x(inptr);
+    prefetch_6x(inptr + 96);
+
+    float32x4_t av = vdupq_n_f32(alpha);
+    float32x4_t bv = vdupq_n_f32(beta);
+
+    for(int y = y0; y < ymax; y += 8)
+    {
+        float *outptr0 = out + (y * ldout) + x0;
+        float *outptr1 = outptr0 + ldout;
+        float *outptr2 = outptr1 + ldout;
+        float *outptr3 = outptr2 + ldout;
+        float *outptr4 = outptr3 + ldout;
+        float *outptr5 = outptr4 + ldout;
+
+        prefetch_2x(outptr0);
+        prefetch_2x(outptr1);
+        prefetch_2x(outptr2);
+        prefetch_2x(outptr3);
+        prefetch_2x(outptr4);
+        prefetch_2x(outptr5);
+
+        for(int i = x0; i < xmax; i += 8)
+        {
+            float dummyres[8];
+
+            /* Make sure we throw away results if Y isn't a multiple of 8.
+             * We do this by pointing the result pointer at a dummy buffer
+             * we later discard. */
+            if((y + 5) >= ymax)
+            {
+                switch((y + 5) - ymax)
+                {
+                    case 4:
+                        outptr1 = dummyres;
+                    case 3:
+                        outptr2 = dummyres;
+                    case 2:
+                        outptr3 = dummyres;
+                    case 1:
+                        outptr4 = dummyres;
+                    case 0:
+                        outptr5 = dummyres;
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+            }
+
+            /* For ragged X, manually copy over the valid results.
*/ + if((i + 7) >= xmax) + { + for(int xi = 0; xi < 8; xi++) + { + if((i + xi) < xmax) + { + *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta); + outptr0++; + *outptr1 = (alpha * inptr[xi + 8]) + (*outptr1 * beta); + outptr1++; + *outptr2 = (alpha * inptr[xi + 16]) + (*outptr2 * beta); + outptr2++; + *outptr3 = (alpha * inptr[xi + 24]) + (*outptr3 * beta); + outptr3++; + *outptr4 = (alpha * inptr[xi + 32]) + (*outptr4 * beta); + outptr4++; + *outptr5 = (alpha * inptr[xi + 40]) + (*outptr5 * beta); + outptr5++; + } + } + inptr += 48; + } + else + { + /* Optimized routine to copy an entire block */ + __asm __volatile( + // Rows 0-1 + "VLD1.32 {d8-d11}, [%[outptr0]]\n" + "VMUL.f32 q4, q4, %q[bv]\n" + "VLD1.32 {d12-d15}, [%[outptr1]]\n" + "VMUL.f32 q5, q5, %q[bv]\n" + "VLD1.32 {d0-d3}, [%[inptr]]!\n" + "VMUL.f32 q6, q6, %q[bv]\n" + "VLD1.32 {d4-d7}, [%[inptr]]!\n" + "VMUL.f32 q7, q7, %q[bv]\n" + + "VMLA.f32 q4, q0, %q[av]\n" ASM_PREFETCH("[%[inptr], #352]") + "VMLA.f32 q5, q1, %q[av]\n" + "VST1.32 {d8-d11}, [%[outptr0]]!\n" ASM_PREFETCH("[%[inptr], #416]") "VMLA.f32 q6, q2, %q[av]\n" ASM_PREFETCH("[%[inptr], #480]") + "VMLA.f32 q7, q3, %q[av]\n" + "VST1.32 {d12-d15}, [%[outptr1]]!\n" + + // Rows 2-3 + "VLD1.32 {d8-d11}, [%[outptr2]]\n" + "VMUL.f32 q4, q4, %q[bv]\n" + "VLD1.32 {d12-d15}, [%[outptr3]]\n" + "VMUL.f32 q5, q5, %q[bv]\n" + "VLD1.32 {d0-d3}, [%[inptr]]!\n" + "VMUL.f32 q6, q6, %q[bv]\n" + "VLD1.32 {d4-d7}, [%[inptr]]!\n" + "VMUL.f32 q7, q7, %q[bv]\n" + + "VMLA.f32 q4, q0, %q[av]\n" ASM_PREFETCH("[%[outptr0], #96]") + "VMLA.f32 q5, q1, %q[av]\n" + "VST1.32 {d8-d11}, [%[outptr2]]!\n" ASM_PREFETCH("[%[outptr1], #96]") "VMLA.f32 q6, q2, %q[av]\n" ASM_PREFETCH("[%[outptr2], #96]") + "VMLA.f32 q7, q3, %q[av]\n" + "VST1.32 {d12-d15}, [%[outptr3]]!\n" + + // Rows 4-5 + "VLD1.32 {d8-d11}, [%[outptr4]]\n" + "VMUL.f32 q4, q4, %q[bv]\n" + "VLD1.32 {d12-d15}, [%[outptr5]]\n" + "VMUL.f32 q5, q5, %q[bv]\n" + "VLD1.32 {d0-d3}, [%[inptr]]!\n" + "VMUL.f32 q6, q6, %q[bv]\n" + "VLD1.32 {d4-d7}, [%[inptr]]!\n" + "VMUL.f32 q7, q7, %q[bv]\n" + + "VMLA.f32 q4, q0, %q[av]\n" ASM_PREFETCH("[%[outptr3], #96]") + "VMLA.f32 q5, q1, %q[av]\n" + "VST1.32 {d8-d11}, [%[outptr4]]!\n" ASM_PREFETCH("[%[outptr4], #96]") "VMLA.f32 q6, q2, %q[av]\n" ASM_PREFETCH("[%[outptr5], #128]") + "VMLA.f32 q7, q3, %q[av]\n" + "VST1.32 {d12-d15}, [%[outptr5]]!\n" + : [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3), + [outptr4] "+r"(outptr4), [outptr5] "+r"(outptr5), [inptr] "+r"(inptr) + : [av] "w"(av), [bv] "w"(bv) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); + } + } + } +} + +#endif // __arm__ diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_12x8.hpp new file mode 100644 index 0000000000..3b59a43c52 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_12x8.hpp @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +template <> +inline void MergeResults<12, 8>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta) +{ + const float *inptr = in; + prefetch_6x(inptr); + prefetch_6x(inptr + 96); + + float32x4_t av = vdupq_n_f32(alpha); + float32x4_t bv = vdupq_n_f32(beta); + + for(int y = y0; y < ymax; y += 8) + { + float *outptr0 = out + (y * ldout) + x0; + float *outptr1 = outptr0 + ldout; + float *outptr2 = outptr1 + ldout; + float *outptr3 = outptr2 + ldout; + float *outptr4 = outptr3 + ldout; + float *outptr5 = outptr4 + ldout; + float *outptr6 = outptr5 + ldout; + float *outptr7 = outptr6 + ldout; + + prefetch_2x(outptr0); + prefetch_2x(outptr1); + prefetch_2x(outptr2); + prefetch_2x(outptr3); + prefetch_2x(outptr4); + prefetch_2x(outptr5); + prefetch_2x(outptr6); + prefetch_2x(outptr7); + + for(int i = x0; i < xmax; i += 12) + { + float dummyres[12]; + + /* Make sure we throw away results if Y isn't a multiple of 8. + * We do this by pointing the result pointer at a dummy buffer + * we later discard. */ + if((y + 7) >= ymax) + { + switch((y + 7) - ymax) + { + case 6: + outptr1 = dummyres; + case 5: + outptr2 = dummyres; + case 4: + outptr3 = dummyres; + case 3: + outptr4 = dummyres; + case 2: + outptr5 = dummyres; + case 1: + outptr6 = dummyres; + case 0: + outptr7 = dummyres; + break; + + default: + UNREACHABLE("Impossible."); + } + } + + /* For ragged X, manually copy over the valid results. 
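The switch above is the trick all of these merges use for a ragged M edge: any row pointer that would run past ymax is redirected to a small dummy buffer on the stack, so the fixed eight-row store pattern can run unchanged and the surplus rows are silently discarded. A compact sketch of the same idea, with illustrative names that are not taken from the patch:

// Point out-of-range row pointers at scratch storage so a fixed-height
// store sequence never writes outside the real output.
template <typename T, int HEIGHT, int WIDTH>
static void guard_rows(T *(&rowptr)[HEIGHT], int y, int ymax, T (&scratch)[WIDTH])
{
    for(int r = 0; r < HEIGHT; r++)
    {
        if((y + r) >= ymax)
        {
            rowptr[r] = scratch; // writes land here and are thrown away
        }
    }
}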
*/ + if((i + 11) >= xmax) + { + for(int xi = 0; xi < 12; xi++) + { + if((i + xi) < xmax) + { + *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta); + outptr0++; + *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta); + outptr1++; + *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta); + outptr2++; + *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta); + outptr3++; + *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta); + outptr4++; + *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta); + outptr5++; + *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta); + outptr6++; + *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta); + outptr7++; + } + } + inptr += 96; + } + else + { + /* Optimized routine to copy an entire block */ + __asm __volatile( + // Rows 0-1 + "LDP q16, q17, [%[outptr0]]\n" + "FMUL v16.4s, v16.4s, %[bv].4s\n" + "LDR q18, [%[outptr0], #32]\n" + "FMUL v17.4s, v17.4s, %[bv].4s\n" + "LDP q19, q20, [%[outptr1]]\n" + "FMUL v18.4s, v18.4s, %[bv].4s\n" + "LDR q21, [%[outptr1], #32]\n" ASM_PREFETCH("[%[inptr], #768]") + "FMUL v19.4s, v19.4s, %[bv].4s\n" + "LDP q0, q1, [%[inptr]]\n" + "FMUL v20.4s, v20.4s, %[bv].4s\n" + "LDP q2, q3, [%[inptr], #32]\n" + "FMUL v21.4s, v21.4s, %[bv].4s\n" + "LDP q4, q5, [%[inptr], #64]\n" + "FMLA v16.4s, v0.4s, %[av].4s\n" ASM_PREFETCH("[%[inptr], #832]") + "FMLA v17.4s, v1.4s, %[av].4s\n" + "STP q16, q17, [%[outptr0]], #32\n" + "FMLA v18.4s, v2.4s, %[av].4s\n" + "STR q18, [%[outptr0]], #16\n" + "FMLA v19.4s, v3.4s, %[av].4s\n" ASM_PREFETCH("[%[inptr], #896]") + "FMLA v20.4s, v4.4s, %[av].4s\n" + "STP q19, q20, [%[outptr1]], #32\n" + "FMLA v21.4s, v5.4s, %[av].4s\n" + "STR q21, [%[outptr1]], #16\n" + + // Rows 2-3 + "LDP q16, q17, [%[outptr2]]\n" + "FMUL v16.4s, v16.4s, %[bv].4s\n" + "LDR q18, [%[outptr2], #32]\n" + "FMUL v17.4s, v17.4s, %[bv].4s\n" + "LDP q19, q20, [%[outptr3]]\n" + "FMUL v18.4s, v18.4s, %[bv].4s\n" + "LDR q21, [%[outptr3], #32]\n" ASM_PREFETCH("[%[inptr], #960]") + "FMUL v19.4s, v19.4s, %[bv].4s\n" + "LDP q0, q1, [%[inptr], #96]\n" + "FMUL v20.4s, v20.4s, %[bv].4s\n" + "LDP q2, q3, [%[inptr], #128]\n" + "FMUL v21.4s, v21.4s, %[bv].4s\n" + "LDP q4, q5, [%[inptr], #160]\n" + "FMLA v16.4s, v0.4s, %[av].4s\n" ASM_PREFETCH("[%[inptr], #1024]") + "FMLA v17.4s, v1.4s, %[av].4s\n" + "STP q16, q17, [%[outptr2]], #32\n" + "FMLA v18.4s, v2.4s, %[av].4s\n" + "STR q18, [%[outptr2]], #16\n" + "FMLA v19.4s, v3.4s, %[av].4s\n" ASM_PREFETCH("[%[inptr], #1088]") + "FMLA v20.4s, v4.4s, %[av].4s\n" + "STP q19, q20, [%[outptr3]], #32\n" + "FMLA v21.4s, v5.4s, %[av].4s\n" + "STR q21, [%[outptr3]], #16\n" + + // Rows 4-5 + ASM_PREFETCH("[%[outptr0], #80]") + "LDP q16, q17, [%[outptr4]]\n" + "FMUL v16.4s, v16.4s, %[bv].4s\n" + "LDR q18, [%[outptr4], #32]\n" + "FMUL v17.4s, v17.4s, %[bv].4s\n" + "LDP q19, q20, [%[outptr5]]\n" + "FMUL v18.4s, v18.4s, %[bv].4s\n" + "LDR q21, [%[outptr5], #32]\n" ASM_PREFETCH("[%[outptr1], #80]") + "FMUL v19.4s, v19.4s, %[bv].4s\n" + "LDP q0, q1, [%[inptr], #192]\n" + "FMUL v20.4s, v20.4s, %[bv].4s\n" + "LDP q2, q3, [%[inptr], #224]\n" + "FMUL v21.4s, v21.4s, %[bv].4s\n" + "LDP q4, q5, [%[inptr], #256]\n" + "FMLA v16.4s, v0.4s, %[av].4s\n" ASM_PREFETCH("[%[outptr2], #80]") + "FMLA v17.4s, v1.4s, %[av].4s\n" + "STP q16, q17, [%[outptr4]], #32\n" + "FMLA v18.4s, v2.4s, %[av].4s\n" + "STR q18, [%[outptr4]], #16\n" + "FMLA v19.4s, v3.4s, %[av].4s\n" ASM_PREFETCH("[%[outptr3], #80]") + "FMLA v20.4s, v4.4s, %[av].4s\n" + "STP q19, q20, [%[outptr5]], #32\n" + "FMLA v21.4s, v5.4s, %[av].4s\n" + "STR q21, [%[outptr5]], 
#16\n" + + // Rows 6-7 + ASM_PREFETCH("[%[outptr4], #80]") + "LDP q16, q17, [%[outptr6]]\n" + "FMUL v16.4s, v16.4s, %[bv].4s\n" + "LDR q18, [%[outptr6], #32]\n" + "FMUL v17.4s, v17.4s, %[bv].4s\n" + "LDP q19, q20, [%[outptr7]]\n" + "FMUL v18.4s, v18.4s, %[bv].4s\n" + "LDR q21, [%[outptr7], #32]\n" ASM_PREFETCH("[%[outptr5], #80]") + "FMUL v19.4s, v19.4s, %[bv].4s\n" + "LDP q0, q1, [%[inptr], #288]\n" + "FMUL v20.4s, v20.4s, %[bv].4s\n" + "LDP q2, q3, [%[inptr], #320]\n" + "FMUL v21.4s, v21.4s, %[bv].4s\n" + "LDP q4, q5, [%[inptr], #352]\n" + "FMLA v16.4s, v0.4s, %[av].4s\n" ASM_PREFETCH("[%[outptr6], #128]") + "FMLA v17.4s, v1.4s, %[av].4s\n" + "STP q16, q17, [%[outptr6]], #32\n" + "FMLA v18.4s, v2.4s, %[av].4s\n" + "STR q18, [%[outptr6]], #16\n" + "FMLA v19.4s, v3.4s, %[av].4s\n" ASM_PREFETCH("[%[outptr7], #128]") + "FMLA v20.4s, v4.4s, %[av].4s\n" + "STP q19, q20, [%[outptr7]], #32\n" + "FMLA v21.4s, v5.4s, %[av].4s\n" + "STR q21, [%[outptr7]], #16\n" + "ADD %[inptr], %[inptr], #384\n" + : [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3), + [outptr4] "+r"(outptr4), [outptr5] "+r"(outptr5), [outptr6] "+r"(outptr6), [outptr7] "+r"(outptr7), + [inptr] "+r"(inptr) + : [av] "w"(av), [bv] "w"(bv) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21"); + } + } + } +} + +#endif // __aarch64__ \ No newline at end of file diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_to_half_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_to_half_12x8.hpp new file mode 100644 index 0000000000..12a090112d --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_to_half_12x8.hpp @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#pragma once + +#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) + +#include + +template <> +inline void MergeResults<12, 8>(__fp16 *out, const float *in, int ldout, int y0, int ymax, int x0, int xmax, const __fp16 alpha, const __fp16 beta) +{ + const float *inptr = in; + prefetch_6x(inptr); + prefetch_6x(inptr + 24); + + float32x4_t av = vdupq_n_f32(alpha); + float32x4_t bv = vdupq_n_f32(beta); + + for(int y = y0; y < ymax; y += 8) + { + __fp16 *outptr0 = out + (y * ldout) + x0; + __fp16 *outptr1 = outptr0 + ldout; + __fp16 *outptr2 = outptr1 + ldout; + __fp16 *outptr3 = outptr2 + ldout; + __fp16 *outptr4 = outptr3 + ldout; + __fp16 *outptr5 = outptr4 + ldout; + __fp16 *outptr6 = outptr5 + ldout; + __fp16 *outptr7 = outptr6 + ldout; + + prefetch_2x(outptr0); + prefetch_2x(outptr1); + prefetch_2x(outptr2); + prefetch_2x(outptr3); + prefetch_2x(outptr4); + prefetch_2x(outptr5); + prefetch_2x(outptr6); + prefetch_2x(outptr7); + + for(int i = x0; i < xmax; i += 12) + { + __fp16 dummyres[12]; + + /* Make sure we throw away results if Y isn't a multiple of 8. + * We do this by pointing the result pointer at a dummy buffer + * we later discard. */ + if((y + 7) >= ymax) + { + switch((y + 7) - ymax) + { + case 6: + outptr1 = dummyres; + case 5: + outptr2 = dummyres; + case 4: + outptr3 = dummyres; + case 3: + outptr4 = dummyres; + case 2: + outptr5 = dummyres; + case 1: + outptr6 = dummyres; + case 0: + outptr7 = dummyres; + break; + + default: + UNREACHABLE("Impossible."); + } + } + + /* For ragged X, manually copy over the valid results. */ + if((i + 11) >= xmax) + { + for(int xi = 0; xi < 12; xi++) + { + if((i + xi) < xmax) + { + *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta); + outptr0++; + *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta); + outptr1++; + *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta); + outptr2++; + *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta); + outptr3++; + *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta); + outptr4++; + *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta); + outptr5++; + *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta); + outptr6++; + *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta); + outptr7++; + } + } + inptr += 96; + } + else + { + /* Optimized routine to copy an entire block */ + __asm __volatile( + // Rows 0-1 + "LDR q16, [%[outptr0]]\n" + "FCVTL2 v17.4s, v16.8h\n" + "LDR d18, [%[outptr0], #16]\n" + "FCVTL v16.4s, v16.4h\n" + "LDR q19, [%[outptr1]]\n" + "FMUL v17.4s, v17.4s, %[bv].4s\n" + "LDR d21, [%[outptr1], #16]\n" + "FMUL v16.4s, v16.4s, %[bv].4s\n" + "LDP q0, q1, [%[inptr]]\n" + "FCVTL v18.4s, v18.4h\n" + "LDP q2, q3, [%[inptr], #32]\n" + "FCVTL2 v20.4s, v19.8h\n" + "LDP q4, q5, [%[inptr], #64]\n" + "FCVTL v19.4s, v19.4h\n" ASM_PREFETCH("[%[inptr], #768]") "FCVTL v21.4s, v21.4h\n" ASM_PREFETCH("[%[inptr], #832]") "FMUL v18.4s, v18.4s, %[bv].4s\n" ASM_PREFETCH("[%[inptr], #896]") + "FMUL v20.4s, v20.4s, %[bv].4s\n" ASM_PREFETCH("[%[inptr], #960]") + "FMUL v19.4s, v19.4s, %[bv].4s\n" + "FMUL v21.4s, v21.4s, %[bv].4s\n" + "FMLA v16.4s, v0.4s, %[av].4s\n" + "FMLA v17.4s, v1.4s, %[av].4s\n" + "FCVTN v16.4h, v16.4s\n" + "FCVTN2 v16.8h, v17.4s\n" + "FMLA v18.4s, v2.4s, %[av].4s\n" + "STR q16, [%[outptr0]], #16\n" + "FCVTN v18.4h, v18.4s\n" + "STR d18, [%[outptr0]], #8\n" + "FMLA v19.4s, v3.4s, %[av].4s\n" + "FMLA v20.4s, v4.4s, %[av].4s\n" + "FCVTN v19.4h, v19.4s\n" + "FCVTN2 v19.8h, v20.4s\n" + "STR q19, [%[outptr1]], #16\n" + "FMLA v21.4s, v5.4s, %[av].4s\n" + 
"FCVTN v21.4h, v21.4s\n" + "STR d21, [%[outptr1]], #8\n" + + // Rows 2-3 + "LDR q16, [%[outptr2]]\n" + "FCVTL2 v17.4s, v16.8h\n" + "LDR d18, [%[outptr2], #16]\n" + "FCVTL v16.4s, v16.4h\n" + "LDR q19, [%[outptr3]]\n" + "FMUL v17.4s, v17.4s, %[bv].4s\n" + "LDR d21, [%[outptr3], #16]\n" + "FMUL v16.4s, v16.4s, %[bv].4s\n" + "LDP q0, q1, [%[inptr], #96]\n" + "FCVTL v18.4s, v18.4h\n" + "LDP q2, q3, [%[inptr], #128]\n" + "FCVTL2 v20.4s, v19.8h\n" + "LDP q4, q5, [%[inptr], #160]\n" + "FCVTL v19.4s, v19.4h\n" ASM_PREFETCH("[%[inptr], #1024]") "FCVTL v21.4s, v21.4h\n" ASM_PREFETCH("[%[inptr], #1088]") "FMUL v18.4s, v18.4s, %[bv].4s\n" ASM_PREFETCH("[%[outptr0], #64]") + "FMUL v20.4s, v20.4s, %[bv].4s\n" ASM_PREFETCH("[%[outptr1], #64]") + "FMUL v19.4s, v19.4s, %[bv].4s\n" + "FMUL v21.4s, v21.4s, %[bv].4s\n" + "FMLA v16.4s, v0.4s, %[av].4s\n" + "FMLA v17.4s, v1.4s, %[av].4s\n" + "FCVTN v16.4h, v16.4s\n" + "FCVTN2 v16.8h, v17.4s\n" + "FMLA v18.4s, v2.4s, %[av].4s\n" + "STR q16, [%[outptr2]], #16\n" + "FCVTN v18.4h, v18.4s\n" + "STR d18, [%[outptr2]], #8\n" + "FMLA v19.4s, v3.4s, %[av].4s\n" + "FMLA v20.4s, v4.4s, %[av].4s\n" + "FCVTN v19.4h, v19.4s\n" + "FCVTN2 v19.8h, v20.4s\n" + "STR q19, [%[outptr3]], #16\n" + "FMLA v21.4s, v5.4s, %[av].4s\n" + "FCVTN v21.4h, v21.4s\n" + "STR d21, [%[outptr3]], #8\n" + + // Rows 4-5 + "LDR q16, [%[outptr4]]\n" + "FCVTL2 v17.4s, v16.8h\n" + "LDR d18, [%[outptr4], #16]\n" + "FCVTL v16.4s, v16.4h\n" + "LDR q19, [%[outptr5]]\n" + "FMUL v17.4s, v17.4s, %[bv].4s\n" + "LDR d21, [%[outptr5], #16]\n" + "FMUL v16.4s, v16.4s, %[bv].4s\n" + "LDP q0, q1, [%[inptr], #192]\n" + "FCVTL v18.4s, v18.4h\n" + "LDP q2, q3, [%[inptr], #224]\n" + "FCVTL2 v20.4s, v19.8h\n" + "LDP q4, q5, [%[inptr], #256]\n" + "FCVTL v19.4s, v19.4h\n" ASM_PREFETCH("[%[outptr2], #64]") "FCVTL v21.4s, v21.4h\n" ASM_PREFETCH("[%[outptr3], #64]") "FMUL v18.4s, v18.4s, %[bv].4s\n" ASM_PREFETCH("[%[outptr4], #88]") + "FMUL v20.4s, v20.4s, %[bv].4s\n" + "FMUL v19.4s, v19.4s, %[bv].4s\n" + "FMUL v21.4s, v21.4s, %[bv].4s\n" + "FMLA v16.4s, v0.4s, %[av].4s\n" + "FMLA v17.4s, v1.4s, %[av].4s\n" + "FCVTN v16.4h, v16.4s\n" + "FCVTN2 v16.8h, v17.4s\n" + "FMLA v18.4s, v2.4s, %[av].4s\n" + "STR q16, [%[outptr4]], #16\n" + "FCVTN v18.4h, v18.4s\n" + "STR d18, [%[outptr4]], #8\n" + "FMLA v19.4s, v3.4s, %[av].4s\n" + "FMLA v20.4s, v4.4s, %[av].4s\n" + "FCVTN v19.4h, v19.4s\n" + "FCVTN2 v19.8h, v20.4s\n" + "STR q19, [%[outptr5]], #16\n" + "FMLA v21.4s, v5.4s, %[av].4s\n" + "FCVTN v21.4h, v21.4s\n" + "STR d21, [%[outptr5]], #8\n" + + // Rows 6-7 + "LDR q16, [%[outptr6]]\n" + "FCVTL2 v17.4s, v16.8h\n" + "LDR d18, [%[outptr6], #16]\n" + "FCVTL v16.4s, v16.4h\n" + "LDR q19, [%[outptr7]]\n" + "FMUL v17.4s, v17.4s, %[bv].4s\n" + "LDR d21, [%[outptr7], #16]\n" + "FMUL v16.4s, v16.4s, %[bv].4s\n" + "LDP q0, q1, [%[inptr], #288]\n" + "FCVTL v18.4s, v18.4h\n" + "LDP q2, q3, [%[inptr], #320]\n" + "FCVTL2 v20.4s, v19.8h\n" + "LDP q4, q5, [%[inptr], #352]\n" + "FCVTL v19.4s, v19.4h\n" ASM_PREFETCH("[%[outptr5], #64]") "FCVTL v21.4s, v21.4h\n" ASM_PREFETCH("[%[outptr6], #88]") "FMUL v18.4s, v18.4s, %[bv].4s\n" ASM_PREFETCH("[%[outptr7], #88]") + "FMUL v20.4s, v20.4s, %[bv].4s\n" + "FMUL v19.4s, v19.4s, %[bv].4s\n" + "FMUL v21.4s, v21.4s, %[bv].4s\n" + "FMLA v16.4s, v0.4s, %[av].4s\n" + "FMLA v17.4s, v1.4s, %[av].4s\n" + "FCVTN v16.4h, v16.4s\n" + "FCVTN2 v16.8h, v17.4s\n" + "FMLA v18.4s, v2.4s, %[av].4s\n" + "STR q16, [%[outptr6]], #16\n" + "FCVTN v18.4h, v18.4s\n" + "STR d18, [%[outptr6]], #8\n" + "FMLA v19.4s, v3.4s, %[av].4s\n" + 
"FMLA v20.4s, v4.4s, %[av].4s\n" + "FCVTN v19.4h, v19.4s\n" + "FCVTN2 v19.8h, v20.4s\n" + "STR q19, [%[outptr7]], #16\n" + "FMLA v21.4s, v5.4s, %[av].4s\n" + "FCVTN v21.4h, v21.4s\n" + "STR d21, [%[outptr7]], #8\n" + "ADD %[inptr], %[inptr], #384\n" + : [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3), + [outptr4] "+r"(outptr4), [outptr5] "+r"(outptr5), [outptr6] "+r"(outptr6), [outptr7] "+r"(outptr7), + [inptr] "+r"(inptr) + : [av] "w"(av), [bv] "w"(bv) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21"); + } + } + } +} + +#endif // __aarch64__ && __ARM_FEATURE_FP16_SCALAR_ARITHMETIC diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_half_24x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_half_24x8.hpp new file mode 100644 index 0000000000..08cfc00523 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_half_24x8.hpp @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) + +template <> +inline void MergeResults<24, 8>(__fp16 *out, const __fp16 *in, const int ldout, const int y0, const int ymax, + const int x0, const int xmax, const __fp16 alpha, const __fp16 beta) +{ + const __fp16 *inptr = in; + prefetch_6x(inptr); + prefetch_6x(inptr + 48); + + float16x8_t va = vdupq_n_f16(alpha); + float16x8_t vb = vdupq_n_f16(beta); + + for(int y = y0; y < ymax; y += 8) + { + __fp16 *outptr0 = out + (y * ldout) + x0; + __fp16 *outptr1 = outptr0 + ldout; + __fp16 *outptr2 = outptr1 + ldout; + __fp16 *outptr3 = outptr2 + ldout; + __fp16 *outptr4 = outptr3 + ldout; + __fp16 *outptr5 = outptr4 + ldout; + __fp16 *outptr6 = outptr5 + ldout; + __fp16 *outptr7 = outptr6 + ldout; + + prefetch_2x(outptr0); + prefetch_2x(outptr1); + prefetch_2x(outptr2); + prefetch_2x(outptr3); + prefetch_2x(outptr4); + prefetch_2x(outptr5); + prefetch_2x(outptr6); + prefetch_2x(outptr7); + + for(int i = x0; i < xmax; i += 24) + { + __fp16 dummyres[24]; + + /* Make sure we throw away results if Y isn't a multiple of 8. + * We do this by pointing the result pointer at a dummy buffer + * we later discard. 
*/ + if((y + 7) >= ymax) + { + switch((y + 7) - ymax) + { + case 6: + outptr1 = dummyres; + case 5: + outptr2 = dummyres; + case 4: + outptr3 = dummyres; + case 3: + outptr4 = dummyres; + case 2: + outptr5 = dummyres; + case 1: + outptr6 = dummyres; + case 0: + outptr7 = dummyres; + break; + + default: + UNREACHABLE("Impossible."); + } + } + + /* For ragged X, manually copy over the valid results. */ + if((i + 23) >= xmax) + { + for(int xi = 0; xi < 24; xi++) + { + if((i + xi) < xmax) + { + *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta); + outptr0++; + *outptr1 = (alpha * inptr[xi + 24]) + (*outptr1 * beta); + outptr1++; + *outptr2 = (alpha * inptr[xi + 48]) + (*outptr2 * beta); + outptr2++; + *outptr3 = (alpha * inptr[xi + 72]) + (*outptr3 * beta); + outptr3++; + *outptr4 = (alpha * inptr[xi + 96]) + (*outptr4 * beta); + outptr4++; + *outptr5 = (alpha * inptr[xi + 120]) + (*outptr5 * beta); + outptr5++; + *outptr6 = (alpha * inptr[xi + 144]) + (*outptr6 * beta); + outptr6++; + *outptr7 = (alpha * inptr[xi + 168]) + (*outptr7 * beta); + outptr7++; + } + } + inptr += 192; + } + else + { + /* Optimized routine to copy an entire block */ + __asm __volatile( + ".arch armv8.2-a+fp16\n" + // Rows 0-1 + "LDP q16, q17, [%[outptr0]]\n" + "FMUL v16.8h, v16.8h, %[vb].8h\n" + "LDR q18, [%[outptr0], #32]\n" + "FMUL v17.8h, v17.8h, %[vb].8h\n" + "LDP q19, q20, [%[outptr1]]\n" + "FMUL v18.8h, v18.8h, %[vb].8h\n" ASM_PREFETCH("[%[inptr], #768]") + "LDR q21, [%[outptr1], #32]\n" + "FMUL v19.8h, v19.8h, %[vb].8h\n" + "LDP q0, q1, [%[inptr]]\n" + "FMUL v20.8h, v20.8h, %[vb].8h\n" + "LDP q2, q3, [%[inptr], #32]\n" + "FMUL v21.8h, v21.8h, %[vb].8h\n" + "LDP q4, q5, [%[inptr], #64]\n" + "FMLA v16.8h, v0.8h, %[va].8h\n" ASM_PREFETCH("[%[inptr], #832]") + "FMLA v17.8h, v1.8h, %[va].8h\n" + "STP q16, q17, [%[outptr0]], #32\n" + "FMLA v18.8h, v2.8h, %[va].8h\n" + "STR q18, [%[outptr0]], #16\n" + "FMLA v19.8h, v3.8h, %[va].8h\n" ASM_PREFETCH("[%[inptr], #896]") + "FMLA v20.8h, v4.8h, %[va].8h\n" + "STP q19, q20, [%[outptr1]], #32\n" + "FMLA v21.8h, v5.8h, %[va].8h\n" + "STR q21, [%[outptr1]], #16\n" ASM_PREFETCH("[%[inptr], #960]") + + // Rows 2-3 + "LDP q16, q17, [%[outptr2]]\n" + "FMUL v16.8h, v16.8h, %[vb].8h\n" + "LDR q18, [%[outptr2], #32]\n" + "FMUL v17.8h, v17.8h, %[vb].8h\n" + "LDP q19, q20, [%[outptr3]]\n" + "FMUL v18.8h, v18.8h, %[vb].8h\n" ASM_PREFETCH("[%[inptr], #1024]") + "LDR q21, [%[outptr3], #32]\n" + "FMUL v19.8h, v19.8h, %[vb].8h\n" + "LDP q0, q1, [%[inptr], #96]\n" + "FMUL v20.8h, v20.8h, %[vb].8h\n" + "LDP q2, q3, [%[inptr], #128]\n" + "FMUL v21.8h, v21.8h, %[vb].8h\n" + "LDP q4, q5, [%[inptr], #160]\n" + "FMLA v16.8h, v0.8h, %[va].8h\n" ASM_PREFETCH("[%[inptr], #1088]") + "FMLA v17.8h, v1.8h, %[va].8h\n" + "STP q16, q17, [%[outptr2]], #32\n" + "FMLA v18.8h, v2.8h, %[va].8h\n" + "STR q18, [%[outptr2]], #16\n" + "FMLA v19.8h, v3.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr0], #80]") + "FMLA v20.8h, v4.8h, %[va].8h\n" + "STP q19, q20, [%[outptr3]], #32\n" + "FMLA v21.8h, v5.8h, %[va].8h\n" + "STR q21, [%[outptr3]], #16\n" ASM_PREFETCH("[%[outptr1], #80]") + + // Rows 4-5 + "LDP q16, q17, [%[outptr4]]\n" + "FMUL v16.8h, v16.8h, %[vb].8h\n" + "LDR q18, [%[outptr4], #32]\n" + "FMUL v17.8h, v17.8h, %[vb].8h\n" + "LDP q19, q20, [%[outptr5]]\n" + "FMUL v18.8h, v18.8h, %[vb].8h\n" ASM_PREFETCH("[%[outptr2], #80]") + "LDR q21, [%[outptr5], #32]\n" + "FMUL v19.8h, v19.8h, %[vb].8h\n" + "LDP q0, q1, [%[inptr], #192]\n" + "FMUL v20.8h, v20.8h, %[vb].8h\n" + "LDP q2, q3, [%[inptr], #224]\n" + "FMUL v21.8h, 
v21.8h, %[vb].8h\n" + "LDP q4, q5, [%[inptr], #256]\n" + "FMLA v16.8h, v0.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr3], #80]") + "FMLA v17.8h, v1.8h, %[va].8h\n" + "STP q16, q17, [%[outptr4]], #32\n" + "FMLA v18.8h, v2.8h, %[va].8h\n" + "STR q18, [%[outptr4]], #16\n" + "FMLA v19.8h, v3.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr4], #80]") + "FMLA v20.8h, v4.8h, %[va].8h\n" + "STP q19, q20, [%[outptr5]], #32\n" + "FMLA v21.8h, v5.8h, %[va].8h\n" + "STR q21, [%[outptr5]], #16\n" + + // Rows 6-7 + "LDP q16, q17, [%[outptr6]]\n" + "FMUL v16.8h, v16.8h, %[vb].8h\n" + "LDR q18, [%[outptr6], #32]\n" + "FMUL v17.8h, v17.8h, %[vb].8h\n" + "LDP q19, q20, [%[outptr7]]\n" ASM_PREFETCH("[%[outptr5], #80]") + "FMUL v18.8h, v18.8h, %[vb].8h\n" + "LDR q21, [%[outptr7], #32]\n" + "FMUL v19.8h, v19.8h, %[vb].8h\n" + "LDP q0, q1, [%[inptr], #288]\n" + "FMUL v20.8h, v20.8h, %[vb].8h\n" + "LDP q2, q3, [%[inptr], #320]\n" + "FMUL v21.8h, v21.8h, %[vb].8h\n" + "LDP q4, q5, [%[inptr], #352]\n" + "FMLA v16.8h, v0.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr6], #128]") + "FMLA v17.8h, v1.8h, %[va].8h\n" + "STP q16, q17, [%[outptr6]], #32\n" + "FMLA v18.8h, v2.8h, %[va].8h\n" + "STR q18, [%[outptr6]], #16\n" + "FMLA v19.8h, v3.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr7], #128]") + "FMLA v20.8h, v4.8h, %[va].8h\n" + "STP q19, q20, [%[outptr7]], #32\n" + "FMLA v21.8h, v5.8h, %[va].8h\n" + "STR q21, [%[outptr7]], #16\n" + "ADD %[inptr], %[inptr], #384\n" + : [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3), + [outptr4] "+r"(outptr4), [outptr5] "+r"(outptr5), [outptr6] "+r"(outptr6), [outptr7] "+r"(outptr7), + [inptr] "+r"(inptr) + : [va] "w"(va), [vb] "w"(vb) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21"); + } + } + } +} + +#endif // __aarch64__ && __ARM_FEATURE_FP16_SCALAR_ARITHMETIC diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp new file mode 100644 index 0000000000..dc247aad37 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#pragma once + +#ifdef __aarch64__ + +template <> +inline void MergeResults<12, 8>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t alpha, const int32_t beta) +{ + const int32_t *inptr = in; + prefetch_6x(inptr); + prefetch_6x(inptr + 96); + + int32x4_t alpha_value = vdupq_n_s32(alpha); + int32x4_t beta_value = vdupq_n_s32(beta); + + for(int y = y0; y < ymax; y += 8) + { + int32_t *outptr0 = out + (y * ldout) + x0; + int32_t *outptr1 = outptr0 + ldout; + int32_t *outptr2 = outptr1 + ldout; + int32_t *outptr3 = outptr2 + ldout; + int32_t *outptr4 = outptr3 + ldout; + int32_t *outptr5 = outptr4 + ldout; + int32_t *outptr6 = outptr5 + ldout; + int32_t *outptr7 = outptr6 + ldout; + + prefetch_2x(outptr0); + prefetch_2x(outptr1); + prefetch_2x(outptr2); + prefetch_2x(outptr3); + prefetch_2x(outptr4); + prefetch_2x(outptr5); + prefetch_2x(outptr6); + prefetch_2x(outptr7); + + for(int i = x0; i < xmax; i += 12) + { + int32_t dummyres[12]; + + /* Make sure we throw away results if Y isn't a multiple of 8. + * We do this by pointing the result pointer at a dummy buffer + * we later discard. */ + if((y + 7) >= ymax) + { + switch((y + 7) - ymax) + { + case 6: + outptr1 = dummyres; + case 5: + outptr2 = dummyres; + case 4: + outptr3 = dummyres; + case 3: + outptr4 = dummyres; + case 2: + outptr5 = dummyres; + case 1: + outptr6 = dummyres; + case 0: + outptr7 = dummyres; + break; + + default: + UNREACHABLE("Impossible."); + } + } + + /* For ragged X, manually copy over the valid results. */ + if((i + 11) >= xmax) + { + for(int xi = 0; xi < 12; xi++) + { + if((i + xi) < xmax) + { + *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta); + outptr0++; + *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta); + outptr1++; + *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta); + outptr2++; + *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta); + outptr3++; + *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta); + outptr4++; + *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta); + outptr5++; + *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta); + outptr6++; + *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta); + outptr7++; + } + } + inptr += 96; + } + else + { + /* Optimized routine to copy an entire block */ + __asm __volatile( + // Row 0 + ASM_PREFETCH("[%x[outptr1], #192]") + "ldr q3, [%x[outptr0]]\n" + "ldr q4, [%x[outptr0], #0x10]\n" + "ldr q5, [%x[outptr0], #0x20]\n" + "mul v3.4s, v3.4s, %[alpha_value].4s\n" + "ldr q6, [%x[inptr]]\n" + "mul v4.4s, v4.4s, %[alpha_value].4s\n" + "ldr q7, [%x[inptr], #0x10]\n" + "mul v5.4s, v5.4s, %[alpha_value].4s\n" + "ldr q8, [%x[inptr], #0x20]\n" + "mla v3.4s, v6.4s, %[beta_value].4s\n" + "ldr q0, [%x[outptr1]]\n" + "mla v4.4s, v7.4s, %[beta_value].4s\n" + "ldr q1, [%x[outptr1], #0x10]\n" + "mla v5.4s, v8.4s, %[beta_value].4s\n" + "ldr q2, [%x[outptr1], #0x20]\n" + + // Row 1 + ASM_PREFETCH("[%x[outptr2], #192]") + "mul v0.4s, v0.4s, %[alpha_value].4s\n" + "ldr q6, [%x[inptr], #0x30]\n" + "str q3, [%x[outptr0]], #0x10\n" + "mul v1.4s, v1.4s, %[alpha_value].4s\n" + "ldr q7, [%x[inptr], #0x40]\n" + "str q4, [%x[outptr0]], #0x10\n" + "mul v2.4s, v2.4s, %[alpha_value].4s\n" + "ldr q8, [%x[inptr], #0x50]\n" + "str q5, [%x[outptr0]], #0x10\n" + "mla v0.4s, v6.4s, %[beta_value].4s\n" + "ldr q3, [%x[outptr2]]\n" + "mla v1.4s, v7.4s, %[beta_value].4s\n" + "ldr q4, [%x[outptr2], #0x10]\n" + "mla v2.4s, v8.4s, %[beta_value].4s\n" + "ldr q5, [%x[outptr2], #0x20]\n" + + // Row 2 
+ ASM_PREFETCH("[%x[outptr3], #192]") + "mul v3.4s, v3.4s, %[alpha_value].4s\n" + "ldr q6, [%x[inptr], #0x60]\n" + "str q0, [%x[outptr1]], #0x10\n" + "mul v4.4s, v4.4s, %[alpha_value].4s\n" + "ldr q7, [%x[inptr], #0x70]\n" + "str q1, [%x[outptr1]], #0x10\n" + "mul v5.4s, v5.4s, %[alpha_value].4s\n" + "ldr q8, [%x[inptr], #0x80]\n" + "str q2, [%x[outptr1]], #0x10\n" + "mla v3.4s, v6.4s, %[beta_value].4s\n" + "ldr q0, [%x[outptr3]]\n" + "mla v4.4s, v7.4s, %[beta_value].4s\n" + "ldr q1, [%x[outptr3], #0x10]\n" + "mla v5.4s, v8.4s, %[beta_value].4s\n" + "ldr q2, [%x[outptr3], #0x20]\n" + + // Row 3 + ASM_PREFETCH("[%x[outptr4], #192]") + "mul v0.4s, v0.4s, %[alpha_value].4s\n" + "ldr q6, [%x[inptr], #0x90]\n" + "str q3, [%x[outptr2]], #0x10\n" + "mul v1.4s, v1.4s, %[alpha_value].4s\n" + "ldr q7, [%x[inptr], #0xa0]\n" + "str q4, [%x[outptr2]], #0x10\n" + "mul v2.4s, v2.4s, %[alpha_value].4s\n" + "ldr q8, [%x[inptr], #0xb0]\n" + "str q5, [%x[outptr2]], #0x10\n" + "mla v0.4s, v6.4s, %[beta_value].4s\n" + "ldr q3, [%x[outptr4]]\n" + "mla v1.4s, v7.4s, %[beta_value].4s\n" + "ldr q4, [%x[outptr4], #0x10]\n" + "mla v2.4s, v8.4s, %[beta_value].4s\n" + "ldr q5, [%x[outptr4], #0x20]\n" + + // Row 4 + ASM_PREFETCH("[%x[outptr5], #192]") + "mul v3.4s, v3.4s, %[alpha_value].4s\n" + "ldr q6, [%x[inptr], #0xc0]\n" + "str q0, [%x[outptr3]], #0x10\n" + "mul v4.4s, v4.4s, %[alpha_value].4s\n" + "ldr q7, [%x[inptr], #0xd0]\n" + "str q1, [%x[outptr3]], #0x10\n" + "mul v5.4s, v5.4s, %[alpha_value].4s\n" + "ldr q8, [%x[inptr], #0xe0]\n" + "str q2, [%x[outptr3]], #0x10\n" + "mla v3.4s, v6.4s, %[beta_value].4s\n" + "ldr q0, [%x[outptr5]]\n" + "mla v4.4s, v7.4s, %[beta_value].4s\n" + "ldr q1, [%x[outptr5], #0x10]\n" + "mla v5.4s, v8.4s, %[beta_value].4s\n" + "ldr q2, [%x[outptr5], #0x20]\n" + + // Row 5 + ASM_PREFETCH("[%x[outptr6], #192]") + "mul v0.4s, v0.4s, %[alpha_value].4s\n" + "ldr q6, [%x[inptr], #0xf0]\n" + "str q3, [%x[outptr4]], #0x10\n" + "mul v1.4s, v1.4s, %[alpha_value].4s\n" + "ldr q7, [%x[inptr], #0x100]\n" + "str q4, [%x[outptr4]], #0x10\n" + "mul v2.4s, v2.4s, %[alpha_value].4s\n" + "ldr q8, [%x[inptr], #0x110]\n" + "str q5, [%x[outptr4]], #0x10\n" + "mla v0.4s, v6.4s, %[beta_value].4s\n" + "ldr q3, [%x[outptr6]]\n" + "mla v1.4s, v7.4s, %[beta_value].4s\n" + "ldr q4, [%x[outptr6], #0x10]\n" + "mla v2.4s, v8.4s, %[beta_value].4s\n" + "ldr q5, [%x[outptr6], #0x20]\n" + + // Row 6 + ASM_PREFETCH("[%x[outptr7], #192]") + "mul v3.4s, v3.4s, %[alpha_value].4s\n" + "ldr q6, [%x[inptr], #0x120]\n" + "str q0, [%x[outptr5]], #0x10\n" + "mul v4.4s, v4.4s, %[alpha_value].4s\n" + "ldr q7, [%x[inptr], #0x130]\n" + "str q1, [%x[outptr5]], #0x10\n" + "mul v5.4s, v5.4s, %[alpha_value].4s\n" + "ldr q8, [%x[inptr], #0x140]\n" + "str q2, [%x[outptr5]], #0x10\n" + "mla v3.4s, v6.4s, %[beta_value].4s\n" + "ldr q0, [%x[outptr7]]\n" + "mla v4.4s, v7.4s, %[beta_value].4s\n" + "ldr q1, [%x[outptr7], #0x10]\n" + "mla v5.4s, v8.4s, %[beta_value].4s\n" + "ldr q2, [%x[outptr7], #0x20]\n" + + // Row 7 + "mul v0.4s, v0.4s, %[alpha_value].4s\n" + "ldr q6, [%x[inptr], #0x150]\n" + "str q3, [%x[outptr6]], #0x10\n" + "mul v1.4s, v1.4s, %[alpha_value].4s\n" + "ldr q7, [%x[inptr], #0x160]\n" + "str q4, [%x[outptr6]], #0x10\n" + "mul v2.4s, v2.4s, %[alpha_value].4s\n" + "ldr q8, [%x[inptr], #0x170]\n" + "str q5, [%x[outptr6]], #0x10\n" + "mla v0.4s, v6.4s, %[beta_value].4s\n" + "mla v1.4s, v7.4s, %[beta_value].4s\n" + "mla v2.4s, v8.4s, %[beta_value].4s\n" + "str q0, [%x[outptr7]], #0x10\n" + "str q1, [%x[outptr7]], #0x10\n" + "str q2, 
[%x[outptr7]], #0x10\n" + + "add %x[inptr], %x[inptr], #0x180\n" + : [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [outptr4] "+r"(outptr4), + [outptr5] "+r"(outptr5), + [outptr6] "+r"(outptr6), + [outptr7] "+r"(outptr7), + [inptr] "+r"(inptr) + : [alpha_value] "w"(alpha_value), + [beta_value] "w"(beta_value) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8"); + } + } + } +} + +template <> +inline void MergeResults<12, 8>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t alpha, const uint32_t beta) +{ + // Since the above code uses only MUL and MLA instructions discard the "unsignedness" and proceed safely. + MergeResults<12, 8>(reinterpret_cast(out), reinterpret_cast(in), ldout, y0, ymax, x0, xmax, static_cast(alpha), static_cast(beta)); +} + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/merges/list.hpp b/src/core/NEON/kernels/arm_gemm/merges/list.hpp new file mode 100644 index 0000000000..7d56e58f44 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/merges/list.hpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "a32_merge_float_8x6.hpp" +#include "a64_merge_float_12x8.hpp" +#include "a64_merge_float_to_half_12x8.hpp" +#include "a64_merge_half_24x8.hpp" +#include "a64_merge_int32_12x8.hpp" diff --git a/src/core/NEON/kernels/arm_gemm/misc.cpp b/src/core/NEON/kernels/arm_gemm/misc.cpp new file mode 100644 index 0000000000..b29cc58d5d --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/misc.cpp @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
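The uint32_t overload above can forward to the signed kernel because 32-bit MUL and MLA produce bit-identical results for signed and unsigned operands. Written out with explicit casts, the delegation looks roughly like this (a sketch assuming the MergeResults<12, 8> int32_t specialisation above is visible; the free-function name is illustrative):

#include <cstdint>

inline void merge_u32_via_s32(uint32_t *out, const uint32_t *in, int ldout,
                              int y0, int ymax, int x0, int xmax,
                              uint32_t alpha, uint32_t beta)
{
    MergeResults<12, 8>(reinterpret_cast<int32_t *>(out),
                        reinterpret_cast<const int32_t *>(in),
                        ldout, y0, ymax, x0, xmax,
                        static_cast<int32_t>(alpha),
                        static_cast<int32_t>(beta));
}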
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +unsigned int get_cpu_impl() +{ +#ifndef BARE_METAL + int fd = open("/proc/cpuinfo", 0); + char buff[3000]; + char *pos; + char *end; + int foundid = 0; + int variant = 0; + + int cpu = sched_getcpu(); + + if(!fd) + { + return 0; + } + + int charsread = read(fd, buff, 3000); + pos = buff; + end = buff + charsread; + + close(fd); + + /* So, to date I've encountered two formats for /proc/cpuinfo. + * + * One of them just lists processor : n for each processor (with no + * other info), then at the end lists part information for the current + * CPU. + * + * The other has an entire clause (including part number info) for each + * CPU in the system, with "processor : n" headers. + * + * We can cope with either of these formats by waiting to see + * "processor: n" (where n = our CPU ID), and then looking for the next + * "CPU part" field. + */ + while(pos < end) + { + if(foundid && !strncmp(pos, "CPU variant", 11)) + { + pos += 13; + char *resume = end; // Need to continue scanning after this + + for(char *ch = pos; ch < end; ch++) + { + if(*ch == '\n') + { + *ch = '\0'; + resume = ch + 1; + break; + } + } + + variant = strtoul(pos, NULL, 0); + + pos = resume; + } + + if(foundid && !strncmp(pos, "CPU part", 8)) + { + /* Found part number */ + pos += 11; + unsigned int num; + + for(char *ch = pos; ch < end; ch++) + { + if(*ch == '\n') + { + *ch = '\0'; + break; + } + } + + num = strtoul(pos, NULL, 0); + + return (num << 4) | (variant << 20); + } + + if(!strncmp(pos, "processor", 9)) + { + /* Found processor ID, see if it's ours. */ + pos += 11; + int num; + + for(char *ch = pos; ch < end; ch++) + { + if(*ch == '\n') + { + *ch = '\0'; + break; + } + } + + num = strtol(pos, NULL, 0); + + if(num == cpu) + { + foundid = 1; + } + } + + while(pos < end) + { + char ch = *pos++; + if(ch == '\n' || ch == '\0') + { + break; + } + } + } +#endif + + return 0; +} + +CPUInfo *get_CPUInfo() +{ + static CPUInfo ci; + + return &ci; +} diff --git a/src/core/NEON/kernels/arm_gemm/profiler.hpp b/src/core/NEON/kernels/arm_gemm/profiler.hpp new file mode 100644 index 0000000000..c38b0a443c --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/profiler.hpp @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
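get_cpu_impl() above packs the two fields it parses out of /proc/cpuinfo into one integer, shifting the CPU part number left by 4 and the variant left by 20. A small hedged decode of that packing; the field widths are assumptions based on the usual MIDR layout rather than anything stated in the patch:

// Assumed layout of the value returned by get_cpu_impl():
//   bits  4..15 : "CPU part"    (e.g. 0xd03 for Cortex-A53)
//   bits 20..23 : "CPU variant"
static inline unsigned int impl_part(unsigned int impl)
{
    return (impl >> 4) & 0xfff;
}

static inline unsigned int impl_variant(unsigned int impl)
{
    return (impl >> 20) & 0xf;
}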
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef CYCLE_PROFILING + +#include "../perf.h" + +#ifndef NO_MULTI_THREADING +#include +#endif + +namespace arm_gemm +{ +#ifndef NO_MULTI_THREADING +extern std::mutex report_mutex; +#endif + +class profiler +{ +private: + static const int maxevents = 100000; + unsigned long times[maxevents] = {}; + unsigned long units[maxevents] = {}; + int events[maxevents] = {}; + int currentevent = 0; + int countfd = 0; + +public: + profiler() + { + countfd = open_cycle_counter(); + } + + ~profiler() + { + close(countfd); + int tots[5]; + unsigned long counts[5]; + unsigned long tunits[5]; + const char *descs[] = { "Prepare A", "Prepare B", "Kernel", "Merge" }; + + for(int i = 1; i < 5; i++) + { + tots[i] = 0; + counts[i] = 0; + tunits[i] = 0; + } + + for(int i = 0; i < currentevent; i++) + { + // printf("%10s: %ld\n", descs[events[i]-1], times[i]); + tots[events[i]]++; + counts[events[i]] += times[i]; + tunits[events[i]] += units[i]; + } + +#ifdef NO_MULTI_THREADING + printf("Profiled events:\n"); +#else + std::lock_guard lock(report_mutex); + printf("Profiled events (cpu %d):\n", sched_getcpu()); +#endif + + printf("%20s %9s %9s %9s %12s %9s\n", "", "Events", "Total", "Average", "Bytes/MACs", "Per cycle"); + for(int i = 1; i < 5; i++) + { + printf("%20s: %9d %9ld %9ld %12lu %9.2f\n", descs[i - 1], tots[i], counts[i], counts[i] / tots[i], tunits[i], (float)tunits[i] / counts[i]); + } + } + + template + void operator()(int i, unsigned long u, T func) + { + if(currentevent == maxevents) + { + func(); + } + else + { + events[currentevent] = i; + units[currentevent] = u; + start_counter(countfd); + func(); + long long cycs = stop_counter(countfd); + times[currentevent++] = cycs; + } + } +}; + +#else + +namespace arm_gemm +{ +class profiler +{ +public: + template + void operator()(int i, unsigned long u, T func) + { + func(); + } +}; + +#endif // CYCLE_PROFILING + +} // namespace arm_gemm + +#define PROFILE_PREPA 1 +#define PROFILE_PREPB 2 +#define PROFILE_KERNEL 3 +#define PROFILE_MERGE 4 diff --git a/src/core/NEON/kernels/arm_gemm/transform.hpp b/src/core/NEON/kernels/arm_gemm/transform.hpp new file mode 100644 index 0000000000..c80bb59941 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transform.hpp @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
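The profiler above is driven through its call operator: the first argument selects one of the PROFILE_* event classes, the second is a work estimate (bytes or MACs) that feeds the per-cycle column of the report, and the functor is what actually gets timed. A possible usage sketch for a CYCLE_PROFILING build; the surrounding function and the lambda body are illustrative only:

// Time one micro-kernel invocation and attribute it to the kernel phase.
void run_kernel_profiled(arm_gemm::profiler &prof, unsigned long macs)
{
    prof(PROFILE_KERNEL, macs, [&]()
    {
        // ... invoke the GEMM micro-kernel here ...
    });
}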
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +/* + * Generic transform. + * + * Assuming the untransposed case, this works by first reading + * consecutive values from the first input row. This same number of values + * are then read from the next rows. Now return to the first + * input row and repeat. + * + * Need to cope with the work requested in either dimension not actually + * being a multiple of the block sizes. + */ +template +struct TransformImpl +{ + template + static void Transform(TOut *out, const TIn *const in, const int stride, + const int y0, const int ymax, const int x0, const int xmax) + { + const int n_whole_y_blocks = (ymax - y0) / IntBy; + const int y_remainders = (ymax - y0) % IntBy; + const int n_y_blocks = n_whole_y_blocks + (y_remainders ? 1 : 0); + + const int n_whole_x_blocks = (xmax - x0) / BlockBy; + const int x_remainders = (xmax - x0) % BlockBy; + const int n_x_blocks = n_whole_x_blocks + (x_remainders ? 1 : 0); + + // "Y" loop: advance down the rows of the source IntBy rows at a time. + // Set up fill_rows to show the number rows to copy from, and blank_rows + // for the number of blank rows to add. + for(int y_block = 0; y_block < n_y_blocks; y_block++) + { + int fill_rows = (y_block < n_whole_y_blocks) ? IntBy : y_remainders; + int blank_rows = IntBy - fill_rows; + + int y_base = y0 + (y_block * IntBy); + + // So now advance along this block of rows, BlockBy columns at a time. + for(int x_block = 0; x_block < n_x_blocks; x_block++) + { + int fill_cols = (x_block < n_whole_x_blocks) ? BlockBy : x_remainders; + int blank_cols = BlockBy - fill_cols; + + int x_base = x0 + (x_block * BlockBy); + + for(int row = 0; row < fill_rows; row++) + { + for(int col = 0; col < fill_cols; col++) + { + // In-range copy. If it's transposed, we reverse the sense of rows and columns here. + if(Transposed) + { + *out++ = static_cast(in[(x_base + col) * stride + y_base + row]); + } + else + { + *out++ = static_cast(in[(y_base + row) * stride + x_base + col]); + } + } + // "col" tail - row is in range but column is out of range. + for(int col = 0; col < blank_cols; col++) + { + *out++ = static_cast(0); + } + } + // "row" tail - row is out of range so fill with zeros always. + for(int row = 0; row < blank_rows; row++) + { + for(int col = 0; col < (fill_cols + blank_cols); col++) + { + *out++ = static_cast(0); + } + } + } + } + } + + template + static inline void Transform(T *out, const T *const in, const int stride, + const int k0, const int kmax, const int x0, const int xmax) + { + Transform(out, in, stride, k0, kmax, x0, xmax); + } +}; + +/*****************************************************************************/ +template +void Transform( + TOut *out, const TIn *const in, const int stride, + const int k0, const int kmax, const int x0, const int xmax) +{ + // Redirect to a specialised implementation predicated on argument size. 
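    // The redirect below keys the specialisation on the interleave width,
    // the block depth, the transpose flag and the element sizes of TOut and
    // TIn, so each kernel under transforms/ plugs in as a template
    // specialisation. As an illustration (exact parameter spellings are
    // assumptions), the 6-way 32-bit interleave used for A panels would be
    // reached by a call such as:
    //   Transform<6, 1, false>(outbuf, a_matrix, lda, y0, ymax, k0, kmax);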
+ TransformImpl::Transform( + out, in, stride, k0, kmax, x0, xmax); +} +/*****************************************************************************/ + +#include "transforms/list.hpp" diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp new file mode 100644 index 0000000000..f09e5a0e78 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __arm__ + +#include + +#include "../asmlib.hpp" + +template <> +template +inline void TransformImpl<6, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) +{ + uint32_t *outptr = reinterpret_cast(out); + const uint32_t *inptr = reinterpret_cast(in); + + uint32_t zerobuff[8]; + + for(int y = y0; y < ymax; y += 6) + { + const uint32_t *inptr0 = inptr + y * ldin + k0; + const uint32_t *inptr1 = inptr0 + ldin; + const uint32_t *inptr2 = inptr1 + ldin; + const uint32_t *inptr3 = inptr2 + ldin; + const uint32_t *inptr4 = inptr3 + ldin; + const uint32_t *inptr5 = inptr4 + ldin; + + //prefetch_2x(inptr0); + //prefetch_2x(inptr1); + //prefetch_2x(inptr2); + //prefetch_2x(inptr3); + //prefetch_2x(inptr4); + //prefetch_2x(inptr5); + + int x = (kmax - k0); + for(; x > 7; x -= 8) + { + /* Cope with ragged cases by copying from a buffer of zeroes instead */ + if((y + 5) >= ymax) + { + switch((y + 5) - ymax) + { + /* Everything falls through in here */ + case 4: + inptr1 = zerobuff; + case 3: + inptr2 = zerobuff; + case 2: + inptr3 = zerobuff; + case 1: + inptr4 = zerobuff; + case 0: + inptr5 = zerobuff; + default: + break; + } + } + + __asm __volatile( + // Load up 8 elements (2 vectors) from each of 8 sources. 
+ "VLD1.32 {d0-d3}, [%[inptr0]]!\n" // q0=A0A1A2A3 + "VLD1.32 {d4-d7}, [%[inptr1]]!\n" // q2=B0B1B2B3 + "VLD1.32 {d8-d11}, [%[inptr2]]!\n" // q4=C0C1C2C3 + "VZIP.32 q0, q4\n" // q0=A0C0A1C1, q4 = A2C2A3C3 + "VLD1.32 {d12-d15}, [%[inptr3]]!\n" // q6=D0D1D2D3 + "VZIP.32 q2, q6\n" // q2=B0D0B1D1, q6 = B2D2B3D3 + "VLD1.32 {d16-d19}, [%[inptr4]]!\n" + "VLD1.32 {d20-d23}, [%[inptr5]]!\n" + "VZIP.32 q8, q10\n" // q8=E0F0E1F1, q10 = E2F2E3F3 + ASM_PREFETCH("[%[inptr0], #128]") + "VZIP.32 q0, q2\n" // q0 = A0B0C0D0, q2 = A1B1C1D1 + + // Store first elements + "VST1.32 {d0-d1}, [%[outptr]]!\n" + "VST1.32 {d16}, [%[outptr]]!\n" + + "VZIP.32 q4, q6\n" // q4 = A2B2C2D2, q6 = A3B3C3D3 + + // Store second elements + "VST1.32 {d4-d5}, [%[outptr]]!\n" + "VZIP.32 q1, q5\n" ASM_PREFETCH("[%[inptr1], #128]") + "VST1.32 {d17}, [%[outptr]]!\n" + "VZIP.32 q3, q7\n" + + // Store third elements + "VZIP.32 q9, q11\n" + "VST1.32 {d8-d9}, [%[outptr]]!\n" + "VZIP.32 q1, q3\n" ASM_PREFETCH("[%[inptr2], #128]") + "VST1.32 {d20}, [%[outptr]]!\n" + + // Store fourth elements + "VZIP.32 q5, q7\n" + "VST1.32 {d12-d13}, [%[outptr]]!\n" ASM_PREFETCH("[%[inptr3], #128]") + "VST1.32 {d21}, [%[outptr]]!\n" + + // Fifth + "VST1.32 {d2-d3}, [%[outptr]]!\n" ASM_PREFETCH("[%[inptr4], #128]") + "VST1.32 {d18}, [%[outptr]]!\n" + + // Sixth + "VST1.32 {d6-d7}, [%[outptr]]!\n" ASM_PREFETCH("[%[inptr5], #128]") + "VST1.32 {d19}, [%[outptr]]!\n" + + // Seventh + "VST1.32 {d10-d11}, [%[outptr]]!\n" + "VST1.32 {d22}, [%[outptr]]!\n" + + // Eighth + "VST1.32 {d14-d15}, [%[outptr]]!\n" + "VST1.32 {d23}, [%[outptr]]!\n" + + : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3), + [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [outptr] "+r"(outptr) + : + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12"); + } + + for(; x > 0; x--) + { + *outptr++ = *inptr0++; + *outptr++ = *inptr1++; + *outptr++ = *inptr2++; + *outptr++ = *inptr3++; + *outptr++ = *inptr4++; + *outptr++ = *inptr5++; + } + } +} + +#endif // __arm__ diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp new file mode 100644 index 0000000000..ea32c9665c --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __arm__ + +#include "transpose_interleave_common.hpp" + +// Generic unblocked transposed 8x32-bit sized specialisation +template <> +template +inline void TransformImpl<8, 1, true, 4, 4>::Transform( + T *out, const T *const in, const int stride, + const int x0, const int xmax, const int k0, const int kmax) +{ + // Redirect to a 16x uint16_t specialisation + TransformImpl<16, 1, true, 2, 2>::Transform( + reinterpret_cast(out), + reinterpret_cast(in), + stride * 2, x0 * 2, xmax * 2, k0, kmax); +} + +// Generic 12x16-bit sized specialisation +template <> +template +inline void TransformImpl<16, 1, true, 2, 2>::Transform( + T *out, const T *const in, const int stride, + const int x0, const int xmax, const int k0, const int kmax) +{ + // Redirect to a uint16_t specialisation + Transform( + reinterpret_cast(out), + reinterpret_cast(in), + stride, x0, xmax, k0, kmax); +} + +// Specialised 16 x uint16_t version +template <> +inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) +{ + __asm volatile( + "VLD1.32 {d0-d3}, [%[in0]]!\n" + "VST1.32 {d0-d3}, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]") + : [in0] "+r"(in0), + [out] "+r"(out) + : + : "q0", "q1", "memory"); +} + +template <> +inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out) +{ + __asm volatile( + "VLD1.32 {d0-d3}, [%[in0]]!\n" + "VST1.32 {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in0], #192]") + "VLD1.32 {d0-d3}, [%[in1]]!\n" + "VST1.32 {d0-d3}, [%[out]]\n" ASM_PREFETCH("[%[in1], #192]") "SUB %[out], %[out], #32\n" + : [in0] "+r"(in0), + [in1] "+r"(in1), + [out] "+r"(out) + : + : "q0", "q1", "memory"); +} + +template <> +inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) +{ + __asm __volatile( + "VLD1.32 {d0-d3}, [%[in0]]!\n" + "VST1.32 {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in0], #192]") + "VLD1.32 {d0-d3}, [%[in1]]!\n" + "VST1.32 {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in1], #192]") + "VLD1.32 {d0-d3}, [%[in2]]!\n" + "VST1.32 {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in2], #192]") + "VLD1.32 {d0-d3}, [%[in3]]!\n" + "VST1.32 {d0-d3}, [%[out]]\n" ASM_PREFETCH("[%[in3], #192]") "SUB %[out], %[out], #96\n" + : [in0] "+r"(in0), + [in1] "+r"(in1), + [in2] "+r"(in2), + [in3] "+r"(in3), + [out] "+r"(out) + : + : "q0", "q1", "memory"); +} + +template <> +template <> +inline void TransformImpl<16, 1, true, 2, 2>::Transform( + uint16_t *out, const uint16_t *const in, const int stride, + const int x0, const int xmax, const int k0, const int kmax) +{ + TransposeInterleaveCommon<16, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax); +} + +#endif // __arm__ diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp new file mode 100644 index 0000000000..8d61f15cec --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. 
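The 8-way 32-bit transpose above has no kernel of its own: it reinterprets the data as 16-bit values and doubles the stride and the X range so the 16-way 16-bit routine can be reused unchanged. With the casts spelled out, the redirect is roughly the following (a sketch; the wrapper name is illustrative and the exact spelling in the patch may differ):

#include <cstdint>

template <typename T>
static void transpose_8x32_via_16x16(T *out, const T *in, int stride,
                                     int x0, int xmax, int k0, int kmax)
{
    // Each 32-bit element is treated as a pair of 16-bit elements.
    TransformImpl<16, 1, true, 2, 2>::Transform(
        reinterpret_cast<uint16_t *>(out),
        reinterpret_cast<const uint16_t *>(in),
        stride * 2, x0 * 2, xmax * 2, k0, kmax);
}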
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +#include + +#include "../asmlib.hpp" +#include "../utils.hpp" + +template <> +template +void TransformImpl<4, 16, false, 1, 1>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) +{ + uint8_t *outptr = (uint8_t *)out; + const uint8_t *inptr = (uint8_t *)in; + + uint8_t zerobuff[16]; + + for(int y = y0; y < ymax; y += 4) + { + const uint8_t *inptr0 = inptr + y * ldin + k0; + const uint8_t *inptr1 = inptr0 + ldin; + const uint8_t *inptr2 = inptr1 + ldin; + const uint8_t *inptr3 = inptr2 + ldin; + + prefetch_2x(inptr0); + prefetch_2x(inptr1); + prefetch_2x(inptr2); + prefetch_2x(inptr3); + + int x = (kmax - k0); + for(; x > 15; x -= 16) + { + /* Cope with ragged cases by copying from a buffer of zeroes instead */ + if((y + 3) >= ymax) + { + switch((y + 3) - ymax) + { + /* Everything falls through in here */ + case 2: + inptr1 = zerobuff; + case 1: + inptr2 = zerobuff; + case 0: + inptr3 = zerobuff; + break; + + default: + UNREACHABLE("Impossible."); + } + } + + __asm __volatile( + "LDR q0, [%[inptr0]], #16\n" ASM_PREFETCH("[%[inptr0], #176]") "LDR q1, [%[inptr1]], #16\n" ASM_PREFETCH("[%[inptr1], #176]") + "STP q0, q1, [%[outptr]], #32\n" + "LDR q0, [%[inptr2]], #16\n" ASM_PREFETCH("[%[inptr2], #176]") "LDR q1, [%[inptr3]], #16\n" ASM_PREFETCH("[%[inptr3], #176]") "STP q0, q1, [%[outptr]], #32\n" + : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3), + [outptr] "+r"(outptr) + : + : "v0", "v1"); + } + + if(x > 0) + { + /* Need to duplicate this here, in case we didn't run the main loop. 
*/ + if((y + 3) >= ymax) + { + switch((y + 3) - ymax) + { + /* Everything falls through in here */ + case 2: + inptr1 = zerobuff; + case 1: + inptr2 = zerobuff; + case 0: + inptr3 = zerobuff; + break; + + default: + UNREACHABLE("Impossible."); + } + } + + /* We have to write out 16 values, copy as many legal values as there are and pad with 0 */ + auto f = [&outptr, x](const uint8_t *&p) + { + for(int i = 0; i < 16; i++) + { + if(i < x) + { + *outptr++ = *p++; + } + else + { + *outptr++ = 0; + } + } + }; + + f(inptr0); + f(inptr1); + f(inptr2); + f(inptr3); + } + } +} + +#endif // __aarch64__ \ No newline at end of file diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp new file mode 100644 index 0000000000..3cbc8815e3 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
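// The 4-way block interleave above emits 16 bytes of row 0, then 16 bytes of
// rows 1, 2 and 3 in turn, repeated along K; the lambda handles the final
// ragged block by copying the x valid bytes and zero-filling the rest. A
// scalar sketch of one output group (interleave4x16_reference is an
// illustrative name, not part of the library):
#include <cstdint>

inline void interleave4x16_reference(uint8_t *out, const uint8_t *rows[4], int valid /* 1..16 */)
{
    for(int r = 0; r < 4; r++)
    {
        for(int i = 0; i < 16; i++)
        {
            *out++ = (i < valid) ? rows[r][i] : 0; // pad a ragged K tail with zeroes
        }
        rows[r] += valid; // advance each source only by the bytes actually consumed
    }
}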
+ */ +#pragma once + +#ifdef __aarch64__ + +#include + +#include "../asmlib.hpp" + +template <> +template +void TransformImpl<8, 1, false, 2, 2>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) +{ + uint16_t *outptr = (uint16_t *)out; + const uint16_t *inptr = (const uint16_t *)in; + + uint16_t zerobuff[24]; + + for(int y = y0; y < ymax; y += 8) + { + const uint16_t *inptr0 = inptr + y * ldin + k0; + const uint16_t *inptr1 = inptr0 + ldin; + const uint16_t *inptr2 = inptr1 + ldin; + const uint16_t *inptr3 = inptr2 + ldin; + const uint16_t *inptr4 = inptr3 + ldin; + const uint16_t *inptr5 = inptr4 + ldin; + const uint16_t *inptr6 = inptr5 + ldin; + const uint16_t *inptr7 = inptr6 + ldin; + + prefetch_2x(inptr0); + prefetch_2x(inptr1); + prefetch_2x(inptr2); + prefetch_2x(inptr3); + prefetch_2x(inptr4); + prefetch_2x(inptr5); + prefetch_2x(inptr6); + prefetch_2x(inptr7); + + int x = (kmax - k0); + for(; x > 7; x -= 8) + { + /* Cope with ragged cases by copying from a buffer of zeroes instead */ + if((y + 7) >= ymax) + { + switch((y + 7) - ymax) + { + /* Everything falls through in here */ + case 6: + inptr1 = zerobuff; + case 5: + inptr2 = zerobuff; + case 4: + inptr3 = zerobuff; + case 3: + inptr4 = zerobuff; + case 2: + inptr5 = zerobuff; + case 1: + inptr6 = zerobuff; + case 0: + inptr7 = zerobuff; + break; + + default: + UNREACHABLE("Impossible."); + } + } + + int skippf = (x & 31); + __asm __volatile( + // Load up 8 elements (1 vector) from each of 8 sources. + "CBNZ %w[skippf], 1f\n" ASM_PREFETCH("[%[inptr0], #128]") + ASM_PREFETCH("[%[inptr1], #128]") + ASM_PREFETCH("[%[inptr2], #128]") + ASM_PREFETCH("[%[inptr3], #128]") + "1:\n" + + "LDR q0, [%[inptr0]], #16\n" // q0=A0A1A2A3A4A5A6A7 + "LDR q4, [%[inptr4]], #16\n" // q8=E0E1E2E3E4E5E6E7 + "LDR q2, [%[inptr2]], #16\n" // q4=C0C1C2C3... + "LDR q6, [%[inptr6]], #16\n" + "ZIP1 v8.8h, v0.8h, v4.8h\n" // q8=A0E0A1E1A2E2A3E3 + "ZIP2 v16.8h, v0.8h, v4.8h\n" // q16=A4E4A5E5A6E6A7E7 + "ZIP1 v9.8h, v2.8h, v6.8h\n" // q9=C0G0C1G1C2G2C3G3 + "ZIP2 v17.8h, v2.8h, v6.8h\n" // q17=C4G4C5G5C6G6C7G7 + "LDR q1, [%[inptr1]], #16\n" // q1=B0B1B2B3B4B5B6B7 + "LDR q5, [%[inptr5]], #16\n" + "LDR q3, [%[inptr3]], #16\n" // q3=D0D1D2D3.... 
+ "LDR q7, [%[inptr7]], #16\n" + "ZIP1 v10.8h, v1.8h, v5.8h\n" // q18=B0F0B1F1B2F2B3F3 + "ZIP2 v18.8h, v1.8h, v5.8h\n" // q18=B4F4B5F5B6F6B7F7 + "ZIP1 v11.8h, v3.8h, v7.8h\n" // q19=D0H0D1H1D2H2D3H3 + "ZIP2 v19.8h, v3.8h, v7.8h\n" // q19=D4H4D5H5D6H6D7H7 + + "ZIP1 v12.8h, v8.8h, v9.8h\n" // q20=A0C0E0G0A1C1E1G1 + "ZIP2 v20.8h, v8.8h, v9.8h\n" + "ZIP1 v13.8h, v10.8h, v11.8h\n" // q21=B0D0F0H0B1I1F1H1 + "ZIP2 v21.8h, v10.8h, v11.8h\n" + + "CBNZ %w[skippf], 2f\n" ASM_PREFETCH("[%[inptr4], #112]") + ASM_PREFETCH("[%[inptr5], #112]") + ASM_PREFETCH("[%[inptr6], #112]") + ASM_PREFETCH("[%[inptr7], #112]") + "2:\n" + + "ZIP1 v22.8h, v16.8h, v17.8h\n" + "ZIP2 v30.8h, v16.8h, v17.8h\n" + "ZIP1 v23.8h, v18.8h, v19.8h\n" + "ZIP2 v31.8h, v18.8h, v19.8h\n" + + "ZIP1 v14.8h, v12.8h, v13.8h\n" // q22=A0B0C0D0E0F0G0H0 + "ZIP2 v15.8h, v12.8h, v13.8h\n" // q23=A1B1C1D1E1F1G1H1 + "STP q14, q15, [%[outptr]], #32\n" // Write back first two elements + + "ZIP1 v0.8h, v20.8h, v21.8h\n" + "ZIP2 v1.8h, v20.8h, v21.8h\n" + "STP q0, q1, [%[outptr]], #32\n" // Write back next two elements + + "ZIP1 v2.8h, v22.8h, v23.8h\n" + "ZIP2 v3.8h, v22.8h, v23.8h\n" + "STP q2, q3, [%[outptr]], #32\n" // Write back next two elements + + "ZIP1 v4.8h, v30.8h, v31.8h\n" + "ZIP2 v5.8h, v30.8h, v31.8h\n" + "STP q4, q5, [%[outptr]], #32\n" // Write back last two elements + : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3), + [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr) + : [skippf] "r"(skippf) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + } + + for(; x > 0; x--) + { + *outptr++ = *inptr0++; + *outptr++ = *inptr1++; + *outptr++ = *inptr2++; + *outptr++ = *inptr3++; + *outptr++ = *inptr4++; + *outptr++ = *inptr5++; + *outptr++ = *inptr6++; + *outptr++ = *inptr7++; + } + } +} + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp new file mode 100644 index 0000000000..47e4fa2608 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +#include + +#include "../asmlib.hpp" + +template <> +template +inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) +{ + uint32_t *outptr = (uint32_t *)out; + const uint32_t *inptr = (uint32_t *)in; + + uint32_t zerobuff[8]; + + for(int y = y0; y < ymax; y += 8) + { + const uint32_t *inptr0 = inptr + y * ldin + k0; + const uint32_t *inptr1 = inptr0 + ldin; + const uint32_t *inptr2 = inptr1 + ldin; + const uint32_t *inptr3 = inptr2 + ldin; + const uint32_t *inptr4 = inptr3 + ldin; + const uint32_t *inptr5 = inptr4 + ldin; + const uint32_t *inptr6 = inptr5 + ldin; + const uint32_t *inptr7 = inptr6 + ldin; + + prefetch_2x(inptr0); + prefetch_2x(inptr1); + prefetch_2x(inptr2); + prefetch_2x(inptr3); + prefetch_2x(inptr4); + prefetch_2x(inptr5); + prefetch_2x(inptr6); + prefetch_2x(inptr7); + + int x = (kmax - k0); + for(; x > 7; x -= 8) + { + /* Cope with ragged cases by copying from a buffer of zeroes instead */ + if((y + 7) >= ymax) + { + switch((y + 7) - ymax) + { + /* Everything falls through in here */ + case 6: + inptr1 = zerobuff; + case 5: + inptr2 = zerobuff; + case 4: + inptr3 = zerobuff; + case 3: + inptr4 = zerobuff; + case 2: + inptr5 = zerobuff; + case 1: + inptr6 = zerobuff; + case 0: + inptr7 = zerobuff; + break; + + default: + UNREACHABLE("Impossible."); + } + } + + __asm __volatile( + // Load up 8 elements (2 vectors) from each of 8 sources. + "LDP q0, q1, [%[inptr0]], #32\n" // q0=A0A1A2A3 + "LDP q2, q3, [%[inptr1]], #32\n" // q2=B0B1B2B3 + "LDP q4, q5, [%[inptr2]], #32\n" // q4=C0C1C2C3 + "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1 + ASM_PREFETCH("[%[inptr0], #128]") + "LDP q6, q7, [%[inptr3]], #32\n" // q6=D0D1D2D3 + "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1 + "LDP q8, q9, [%[inptr4]], #32\n" + "LDP q10, q11, [%[inptr5]], #32\n" + "LDP q12, q13, [%[inptr6]], #32\n" + "ZIP1 v18.4s, v8.4s, v12.4s\n" ASM_PREFETCH("[%[inptr1], #128]") + "LDP q14, q15, [%[inptr7]], #32\n" + "ZIP1 v19.4s, v10.4s, v14.4s\n" + + "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0 + ASM_PREFETCH("[%[inptr2], #128]") + "ZIP1 v21.4s, v18.4s, v19.4s\n" + "ZIP2 v22.4s, v16.4s, v17.4s\n" + "ZIP2 v23.4s, v18.4s, v19.4s\n" + + "ZIP2 v16.4s, v0.4s, v4.4s\n" ASM_PREFETCH("[%[inptr3], #128]") + "ZIP2 v17.4s, v2.4s, v6.4s\n" + "STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source + + "ZIP2 v18.4s, v8.4s, v12.4s\n" + "ZIP2 v19.4s, v10.4s, v14.4s\n" + "STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source + + "ZIP1 v20.4s, v16.4s, v17.4s\n" ASM_PREFETCH("[%[inptr4], #128]") + "ZIP1 v21.4s, v18.4s, v19.4s\n" + "ZIP2 v22.4s, v16.4s, v17.4s\n" + "ZIP2 v23.4s, v18.4s, v19.4s\n" + + "ZIP1 v16.4s, v1.4s, v5.4s\n" ASM_PREFETCH("[%[inptr5], #128]") + "ZIP1 v17.4s, v3.4s, v7.4s\n" + "STP q20, q21, [%[outptr]], #32\n" // Third element + + "ZIP1 v18.4s, v9.4s, v13.4s\n" + "ZIP1 v19.4s, v11.4s, v15.4s\n" + "STP q22, q23, [%[outptr]], #32\n" // Fourth element + + "ZIP1 v20.4s, v16.4s, v17.4s\n" + "ZIP1 v21.4s, v18.4s, v19.4s\n" + "ZIP2 v22.4s, v16.4s, v17.4s\n" ASM_PREFETCH("[%[inptr6], #128]") + "ZIP2 v23.4s, v18.4s, v19.4s\n" + + "ZIP2 v16.4s, v1.4s, v5.4s\n" + "ZIP2 v17.4s, v3.4s, 
v7.4s\n" + "STP q20, q21, [%[outptr]], #32\n" // Fifth element + + "ZIP2 v18.4s, v9.4s, v13.4s\n" ASM_PREFETCH("[%[inptr7], #128]") + "ZIP2 v19.4s, v11.4s, v15.4s\n" + "STP q22, q23, [%[outptr]], #32\n" // Sixth element + + "ZIP1 v20.4s, v16.4s, v17.4s\n" + "ZIP1 v21.4s, v18.4s, v19.4s\n" + "STP q20, q21, [%[outptr]], #32\n" // Seventh element + + "ZIP2 v22.4s, v16.4s, v17.4s\n" + "ZIP2 v23.4s, v18.4s, v19.4s\n" + "STP q22, q23, [%[outptr]], #32\n" // Eighth element + : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3), + [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); + } + + for(; x > 0; x--) + { + *outptr++ = *inptr0++; + *outptr++ = *inptr1++; + *outptr++ = *inptr2++; + *outptr++ = *inptr3++; + *outptr++ = *inptr4++; + *outptr++ = *inptr5++; + *outptr++ = *inptr6++; + *outptr++ = *inptr7++; + } + } +} + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp new file mode 100644 index 0000000000..85ffdc2d4f --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#pragma once + +#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) + +#include + +#include "../asmlib.hpp" + +template <> +template <> +inline void TransformImpl<8, 1, false, 4, 2>::Transform(float *out, const __fp16 *in, int ldin, int y0, int ymax, int k0, int kmax) +{ + float *outptr = out; + const __fp16 *inptr = in; + + __fp16 zerobuff[8]; + + for(int y = y0; y < ymax; y += 8) + { + const __fp16 *inptr0 = inptr + y * ldin + k0; + const __fp16 *inptr1 = inptr0 + ldin; + const __fp16 *inptr2 = inptr1 + ldin; + const __fp16 *inptr3 = inptr2 + ldin; + const __fp16 *inptr4 = inptr3 + ldin; + const __fp16 *inptr5 = inptr4 + ldin; + const __fp16 *inptr6 = inptr5 + ldin; + const __fp16 *inptr7 = inptr6 + ldin; + + prefetch_2x(inptr0); + prefetch_2x(inptr1); + prefetch_2x(inptr2); + prefetch_2x(inptr3); + prefetch_2x(inptr4); + prefetch_2x(inptr5); + prefetch_2x(inptr6); + prefetch_2x(inptr7); + + int x = (kmax - k0); + for(; x > 7; x -= 8) + { + /* Cope with ragged cases by copying from a buffer of zeroes instead */ + if((y + 7) >= ymax) + { + switch((y + 7) - ymax) + { + /* Everything falls through in here */ + case 6: + inptr1 = zerobuff; + case 5: + inptr2 = zerobuff; + case 4: + inptr3 = zerobuff; + case 3: + inptr4 = zerobuff; + case 2: + inptr5 = zerobuff; + case 1: + inptr6 = zerobuff; + case 0: + inptr7 = zerobuff; + break; + + default: + UNREACHABLE("Impossible."); + } + } + + __asm __volatile( + // Load up 8 elements (2 vectors) from each of 8 sources. + "LDR q0, [%[inptr0]], #16\n" + "LDR q2, [%[inptr1]], #16\n" + "FCVTL2 v1.4s, v0.8h\n" + "FCVTL v0.4s, v0.4h\n" + "LDR q4, [%[inptr2]], #16\n" // q4=C0C1C2C3 + "FCVTL2 v3.4s, v2.8h\n" + "FCVTL v2.4s, v2.4h\n" + "FCVTL2 v5.4s, v4.8h\n" + "FCVTL v4.4s, v4.4h\n" + "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1 + ASM_PREFETCH("[%[inptr0], #128]") + "LDR q6, [%[inptr3]], #16\n" // q6=D0D1D2D3 + "FCVTL2 v7.4s, v6.8h\n" + "FCVTL v6.4s, v6.4h\n" + "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1 + "LDR q8, [%[inptr4]], #16\n" + "LDR q10, [%[inptr5]], #16\n" + "FCVTL2 v9.4s, v8.8h\n" + "FCVTL v8.4s, v8.4h\n" ASM_PREFETCH("[%[inptr1], #128]") + "LDR q12, [%[inptr6]], #16\n" + "FCVTL2 v11.4s, v10.8h\n" + "FCVTL v10.4s, v10.4h\n" + "FCVTL2 v13.4s, v12.8h\n" + "FCVTL v12.4s, v12.4h\n" + "ZIP1 v18.4s, v8.4s, v12.4s\n" + "LDR q14, [%[inptr7]], #16\n" + "FCVTL2 v15.4s, v14.8h\n" + "FCVTL v14.4s, v14.4h\n" + "ZIP1 v19.4s, v10.4s, v14.4s\n" + + ASM_PREFETCH("[%[inptr2], #128]") + "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0 + "ZIP1 v21.4s, v18.4s, v19.4s\n" + "ZIP2 v22.4s, v16.4s, v17.4s\n" + "ZIP2 v23.4s, v18.4s, v19.4s\n" ASM_PREFETCH("[%[inptr3], #128]") + + "ZIP2 v16.4s, v0.4s, v4.4s\n" + "ZIP2 v17.4s, v2.4s, v6.4s\n" + "STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source + + "ZIP2 v18.4s, v8.4s, v12.4s\n" ASM_PREFETCH("[%[inptr4], #128]") + "ZIP2 v19.4s, v10.4s, v14.4s\n" + "STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source + + "ZIP1 v20.4s, v16.4s, v17.4s\n" + "ZIP1 v21.4s, v18.4s, v19.4s\n" ASM_PREFETCH("[%[inptr5], #128]") + "ZIP2 v22.4s, v16.4s, v17.4s\n" + "ZIP2 v23.4s, v18.4s, v19.4s\n" + + "ZIP1 v16.4s, v1.4s, v5.4s\n" + "ZIP1 v17.4s, v3.4s, v7.4s\n" ASM_PREFETCH("[%[inptr6], #128]") + "STP q20, q21, [%[outptr]], #32\n" // Third element + + "ZIP1 v18.4s, v9.4s, v13.4s\n" + "ZIP1 v19.4s, v11.4s, v15.4s\n" + "STP q22, q23, [%[outptr]], #32\n" // Fourth element + ASM_PREFETCH("[%[inptr7], #128]") + + "ZIP1 v20.4s, v16.4s, v17.4s\n" + "ZIP1 v21.4s, v18.4s, 
v19.4s\n" + "ZIP2 v22.4s, v16.4s, v17.4s\n" + "ZIP2 v23.4s, v18.4s, v19.4s\n" + + "ZIP2 v16.4s, v1.4s, v5.4s\n" + "ZIP2 v17.4s, v3.4s, v7.4s\n" + "STP q20, q21, [%[outptr]], #32\n" // Fifth element + + "ZIP2 v18.4s, v9.4s, v13.4s\n" + "ZIP2 v19.4s, v11.4s, v15.4s\n" + "STP q22, q23, [%[outptr]], #32\n" // Sixth element + + "ZIP1 v20.4s, v16.4s, v17.4s\n" + "ZIP1 v21.4s, v18.4s, v19.4s\n" + "STP q20, q21, [%[outptr]], #32\n" // Seventh element + + "ZIP2 v22.4s, v16.4s, v17.4s\n" + "ZIP2 v23.4s, v18.4s, v19.4s\n" + "STP q22, q23, [%[outptr]], #32\n" // Eighth element + : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3), + [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); + } + + for(; x > 0; x--) + { + *outptr++ = *inptr0++; + *outptr++ = *inptr1++; + *outptr++ = *inptr2++; + *outptr++ = *inptr3++; + *outptr++ = *inptr4++; + *outptr++ = *inptr5++; + *outptr++ = *inptr6++; + *outptr++ = *inptr7++; + } + } +} + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp new file mode 100644 index 0000000000..fd6a253c6a --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#pragma once + +#ifdef __aarch64__ + +#include "transpose_interleave_common.hpp" + +// Generic unblocked transposed 6x32-bit sized specialisation +template <> +template +inline void TransformImpl<6, 1, true, 4, 4>::Transform( + T *out, const T *const in, const int stride, + const int x0, const int xmax, const int k0, const int kmax) +{ + // Redirect to a 12 x uint16_t specialisation + TransformImpl<12, 1, true, 2, 2>::Transform( + reinterpret_cast(out), + reinterpret_cast(in), + stride * 2, x0 * 2, xmax * 2, k0, kmax); +} + +// Generic 12x16-bit sized specialisation +template <> +template +inline void TransformImpl<12, 1, true, 2, 2>::Transform( + T *out, const T *const in, const int stride, + const int x0, const int xmax, const int k0, const int kmax) +{ + // Redirect to a uint16_t specialisation + Transform( + reinterpret_cast(out), + reinterpret_cast(in), + stride, x0, xmax, k0, kmax); +} + +// Specialised 12 x uint16_t version +template <> +inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) +{ + __asm volatile( + "LDR q0, [%[in0]]\n" + "STR q0, [%[out]]\n" + "LDR d1, [%[in0], #0x10]\n" + "STR d1, [%[out], #0x10]\n" + "ADD %x[in0], %x[in0], #0x18\n" ASM_PREFETCH("[%[in0], #192]") + : [in0] "+r"(in0), + [out] "+r"(out) + : + : "v0", "v1", "memory"); +} + +template <> +inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out) +{ + __asm volatile( + "LDR q0, [%[in0]]\n" + "LDR d1, [%[in0], #0x10]\n" + "ADD %x[in0], %x[in0], #0x18\n" ASM_PREFETCH("[%[in0], #192]") + + "LDR x21, [%[in1]]\n" + "LDR q2, [%[in1], #0x08]\n" + "INS v1.d[1], x21\n" + "ADD %x[in1], %x[in1], #0x18\n" + "STP q0, q1, [%[out]]\n" + "STR q2, [%x[out], #0x20]\n" ASM_PREFETCH("[%[in1], #192]") + : [in0] "+r"(in0), + [in1] "+r"(in1), + [out] "+r"(out) + : + : "x21", "v0", "v1", "v2", "memory"); +} + +template <> +inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) +{ + __asm __volatile( + "LDR q0, [%x[in0]], #0x10\n" + "STR q0, [%x[out]]\n" + "LDR d1, [%x[in0]], #0x08\n" ASM_PREFETCH("[%[in0], #192]") + "STR d1, [%x[out], #0x10]\n" + + "LDR q0, [%x[in1]], #0x10\n" + "STR q0, [%x[out], #0x18]\n" + "LDR d1, [%x[in1]], #0x08\n" ASM_PREFETCH("[%[in1], #192]") + "STR d1, [%x[out], #0x28]\n" + + "LDR q0, [%x[in2]], #0x10\n" + "STR q0, [%x[out], #0x30]\n" + "LDR d1, [%x[in2]], #0x08\n" ASM_PREFETCH("[%[in2], #192]") + "STR d1, [%x[out], #0x40]\n" + + "LDR q0, [%x[in3]], #0x10\n" + "STR q0, [%x[out], #0x48]\n" + "LDR d1, [%x[in3]], #0x08\n" ASM_PREFETCH("[%[in3], #192]") "STR d1, [%x[out], #0x58]\n" + : [in0] "+r"(in0), + [in1] "+r"(in1), + [in2] "+r"(in2), + [in3] "+r"(in3), + [out] "+r"(out) + : + : "v0", "v1", "memory"); +} + +template <> +template <> +inline void TransformImpl<12, 1, true, 2, 2>::Transform( + uint16_t *out, const uint16_t *const in, const int stride, + const int x0, const int xmax, const int k0, const int kmax) +{ + TransposeInterleaveCommon<12, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax); +} + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp new file mode 100644 index 0000000000..ff1cbfb5f5 --- /dev/null +++ 
b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) + +#include "transpose_interleave_common.hpp" + +template <> +inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x1(const __fp16 *&in0, float *out) +{ + __asm __volatile( + "LDR q0, [%[in0]], #16\n" + "FCVTL2 v1.4s, v0.8h\n" + "FCVTL v0.4s, v0.4h\n" + "STP q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]") + "LDR d2, [%[in0]], #8\n" + "FCVTL v2.4s, v2.4h\n" + "STR q2, [%[out], #32]\n" + : [in0] "+r"(in0), [out] "+r"(out) + : + : "v0", "v1", "v2", "memory"); +} + +template <> +inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x2(const __fp16 *&in0, const __fp16 *&in1, float *out) +{ + __asm __volatile( + "LDR q0, [%[in0]], #16\n" + "FCVTL2 v1.4s, v0.8h\n" + "FCVTL v0.4s, v0.4h\n" + "STP q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]") + "LDR d2, [%[in0]], #8\n" + "FCVTL v2.4s, v2.4h\n" + "LDR q3, [%[in1]], #16\n" + "FCVTL2 v4.4s, v3.8h\n" + "FCVTL v3.4s, v3.4h\n" + "STP q2, q3, [%[out], #32]\n" ASM_PREFETCH("[%[in1], #192]") + "LDR d5, [%[in1]], #16\n" + "FCVTL v5.4s, v5.4h\n" + "STP q4, q5, [%[out], #64]\n" + : [in0] "+r"(in0), [in1] "+r"(in1), [out] "+r"(out) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "memory"); +} + +template <> +inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x4(const __fp16 *&in0, const __fp16 *&in1, const __fp16 *&in2, const __fp16 *&in3, float *out) +{ + __asm __volatile( + "LDR q0, [%[in0]], #16\n" + "FCVTL2 v1.4s, v0.8h\n" + "FCVTL v0.4s, v0.4h\n" + "STP q0, q1, [%[out]]\n" + "LDR d2, [%[in0]], #8\n" ASM_PREFETCH("[%[in0], #192]") + "FCVTL v2.4s, v2.4h\n" + "LDR q3, [%[in1]], #16\n" + "FCVTL2 v4.4s, v3.8h\n" + "FCVTL v3.4s, v3.4h\n" + "STP q2, q3, [%[out], #32]\n" + "LDR d5, [%[in1]], #8\n" + "FCVTL v5.4s, v5.4h\n" ASM_PREFETCH("[%[in1], #192]") + "STP q4, q5, [%[out], #64]\n" + "LDR q6, [%[in2]], #16\n" + "FCVTL2 v7.4s, v6.8h\n" + "FCVTL v6.4s, v6.4h\n" + "STP q6, q7, [%[out], #96]\n" + "LDR d8, [%[in2]], #8\n" + "FCVTL v8.4s, v8.4h\n" ASM_PREFETCH("[%[in2], #192]") + "LDR q9, [%[in3]], #16\n" + "FCVTL2 v10.4s, v9.8h\n" + "FCVTL v9.4s, v9.4h\n" + "STP q8, q9, [%[out], #128]\n" + "LDR d11, [%[in3]], #8\n" + "FCVTL v11.4s, v11.4h\n" + "STP q10, q11, 
[%[out], #160]\n" ASM_PREFETCH("[%[in3], #192]") + + : [in0] "+r"(in0), [in1] "+r"(in1), [in2] "+r"(in2), [in3] "+r"(in3), [out] "+r"(out) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory"); +} + +template <> +template <> +inline void TransformImpl<12, 1, true, 4, 2>::Transform( + float *out, const __fp16 *const in, const int stride, + const int x0, const int xmax, const int k0, const int kmax) +{ + TransposeInterleaveCommon<12, __fp16, float>::Transform(out, in, stride, x0, xmax, k0, kmax); +} + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp new file mode 100644 index 0000000000..5434599f03 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#pragma once + +#ifdef __aarch64__ + +#include "transpose_interleave_common.hpp" + +// Generic unblocked transposed 12x32-bit sized specialisation +template <> +template +inline void TransformImpl<12, 1, true, 4, 4>::Transform( + T *out, const T *const in, const int stride, + const int x0, const int xmax, const int k0, const int kmax) +{ + // Redirect to a 24 x uint16_t specialisation + TransformImpl<24, 1, true, 2, 2>::Transform( + reinterpret_cast(out), + reinterpret_cast(in), + stride * 2, x0 * 2, xmax * 2, k0, kmax); +} + +// Generic 24x16-bit sized specialisation +template <> +template +inline void TransformImpl<24, 1, true, 2, 2>::Transform( + T *out, const T *const in, const int stride, + const int x0, const int xmax, const int k0, const int kmax) +{ + // Redirect to a uint16_t specialisation + Transform( + reinterpret_cast(out), + reinterpret_cast(in), + stride, x0, xmax, k0, kmax); +} + +// Specialised 24 x uint16_t version +template <> +inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) +{ + __asm __volatile( + "LDP q0, q1, [%[in0]], #32\n" + "STP q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]") + "LDR q2, [%[in0]], #16\n" + "STR q2, [%[out], #32]\n" + : [in0] "+r"(in0), [out] "+r"(out) + : + : "v0", "v1", "v2", "memory"); +} + +template <> +inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out) +{ + __asm __volatile( + "LDP q0, q1, [%[in0]], #32\n" + "STP q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]") + "LDR q2, [%[in0]], #16\n" + "LDP q3, q4, [%[in1]], #32\n" + "STP q2, q3, [%[out], #32]\n" ASM_PREFETCH("[%[in1], #192]") + "LDR q5, [%[in1]], #16\n" + "STP q4, q5, [%[out], #64]\n" + : [in0] "+r"(in0), [in1] "+r"(in1), [out] "+r"(out) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "memory"); +} + +template <> +inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) +{ + __asm __volatile( + "LDP q0, q1, [%[in0]], #32\n" + "STP q0, q1, [%[out]]\n" + "LDR q2, [%[in0]], #16\n" ASM_PREFETCH("[%[in0], #192]") + "LDP q3, q4, [%[in1]], #32\n" + "STP q2, q3, [%[out], #32]\n" + "LDR q5, [%[in1]], #16\n" ASM_PREFETCH("[%[in1], #192]") + "STP q4, q5, [%[out], #64]\n" + "LDP q6, q7, [%[in2]], #32\n" + "STP q6, q7, [%[out], #96]\n" + "LDR q8, [%[in2]], #16\n" ASM_PREFETCH("[%[in2], #192]") + "LDP q9, q10, [%[in3]], #32\n" + "STP q8, q9, [%[out], #128]\n" + "LDR q11, [%[in3]], #16\n" + "STP q10, q11, [%[out], #160]\n" ASM_PREFETCH("[%[in3], #192]") + + : [in0] "+r"(in0), [in1] "+r"(in1), [in2] "+r"(in2), [in3] "+r"(in3), [out] "+r"(out) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory"); +} + +template <> +template <> +inline void TransformImpl<24, 1, true, 2, 2>::Transform( + uint16_t *out, const uint16_t *const in, const int stride, + const int x0, const int xmax, const int k0, const int kmax) +{ + TransposeInterleaveCommon<24, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax); +} + +#endif // __arch64__ diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp new file mode 100644 index 0000000000..8ad5b857fb --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. 
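// 24 x uint16_t is 48 bytes, exactly three q registers, which is why each
// source above is read with an LDP (32 bytes) followed by an LDR (16 bytes)
// and written back in the same 32/16-byte pieces. A scalar reference for a
// single-source block (moveblock24_reference is an illustrative name):
#include <cstdint>

inline void moveblock24_reference(const uint16_t *&in, uint16_t *out)
{
    for(int i = 0; i < 24; i++)
    {
        out[i] = in[i]; // one packed 24-element column block
    }
    in += 24; // 48 bytes consumed, matching the LDP + LDR pair
}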
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "a32_interleave_6way_32bit.hpp" +#include "a32_transpose_interleave_8way_32bit.hpp" +#include "a64_block16_interleave4_8bit.hpp" +#include "a64_interleave_8way_16bit.hpp" +#include "a64_interleave_8way_32bit.hpp" +#include "a64_interleave_8way_half_to_float.hpp" +#include "a64_transpose_interleave_12way_16bit.hpp" +#include "a64_transpose_interleave_12way_half_to_float.hpp" +#include "a64_transpose_interleave_24way_16bit.hpp" +#include "transpose_interleave_common.hpp" diff --git a/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp b/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp new file mode 100644 index 0000000000..3218ca1aac --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#pragma once
+
+template <unsigned int IntBy, typename TIn, typename TOut>
+struct TransposeInterleaveCommon
+{
+    // Override the moveblock_1xY methods to improve performance
+    static inline void moveblock_1x1(const TIn *&in0, TOut *out)
+    {
+        for(unsigned int i = 0; i < IntBy; i++)
+        {
+            *out++ = static_cast<TOut>(*in0++);
+        }
+    }
+
+    static inline void moveblock_1x2(const TIn *&in0, const TIn *&in1, TOut *out)
+    {
+        for(unsigned int i = 0; i < IntBy; i++)
+        {
+            *out++ = static_cast<TOut>(*in0++);
+        }
+        for(unsigned int i = 0; i < IntBy; i++)
+        {
+            *out++ = static_cast<TOut>(*in1++);
+        }
+    }
+
+    static inline void moveblock_1x4(const TIn *&in0, const TIn *&in1, const TIn *&in2, const TIn *&in3, TOut *out)
+    {
+        for(unsigned int i = 0; i < IntBy; i++)
+        {
+            *out++ = static_cast<TOut>(*in0++);
+        }
+        for(unsigned int i = 0; i < IntBy; i++)
+        {
+            *out++ = static_cast<TOut>(*in1++);
+        }
+        for(unsigned int i = 0; i < IntBy; i++)
+        {
+            *out++ = static_cast<TOut>(*in2++);
+        }
+        for(unsigned int i = 0; i < IntBy; i++)
+        {
+            *out++ = static_cast<TOut>(*in3++);
+        }
+    }
+
+    static inline void Transform(TOut *out, const TIn *in, const int stride, const int x0, const int xmax, const int k0, const int kmax)
+    {
+        const auto ldin = stride;
+
+        TOut      *outarray    = out;
+        const TIn *inarray     = in;
+        TOut      *outptr_base = outarray;
+        const TIn *inptr_base  = inarray + x0 + (k0 * ldin);
+        int        ldout       = (kmax - k0) * IntBy;
+
+        int k = (kmax - k0);
+        for(; k > 3; k -= 4)
+        {
+            TOut      *outptr = outptr_base;
+            const TIn *inptr  = inptr_base;
+            const TIn *inptr1 = inptr + ldin;
+            const TIn *inptr2 = inptr1 + ldin;
+            const TIn *inptr3 = inptr2 + ldin;
+
+            prefetch_3x(inptr);
+            prefetch_3x(inptr1);
+            prefetch_3x(inptr2);
+            prefetch_3x(inptr3);
+
+            outptr_base += IntBy * 4;
+            inptr_base += ldin * 4;
+
+            for(int x = (xmax - x0) / IntBy; x > 0; x--)
+            {
+                moveblock_1x4(inptr, inptr1, inptr2, inptr3, outptr);
+                outptr += ldout;
+            }
+        }
+
+        if(k)
+        {
+            TOut      *outptr = outptr_base;
+            const TIn *inptr  = inptr_base;
+            const TIn *inptr1 = inptr + ldin;
+            const TIn *inptr2 = inptr1 + ldin;
+
+            prefetch_3x(inptr);
+            prefetch_3x(inptr1);
+            prefetch_3x(inptr2);
+
+            for(int x = (xmax - x0) / IntBy; x > 0; x--)
+            {
+                switch(k)
+                {
+                    case 3:
+                        moveblock_1x2(inptr, inptr1, outptr);
+                        moveblock_1x1(inptr2, outptr + IntBy * 2);
+                        break;
+
+                    case 2:
+                        moveblock_1x2(inptr, inptr1, outptr);
+                        break;
+
+                    case 1:
+                        moveblock_1x1(inptr, outptr);
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+
+                outptr += ldout;
+            }
+        }
+
+        // Cope with ragged X cases
+        const unsigned int overflow = (xmax - x0) % IntBy;
+        if(overflow)
+        {
+            const TIn *inptr_base = inarray + (xmax - overflow) + (k0 * ldin);
+            TOut      *outptr     = outarray + ((xmax - x0) / IntBy) * ldout;
+
+            for(int k = (kmax - k0); k > 0; k--)
+            {
+                const TIn *inptr = inptr_base;
+                inptr_base += ldin;
+
+                for(unsigned int x = 0; x < IntBy; x++)
+                {
+                    TOut val = (x < overflow) ? static_cast<TOut>(*inptr++) : static_cast<TOut>(0);
+                    *outptr++ = val;
+                }
+            }
+        }
+    }
+};
diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp
new file mode 100644
index 0000000000..6c5b92ae8f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/utils.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
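// Output geometry of the generic Transform above, for reference: the packed
// buffer is a sequence of column blocks, each IntBy wide and (kmax - k0)
// deep, so one block occupies ldout = (kmax - k0) * IntBy elements. K is
// walked four rows at a time with a 1-3 row tail, and a ragged X remainder
// is zero-padded out to IntBy. A worked example with illustrative numbers:
//
//   IntBy = 12, x0 = 0, xmax = 30, k0 = 0, kmax = 6
//     ldout           = 6 * 12 = 72 elements per column block
//     full blocks     = 30 / 12 = 2 (columns 0..23)
//     ragged overflow = 30 % 12 = 6 columns, padded with zeroes to 12
//     total output    = 3 * 72 = 216 elements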
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#pragma once + +// Macro for unreachable code (e.g. impossible default cases on switch) +#define UNREACHABLE(why) __builtin_unreachable() + +// Paranoid option for the above with assert +// #define UNREACHABLE(why) assert(0 && why) + +inline int iceildiv(const int a, const int b) +{ + return (a + b - 1) / b; +} + +template +inline T roundup(const T a, const T b) +{ + T rem = a % b; + + if(rem) + { + return a + b - rem; + } + else + { + return a; + } +} diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp index 05907bab07..c8cba8a174 100644 --- a/src/runtime/NEON/functions/NEGEMM.cpp +++ b/src/runtime/NEON/functions/NEGEMM.cpp @@ -26,37 +26,20 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h" -#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h" -#include "arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h" -#include "arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/AssemblyHelper.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/TensorAllocator.h" #include "support/ToolchainSupport.h" -namespace arm_compute -{ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wswitch-default" -#pragma GCC diagnostic ignored "-Weffc++" -#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp" -#include "arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp" -#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp" -#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp" -#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp" -#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp" -#pragma GCC diagnostic pop -} // namespace arm_compute - #include namespace arm_compute { NEGEMM::NEGEMM(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _mm_optimised_kernel(nullptr), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(), + : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), 
_mm_kernel(), _asm_glue(), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(), _run_vector_matrix_multiplication(false), _run_addition(false), _is_first_run(true), _reshape_b_only_on_first_run(false) { } @@ -82,42 +65,13 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe // Check if we need to reshape the matrix B only on the first run _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); _run_vector_matrix_multiplication = a->info()->dimension(1) < 2; + const bool run_optimised = setup_assembly_kernel(a, b, c, d, alpha, beta, _workspace, _memory_group, _asm_glue); // Check if the first input tensor is a vector. // If so, all the kernels for reshaping the tensors can be skipped if(_run_vector_matrix_multiplication) { -#if defined(__aarch64__) - if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f)) - { - _mm_optimised_kernel = support::cpp14::make_unique(); - } - - if(_mm_optimised_kernel != nullptr) - { - struct CPUInfo ci = NEScheduler::get().cpu_info(); - - const int N = d->info()->tensor_shape().x(); - const int K = a->info()->tensor_shape().x(); - - size_t workbench_size = 0; - - if(a->info()->data_type() == DataType::F32) - { - workbench_size = GemvTransposed(&ci, N, K).get_working_size(); - } - - constexpr size_t alignment = 4096; - ARM_COMPUTE_ERROR_ON_MSG(workbench_size == 0, "size cannot be 0"); - _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::S8)); - _memory_group.manage(&_workspace); - - // Configure matrix multiplication kernel - _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f, false /* is_transposed_0 */, false /* is_transposed_1 */); - _workspace.allocator()->allocate(); - } - else -#endif /* defined(__aarch64__) */ + if(!run_optimised) { // Configure the matrix multiply kernel _mm_kernel.configure(a, b, d, alpha, false); @@ -132,65 +86,7 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe } else { -#if defined(__arm__) - if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f)) - { - _mm_optimised_kernel = support::cpp14::make_unique(); - } -#elif defined(__aarch64__) - if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f)) - { - _mm_optimised_kernel = support::cpp14::make_unique(); - } - else if(a->info()->data_type() == DataType::F16 && (c == nullptr || beta == 0.f)) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - _mm_optimised_kernel = support::cpp14::make_unique(); -#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16."); -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - } -#endif /* defined(__arm__) || defined(__aarch64__) */ - -#if defined(__arm__) || defined(__aarch64__) - if(_mm_optimised_kernel != nullptr) - { - struct CPUInfo ci = NEScheduler::get().cpu_info(); - - const int M = d->info()->tensor_shape().y(); - const int N = d->info()->tensor_shape().x(); - const int K = a->info()->tensor_shape().x(); - - size_t workbench_size = 0; - -#if defined(__arm__) - workbench_size = GemmInterleaved(&ci, M, N, K, false, false).get_working_size(); -#elif defined(__aarch64__) - if(a->info()->data_type() == DataType::F32) - { - workbench_size = GemmInterleaved(&ci, M, N, 
K, false, false).get_working_size(); - } - else if(a->info()->data_type() == DataType::F16) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - workbench_size = GemmInterleaved(&ci, M, N, K, false, false).get_working_size(); -#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16."); -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - } -#endif /* defined(__arm__) || defined(__aarch64__) */ - - constexpr size_t alignment = 4096; - ARM_COMPUTE_ERROR_ON_MSG(workbench_size == 0, "size cannot be 0"); - _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::S8)); - _memory_group.manage(&_workspace); - - // Configure matrix multiplication kernel - _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f, false /* is_transposed_0 */, false /* is_transposed_1 */); - _workspace.allocator()->allocate(); - } - else -#endif /* defined(__arm__) || defined(__aarch64__) */ + if(!run_optimised) { TensorShape shape_tmp_a = a->info()->tensor_shape(); TensorShape shape_tmp_b = b->info()->tensor_shape(); @@ -243,9 +139,9 @@ void NEGEMM::run() { _memory_group.acquire(); - if(_mm_optimised_kernel != nullptr) + if(_asm_glue._optimised_kernel != nullptr) { - NEScheduler::get().schedule(_mm_optimised_kernel.get(), Window::DimY); + _asm_glue.run(); _memory_group.release(); } else diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp index a85078cf71..3b8b4243e5 100644 --- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp @@ -23,9 +23,6 @@ */ #include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h" -#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h" -#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h" -#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Size2D.h" #include "arm_compute/core/Utils.h" @@ -34,13 +31,6 @@ #include "arm_compute/runtime/NEON/NEScheduler.h" #include "support/ToolchainSupport.h" -namespace arm_compute -{ -#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp" -#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp" -#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp" -} // namespace arm_compute - #include #include @@ -226,8 +216,8 @@ Status validate_and_initialize_values(const ITensorInfo *input, const ITensorInf } // namespace NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr &memory_manager) - : _memory_group(memory_manager), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _mm_optimised_kernel(nullptr), _mm_gemmlowp(memory_manager), - _gemmlowp_output_stage(), _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(), _tmp_output(), _workspace(), _append_bias(false), + : _asm_glue(), _memory_group(memory_manager), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), + _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(), _tmp_output(), _workspace(), _append_bias(false), _is_fully_connected_convolution(false), 
_are_weights_reshaped(false), _is_quantized(false), _is_interleaved(false) { } @@ -256,25 +246,6 @@ void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *w } } -void NEGEMMConvolutionLayer::configure_asm_mm(const struct CPUInfo &ci, int M, int N, int K) -{ - ARM_COMPUTE_UNUSED(ci); - ARM_COMPUTE_UNUSED(M); - ARM_COMPUTE_UNUSED(N); - ARM_COMPUTE_UNUSED(K); -#if defined(__arm__) || defined(__aarch64__) -#if defined(__arm__) - GemmInterleaved gemm(&ci, M, N, K, false, false); -#elif defined(__aarch64__) - GemmInterleaved gemm(&ci, M, N, K, false, false); -#endif /* defined(__arm__) || defined(__aarch64__) */ - - constexpr size_t alignment = 4096; - _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8)); - _memory_group.manage(&_workspace); -#endif /* defined(__arm__) || defined(__aarch64__) */ -} - void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info) { // Perform validate step @@ -298,20 +269,11 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig const unsigned int fixed_point_position = input->info()->fixed_point_position(); const ITensor *biases_to_use = (_append_bias) ? biases : nullptr; -#if defined(__arm__) - if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && dt == DataType::F32) - { - _mm_optimised_kernel = support::cpp14::make_unique(); - } -#elif defined(__aarch64__) - if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && dt == DataType::F32) - { - _mm_optimised_kernel = support::cpp14::make_unique(); - } -#endif /* defined(__arm__) || defined(__aarch64__) */ + bool run_optimised = + (NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && dt == DataType::F32) || (NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && dt == DataType::F32); // Reshape weights if needed - if(_mm_optimised_kernel != nullptr) + if(run_optimised) { if(_are_weights_reshaped) { @@ -378,7 +340,7 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig _memory_group.manage(&_input_im2col_reshaped); // Create tensor (interleave) to prepare input tensor for GEMM - if(!_is_fully_connected_convolution && _mm_optimised_kernel == nullptr) + if(!_is_fully_connected_convolution && !run_optimised) { TensorShape shape_interleaved(shape_im2col); shape_interleaved.set(0, shape_interleaved.x() * 4); @@ -403,29 +365,10 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _append_bias); // Configure matrix multiply - if(_mm_optimised_kernel != nullptr) + if(run_optimised) { - struct CPUInfo ci = NEScheduler::get().cpu_info(); - - const int M = _gemm_output.info()->tensor_shape().y(); - const int N = _gemm_output.info()->tensor_shape().x(); - const int K = _input_im2col_reshaped.info()->tensor_shape().x(); - -#if defined(__aarch64__) - if((N <= 128) && (K <= 128)) - { - _mm_optimised_kernel = support::cpp14::make_unique(); - } - else -#endif /* defined(__aarch64__) */ - { - configure_asm_mm(ci, M, N, K); - } - - // Configure matrix multiplication kernel - _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace); - - _workspace.allocator()->allocate(); + run_optimised = setup_assembly_kernel(&_input_im2col_reshaped, 
weights, nullptr, &_gemm_output, 1.f, 0.f, _workspace, _memory_group, _asm_glue);
+        ARM_COMPUTE_ERROR_ON_MSG(run_optimised == false, "setup_assembly_kernel failed.");
     }
     else
     {
@@ -615,9 +558,9 @@ void NEGEMMConvolutionLayer::run()
     NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
 
     // Runs matrix multiply on reshaped matrices
-    if(_mm_optimised_kernel != nullptr)
+    if(_asm_glue._optimised_kernel != nullptr)
     {
-        NEScheduler::get().schedule(_mm_optimised_kernel.get(), Window::DimY);
+        _asm_glue.run();
     }
     else
     {
diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
index 9b36e81afd..e5e97910d8 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2017 ARM Limited.
+/* Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,13 +25,9 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
@@ -39,20 +35,11 @@
 #include "arm_compute/runtime/TensorAllocator.h"
 #include "support/ToolchainSupport.h"
 
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp"
-} // namespace arm_compute
-
 using namespace arm_compute;
 
 NEGEMMLowpAssemblyMatrixMultiplyCore::NEGEMMLowpAssemblyMatrixMultiplyCore(std::shared_ptr memory_manager)
-    : _memory_group(std::move(memory_manager)), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b(), _workspace()
+    : _memory_group(std::move(memory_manager)), _asm_glue_unsigned(), _asm_glue_signed(), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b(),
+      _workspace()
 {
 }
 
@@ -65,89 +52,28 @@ void NEGEMMLowpAssemblyMatrixMultiplyCore::configure(const ITensor *a, const ITe
     ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A");
     ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
 
+    bool run_optimised = false;
 #ifdef __aarch64__
-    const int M = output->info()->tensor_shape().y();
-    const int N = output->info()->tensor_shape().x();
-    const int K = a->info()->tensor_shape().x();
-    constexpr size_t workspace_alignment = 4096;
-    const struct CPUInfo ci = NEScheduler::get().cpu_info();
-#endif /* __aarch64__ */
-
-#ifdef ARM_COMPUTE_AARCH64_V8_2
-    if(ci.CPU == CPUTarget::A75_DOT || ci.CPU == CPUTarget::A55_DOT)
-    {
-        // Configure matrix multiply kernel
-        GemmInterleaved gemm(&ci, M, N, K, false, false);
-        _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
-        _memory_group.manage(&_workspace);
-
-        // Configure matrix multiplication kernel
-        auto k = arm_compute::support::cpp14::make_unique();
-        k->configure(a, b, output, &_workspace, 1.f, 1.f);
-        _mm_kernel = std::move(k);
-        _workspace.allocator()->allocate();
-    }
-    else
-#elif defined(ARM_COMPUTE_AARCH64_V8A)
-    if(ci.CPU == CPUTarget::A53)
+    switch(a->info()->data_type())
     {
-        switch(a->info()->data_type())
+        case DataType::S8:
         {
-            case DataType::S8:
-            {
-                // Configure matrix multiply kernel
-                GemmInterleaved gemm(&ci, M, N, K, false, false);
-                _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
-            }
+            run_optimised = setup_assembly_kernel(a, b, nullptr, output, 1.f, 1.f, _workspace, _memory_group, _asm_glue_signed);
             break;
-            case DataType::U8:
-            {
-                // Configure matrix multiply kernel
-                GemmInterleaved gemm(&ci, M, N, K, false, false);
-                _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
-            }
-            break;
-            default:
-                ARM_COMPUTE_ERROR("Datatype not supported");
-        }
-
-        _memory_group.manage(&_workspace);
-        // Configure matrix multiplication kernel
-        auto k = arm_compute::support::cpp14::make_unique();
-        k->configure(a, b, output, &_workspace, 1.f, 1.f);
-        _mm_kernel = std::move(k);
-        _workspace.allocator()->allocate();
-    }
-    else if(1) // Generic v8a kernel
-    {
-        switch(a->info()->data_type())
+        }
+        case DataType::U8:
         {
-            case DataType::S8:
-            {
-                // Configure matrix multiply kernel
-                GemmInterleaved gemm(&ci, M, N, K, false, false);
-                _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
-            }
+            run_optimised = setup_assembly_kernel(a, b, nullptr, output, 1.f, 1.f, _workspace, _memory_group, _asm_glue_unsigned);
             break;
-            case DataType::U8:
-            {
-                // Configure matrix multiply kernel
-                GemmInterleaved gemm(&ci, M, N, K, false, false);
-                _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
-            }
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Datatype not supported");
             break;
-            default:
-                ARM_COMPUTE_ERROR("Datatype not supported");
         }
-        _memory_group.manage(&_workspace);
-        // Configure matrix multiplication kernel
-        auto k = arm_compute::support::cpp14::make_unique();
-        k->configure(a, b, output, &_workspace, 1.f, 1.f);
-        _mm_kernel = std::move(k);
-        _workspace.allocator()->allocate();
     }
-    else
-#endif /* ARM_COMPUTE_AARCH64_V8_2 */
+#endif /* __aarch64__ */
+    if(!run_optimised)
     {
         // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
         TensorShape shape_tmp_a = a->info()->tensor_shape();
@@ -206,7 +132,18 @@ void NEGEMMLowpAssemblyMatrixMultiplyCore::run()
         NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
     }
 
-    NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+    if(_asm_glue_unsigned._optimised_kernel != nullptr)
+    {
+        _asm_glue_unsigned.run();
+    }
+    else if(_asm_glue_signed._optimised_kernel != nullptr)
+    {
+        _asm_glue_signed.run();
+    }
+    else
+    {
+        NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+    }
 
     _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index ad47593f20..dc4ed5cefb 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -26,11 +26,9 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
@@ -39,18 +37,13 @@
 #include "arm_compute/runtime/TensorAllocator.h"
 #include "support/ToolchainSupport.h"
 
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp"
-} // namespace arm_compute
-
 using namespace arm_compute;
 using namespace arm_compute::misc::shape_calculator;
 
 NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager)
-    : _memory_group(std::move(memory_manager)), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(),
-      _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false), _dot_product_path(false)
+    : _memory_group(std::move(memory_manager)), _asm_glue_unsigned(), _asm_glue_signed(), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(),
+      _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false),
+      _dot_product_path(false)
 {
 }
 
@@ -64,33 +57,27 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
     _b_offset                         = b->info()->quantization_info().offset;
     _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
 
-#ifdef ARM_COMPUTE_AARCH64_V8_2
-    // Check for DOT product instruction
-    const struct CPUInfo ci = NEScheduler::get().cpu_info();
-    const int cpu_has_dotprod = static_cast(ci.CPU) & static_cast(CPUTarget::DOT);
-
-    if(cpu_has_dotprod != 0)
+#ifdef __aarch64__
+    switch(a->info()->data_type())
     {
-        _dot_product_path = true;
-
-        // Configure matrix multiply kernel
-        struct CPUInfo ci = NEScheduler::get().cpu_info();
-        const int M = output->info()->tensor_shape().y();
-        const int N = output->info()->tensor_shape().x();
-        const int K = a->info()->tensor_shape().x();
-
-        const size_t workbench_size = GemmInterleaved(&ci, M, N, K, false, false).get_working_size();
-        constexpr size_t alignment = 4096;
-        _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
-        _memory_group.manage(&_workspace);
-
-        // Configure matrix multiplication kernel
-        auto k = arm_compute::support::cpp14::make_unique();
-        k->configure(a, b, output, &_workspace, 1.f, 1.f, false, false);
-        _mm_kernel = std::move(k);
+        case DataType::S8:
+        {
+            _dot_product_path = setup_assembly_kernel(a, b, nullptr, output, 1.f, 1.f, _workspace, _memory_group, _asm_glue_signed);
+            break;
+        }
+        case DataType::U8:
+        {
+            _dot_product_path = setup_assembly_kernel(a, b, nullptr, output, 1.f, 1.f, _workspace, _memory_group, _asm_glue_unsigned);
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Datatype not supported");
+            break;
+        }
     }
-    else
-#endif /* ARM_COMPUTE_AARCH64_V8_2 */
+#endif /* __aarch64__ */
+    if(!_dot_product_path)
     {
         if(_run_vector_matrix_multiplication)
         {
@@ -203,42 +190,28 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
     int32_t b_offset = b->quantization_info().offset;
     bool run_vector_matrix_multiplication = a->dimension(1) < 2;
 
-#ifdef ARM_COMPUTE_AARCH64_V8_2
-    // Check for DOT product instruction
-    const struct CPUInfo ci = NEScheduler::get().cpu_info();
-    const int cpu_has_dotprod = static_cast(ci.CPU) & static_cast(CPUTarget::DOT);
-
-    if(cpu_has_dotprod != 0)
+    if(!run_vector_matrix_multiplication)
     {
-        // Validate matrix multiply kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpAArch64V8P4Kernel::validate(a, b, output));
+        // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+        TensorShape shape_tmp_a = a->tensor_shape();
+        shape_tmp_a.set(0, a->dimension(0) * 4);
+        shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
+
+        // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+        TensorShape shape_tmp_b = b->tensor_shape();
+        shape_tmp_b.set(0, b->dimension(1) * 16);
+        shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
+
+        TensorInfo info_a(shape_tmp_a, 1, a->data_type());
+        TensorInfo info_b(shape_tmp_b, 1, b->data_type());
+
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
     }
     else
-#endif /* ARM_COMPUTE_AARCH64_V8_2 */
     {
-        if(!run_vector_matrix_multiplication)
-        {
-            // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
-            TensorShape shape_tmp_a = a->tensor_shape();
-            shape_tmp_a.set(0, a->dimension(0) * 4);
-            shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
-
-            // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
-            TensorShape shape_tmp_b = b->tensor_shape();
-            shape_tmp_b.set(0, b->dimension(1) * 16);
-            shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
-
-            TensorInfo info_a(shape_tmp_a, 1, a->data_type());
-            TensorInfo info_b(shape_tmp_b, 1, b->data_type());
-
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
-        }
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
     }
 
     TensorInfo info_vector_sum_col, info_vector_sum_row;
@@ -288,7 +261,18 @@ void NEGEMMLowpMatrixMultiplyCore::run()
         }
     }
 
-    NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+    if(_asm_glue_unsigned._optimised_kernel != nullptr)
+    {
+        _asm_glue_unsigned.run();
+    }
+    else if(_asm_glue_signed._optimised_kernel != nullptr)
+    {
+        _asm_glue_signed.run();
+    }
+    else
+    {
+        NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+    }
 
     // Run matrix A reduction kernel only if _b_offset is not equal to 0
     if(_b_offset != 0)
diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp
index a901b442ab..471ec3faaf 100644
--- a/tests/validation/NEON/GEMMLowp.cpp
+++ b/tests/validation/NEON/GEMMLowp.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,7 +22,6 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
-- 
cgit v1.2.1
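
The run()-time changes above follow one pattern in every affected function: prefer the assembly glue object that setup_assembly_kernel() configured, and fall back to scheduling the generic NEON kernel only when no optimised kernel was set up. The self-contained sketch below models that precedence with hypothetical stand-in types (AsmGlue, Kernel and Scheduler are illustrative only, not Compute Library classes); it is a minimal illustration of the dispatch pattern, not the library implementation.

// Minimal sketch (assumed names, C++14) of the "optimised assembly path with
// NEON fallback" dispatch introduced by this patch.
#include <iostream>
#include <memory>

struct Kernel
{
    void run() const { std::cout << "generic NEON kernel\n"; }
};

struct AsmGlue
{
    // Mirrors _optimised_kernel: non-null only when configuration succeeded.
    std::unique_ptr<Kernel> _optimised_kernel;
    void run() const { std::cout << "optimised arm_gemm kernel\n"; }
};

struct Scheduler
{
    static void schedule(const Kernel *k) { k->run(); }
};

void run_matrix_multiply(const AsmGlue &glue_unsigned, const AsmGlue &glue_signed, const Kernel &fallback)
{
    // Same precedence as the patched run() methods: unsigned glue first,
    // then signed glue, then the scheduled fallback kernel.
    if(glue_unsigned._optimised_kernel != nullptr)
    {
        glue_unsigned.run();
    }
    else if(glue_signed._optimised_kernel != nullptr)
    {
        glue_signed.run();
    }
    else
    {
        Scheduler::schedule(&fallback);
    }
}

int main()
{
    AsmGlue unsigned_glue{};
    AsmGlue signed_glue{};
    signed_glue._optimised_kernel = std::make_unique<Kernel>(); // pretend configure() succeeded for S8

    Kernel fallback{};
    run_matrix_multiply(unsigned_glue, signed_glue, fallback); // prints "optimised arm_gemm kernel"
    return 0;
}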