diff options
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp')
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp | 158 |
1 files changed, 84 insertions, 74 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp index fd6a253c6a..6e07064a0c 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp @@ -31,105 +31,115 @@ template <> template <typename T> inline void TransformImpl<6, 1, true, 4, 4>::Transform( - T *out, const T *const in, const int stride, - const int x0, const int xmax, const int k0, const int kmax) -{ - // Redirect to a 12 x uint16_t specialisation - TransformImpl<12, 1, true, 2, 2>::Transform( - reinterpret_cast<uint16_t *>(out), - reinterpret_cast<const uint16_t *const>(in), - stride * 2, x0 * 2, xmax * 2, k0, kmax); + T* out, const T* const in, const int stride, + const int x0, const int xmax, const int k0, const int kmax +) { + // Redirect to a 12 x uint16_t specialisation + TransformImpl<12, 1, true, 2, 2>::Transform( + reinterpret_cast<uint16_t *>(out), + reinterpret_cast<const uint16_t * const>(in), + stride*2, x0*2, xmax*2, k0, kmax + ); } // Generic 12x16-bit sized specialisation template <> template <typename T> inline void TransformImpl<12, 1, true, 2, 2>::Transform( - T *out, const T *const in, const int stride, - const int x0, const int xmax, const int k0, const int kmax) -{ - // Redirect to a uint16_t specialisation - Transform( - reinterpret_cast<uint16_t *>(out), - reinterpret_cast<const uint16_t *const>(in), - stride, x0, xmax, k0, kmax); + T* out, const T* const in, const int stride, + const int x0, const int xmax, const int k0, const int kmax +) { + // Redirect to a uint16_t specialisation + Transform( + reinterpret_cast<uint16_t *>(out), + reinterpret_cast<const uint16_t * const>(in), + stride, x0, xmax, k0, kmax + ); } // Specialised 12 x uint16_t version template <> -inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) -{ - __asm volatile( - "LDR q0, [%[in0]]\n" - "STR q0, [%[out]]\n" - "LDR d1, [%[in0], #0x10]\n" - "STR d1, [%[out], #0x10]\n" - "ADD %x[in0], %x[in0], #0x18\n" ASM_PREFETCH("[%[in0], #192]") - : [in0] "+r"(in0), - [out] "+r"(out) - : - : "v0", "v1", "memory"); +inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) { + __asm volatile ( + "LDR q0, [%[in0]]\n" + "STR q0, [%[out]]\n" + "LDR d1, [%[in0], #0x10]\n" + "STR d1, [%[out], #0x10]\n" + "ADD %x[in0], %x[in0], #0x18\n" + ASM_PREFETCH("[%[in0], #192]") + : [in0] "+r" (in0), + [out] "+r" (out) + : + : "v0", "v1", "memory" + ); } template <> -inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out) -{ - __asm volatile( - "LDR q0, [%[in0]]\n" - "LDR d1, [%[in0], #0x10]\n" - "ADD %x[in0], %x[in0], #0x18\n" ASM_PREFETCH("[%[in0], #192]") +inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out) { + __asm volatile ( + "LDR q0, [%[in0]]\n" + "LDR d1, [%[in0], #0x10]\n" + "ADD %x[in0], %x[in0], #0x18\n" + ASM_PREFETCH("[%[in0], #192]") - "LDR x21, [%[in1]]\n" - "LDR q2, [%[in1], #0x08]\n" - "INS v1.d[1], x21\n" - "ADD %x[in1], %x[in1], #0x18\n" - "STP q0, q1, [%[out]]\n" - "STR q2, [%x[out], #0x20]\n" ASM_PREFETCH("[%[in1], #192]") - : [in0] "+r"(in0), - [in1] "+r"(in1), - [out] "+r"(out) - : - : "x21", "v0", "v1", "v2", "memory"); + "LDR x21, [%[in1]]\n" + "LDR q2, [%[in1], #0x08]\n" + "INS v1.d[1], x21\n" + "ADD %x[in1], %x[in1], #0x18\n" + "STP q0, q1, [%[out]]\n" + "STR q2, [%x[out], #0x20]\n" + ASM_PREFETCH("[%[in1], #192]") + : [in0] "+r" (in0), + [in1] "+r" (in1), + [out] "+r" (out) + : + : "x21", "v0", "v1", "v2", "memory" + ); } template <> -inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) -{ - __asm __volatile( - "LDR q0, [%x[in0]], #0x10\n" - "STR q0, [%x[out]]\n" - "LDR d1, [%x[in0]], #0x08\n" ASM_PREFETCH("[%[in0], #192]") - "STR d1, [%x[out], #0x10]\n" +inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) { + __asm __volatile ( + "LDR q0, [%x[in0]], #0x10\n" + "STR q0, [%x[out]]\n" + "LDR d1, [%x[in0]], #0x08\n" + ASM_PREFETCH("[%[in0], #192]") + "STR d1, [%x[out], #0x10]\n" - "LDR q0, [%x[in1]], #0x10\n" - "STR q0, [%x[out], #0x18]\n" - "LDR d1, [%x[in1]], #0x08\n" ASM_PREFETCH("[%[in1], #192]") - "STR d1, [%x[out], #0x28]\n" + "LDR q0, [%x[in1]], #0x10\n" + "STR q0, [%x[out], #0x18]\n" + "LDR d1, [%x[in1]], #0x08\n" + ASM_PREFETCH("[%[in1], #192]") + "STR d1, [%x[out], #0x28]\n" - "LDR q0, [%x[in2]], #0x10\n" - "STR q0, [%x[out], #0x30]\n" - "LDR d1, [%x[in2]], #0x08\n" ASM_PREFETCH("[%[in2], #192]") - "STR d1, [%x[out], #0x40]\n" + "LDR q0, [%x[in2]], #0x10\n" + "STR q0, [%x[out], #0x30]\n" + "LDR d1, [%x[in2]], #0x08\n" + ASM_PREFETCH("[%[in2], #192]") + "STR d1, [%x[out], #0x40]\n" - "LDR q0, [%x[in3]], #0x10\n" - "STR q0, [%x[out], #0x48]\n" - "LDR d1, [%x[in3]], #0x08\n" ASM_PREFETCH("[%[in3], #192]") "STR d1, [%x[out], #0x58]\n" - : [in0] "+r"(in0), - [in1] "+r"(in1), - [in2] "+r"(in2), - [in3] "+r"(in3), - [out] "+r"(out) - : - : "v0", "v1", "memory"); + "LDR q0, [%x[in3]], #0x10\n" + "STR q0, [%x[out], #0x48]\n" + "LDR d1, [%x[in3]], #0x08\n" + ASM_PREFETCH("[%[in3], #192]") + "STR d1, [%x[out], #0x58]\n" + : [in0] "+r" (in0), + [in1] "+r" (in1), + [in2] "+r" (in2), + [in3] "+r" (in3), + [out] "+r" (out) + : + : "v0", "v1", "memory" + ); } template <> template <> inline void TransformImpl<12, 1, true, 2, 2>::Transform( - uint16_t *out, const uint16_t *const in, const int stride, - const int x0, const int xmax, const int k0, const int kmax) -{ - TransposeInterleaveCommon<12, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax); + uint16_t* out, const uint16_t* const in, const int stride, + const int x0, const int xmax, const int k0, const int kmax +) { + TransposeInterleaveCommon<12, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax); } #endif // __aarch64__ |