author     Anthony Barbier <anthony.barbier@arm.com>    2018-07-03 16:22:02 +0100
committer  Anthony Barbier <anthony.barbier@arm.com>    2018-11-02 16:54:10 +0000
commit     5f707736413aeac77818c42838296966f8dc6761 (patch)
tree       b829ed3243ea5f3085f288836132416c78bc2e72 /src/core/NEON/kernels/arm_gemm/transforms
parent     7485d5a62685cb745ab50e970adb722cb71557ac (diff)
download   ComputeLibrary-5f707736413aeac77818c42838296966f8dc6761.tar.gz
COMPMID-1369: Revert accidental formatting of RSH's repo
Pulled latest fixes from David's repo:
commit f43ebe932c84083332b0b1a0348241b69dda63a7
Author: David Mansell <David.Mansell@arm.com>
Date: Tue Jul 3 18:09:01 2018 +0100
Whitespace tidying, fixed comment in gemv_batched imported from ACL.
Change-Id: Ie37a623f44e90d88072236cb853ac55ac82d5f51
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/138530
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-by: David Mansell <david.mansell@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/transforms')
10 files changed, 650 insertions, 639 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
index 501d6bf075..e485ca7009 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
@@ -29,17 +29,15 @@
 #include "../asmlib.hpp"
 
-template <>
-template <typename T>
-inline void TransformImpl<6, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
-{
-    uint32_t *outptr = reinterpret_cast<uint32_t *>(out);
-    const uint32_t *inptr = reinterpret_cast<const uint32_t *>(in);
+template<>
+template<typename T>
+inline void TransformImpl<6, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
+    uint32_t *outptr = reinterpret_cast<uint32_t *>(out);
+    const uint32_t *inptr = reinterpret_cast<const uint32_t *>(in);
 
     uint32_t zerobuff[8];
 
-    for(int y = y0; y < ymax; y += 6)
-    {
+    for (int y=y0; y<ymax; y+=6) {
         const uint32_t *inptr0 = inptr + y * ldin + k0;
         const uint32_t *inptr1 = inptr0 + ldin;
         const uint32_t *inptr2 = inptr1 + ldin;
@@ -54,14 +52,11 @@ inline void TransformImpl<6, 1, false, 4, 4>::Transform(T *out, const T *in, int
         //prefetch_2x(inptr4);
         //prefetch_2x(inptr5);
 
-        int x = (kmax - k0);
-        for(; x > 7; x -= 8)
-        {
+        int x=(kmax-k0);
+        for (;x>7;x-=8) {
             /* Cope with ragged cases by copying from a buffer of zeroes instead */
-            if((y + 5) >= ymax)
-            {
-                switch((y + 5) - ymax)
-                {
+            if ((y + 5) >= ymax) {
+                switch ((y + 5) - ymax) {
                     /* Everything falls through in here */
                     case 4:
                         inptr1 = zerobuff;
@@ -80,67 +75,73 @@ inline void TransformImpl<6, 1, false, 4, 4>::Transform(T *out, const T *in, int
                 }
             }
 
-            __asm __volatile(
+
+            __asm __volatile (
                 // Load up 8 elements (2 vectors) from each of 8 sources.
-                "VLD1.32 {d0-d3}, [%[inptr0]]!\n" // q0=A0A1A2A3
-                "VLD1.32 {d4-d7}, [%[inptr1]]!\n" // q2=B0B1B2B3
-                "VLD1.32 {d8-d11}, [%[inptr2]]!\n" // q4=C0C1C2C3
-                "VZIP.32 q0, q4\n" // q0=A0C0A1C1, q4 = A2C2A3C3
-                "VLD1.32 {d12-d15}, [%[inptr3]]!\n" // q6=D0D1D2D3
-                "VZIP.32 q2, q6\n" // q2=B0D0B1D1, q6 = B2D2B3D3
-                "VLD1.32 {d16-d19}, [%[inptr4]]!\n"
-                "VLD1.32 {d20-d23}, [%[inptr5]]!\n"
-                "VZIP.32 q8, q10\n" // q8=E0F0E1F1, q10 = E2F2E3F3
+                "VLD1.32 {d0-d3}, [%[inptr0]]!\n"    // q0=A0A1A2A3
+                "VLD1.32 {d4-d7}, [%[inptr1]]!\n"    // q2=B0B1B2B3
+                "VLD1.32 {d8-d11}, [%[inptr2]]!\n"   // q4=C0C1C2C3
+                "VZIP.32 q0, q4\n"                   // q0=A0C0A1C1, q4 = A2C2A3C3
+                "VLD1.32 {d12-d15}, [%[inptr3]]!\n"  // q6=D0D1D2D3
+                "VZIP.32 q2, q6\n"                   // q2=B0D0B1D1, q6 = B2D2B3D3
+                "VLD1.32 {d16-d19}, [%[inptr4]]!\n"
+                "VLD1.32 {d20-d23}, [%[inptr5]]!\n"
+                "VZIP.32 q8, q10\n"                  // q8=E0F0E1F1, q10 = E2F2E3F3
                 ASM_PREFETCH("[%[inptr0], #128]")
-                "VZIP.32 q0, q2\n" // q0 = A0B0C0D0, q2 = A1B1C1D1
+                "VZIP.32 q0, q2\n"                   // q0 = A0B0C0D0, q2 = A1B1C1D1
 
                 // Store first elements
-                "VST1.32 {d0-d1}, [%[outptr]]!\n"
-                "VST1.32 {d16}, [%[outptr]]!\n"
+                "VST1.32 {d0-d1}, [%[outptr]]!\n"
+                "VST1.32 {d16}, [%[outptr]]!\n"
 
-                "VZIP.32 q4, q6\n" // q4 = A2B2C2D2, q6 = A3B3C3D3
+                "VZIP.32 q4, q6\n"                   // q4 = A2B2C2D2, q6 = A3B3C3D3
 
                 // Store second elements
-                "VST1.32 {d4-d5}, [%[outptr]]!\n"
-                "VZIP.32 q1, q5\n" ASM_PREFETCH("[%[inptr1], #128]")
-                "VST1.32 {d17}, [%[outptr]]!\n"
-                "VZIP.32 q3, q7\n"
+                "VST1.32 {d4-d5}, [%[outptr]]!\n"
+                "VZIP.32 q1, q5\n"
+                ASM_PREFETCH("[%[inptr1], #128]")
+                "VST1.32 {d17}, [%[outptr]]!\n"
+                "VZIP.32 q3, q7\n"
 
                 // Store third elements
-                "VZIP.32 q9, q11\n"
-                "VST1.32 {d8-d9}, [%[outptr]]!\n"
-                "VZIP.32 q1, q3\n" ASM_PREFETCH("[%[inptr2], #128]")
-                "VST1.32 {d20}, [%[outptr]]!\n"
+                "VZIP.32 q9, q11\n"
+                "VST1.32 {d8-d9}, [%[outptr]]!\n"
+                "VZIP.32 q1, q3\n"
+                ASM_PREFETCH("[%[inptr2], #128]")
+                "VST1.32 {d20}, [%[outptr]]!\n"
 
                 // Store fourth elements
-                "VZIP.32 q5, q7\n"
-                "VST1.32 {d12-d13}, [%[outptr]]!\n" ASM_PREFETCH("[%[inptr3], #128]")
-                "VST1.32 {d21}, [%[outptr]]!\n"
+                "VZIP.32 q5, q7\n"
+                "VST1.32 {d12-d13}, [%[outptr]]!\n"
+                ASM_PREFETCH("[%[inptr3], #128]")
+                "VST1.32 {d21}, [%[outptr]]!\n"
 
                 // Fifth
-                "VST1.32 {d2-d3}, [%[outptr]]!\n" ASM_PREFETCH("[%[inptr4], #128]")
-                "VST1.32 {d18}, [%[outptr]]!\n"
+                "VST1.32 {d2-d3}, [%[outptr]]!\n"
+                ASM_PREFETCH("[%[inptr4], #128]")
+                "VST1.32 {d18}, [%[outptr]]!\n"
 
                 // Sixth
-                "VST1.32 {d6-d7}, [%[outptr]]!\n" ASM_PREFETCH("[%[inptr5], #128]")
-                "VST1.32 {d19}, [%[outptr]]!\n"
+                "VST1.32 {d6-d7}, [%[outptr]]!\n"
+                ASM_PREFETCH("[%[inptr5], #128]")
+                "VST1.32 {d19}, [%[outptr]]!\n"
 
                 // Seventh
-                "VST1.32 {d10-d11}, [%[outptr]]!\n"
-                "VST1.32 {d22}, [%[outptr]]!\n"
+                "VST1.32 {d10-d11}, [%[outptr]]!\n"
+                "VST1.32 {d22}, [%[outptr]]!\n"
 
                 // Eighth
-                "VST1.32 {d14-d15}, [%[outptr]]!\n"
-                "VST1.32 {d23}, [%[outptr]]!\n"
+                "VST1.32 {d14-d15}, [%[outptr]]!\n"
+                "VST1.32 {d23}, [%[outptr]]!\n"
 
-                : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
-                [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [outptr] "+r"(outptr)
+                : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
+                  [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [outptr] "+r" (outptr)
                 :
-                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12");
+                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12"
+            );
         }
 
-        for(; x > 0; x--)
-        {
+        for (;x>0;x--) {
             *outptr++ = *inptr0++;
             *outptr++ = *inptr1++;
             *outptr++ = *inptr2++;
@@ -151,4 +152,4 @@ inline void TransformImpl<6, 1, false, 4, 4>::Transform(T *out, const T *in, int
     }
 }
 
-#endif // __arm__
+#endif // __arm__
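
For orientation while reading the revert: the changes above are whitespace-only, and the kernel's behaviour is a plain 6-way row interleave. A scalar sketch of one six-row block follows; interleave6_ref is an illustrative name, not part of the library, and the zero fill mirrors the kernel's zerobuff substitution for rows past ymax.

    #include <cstdint>

    // Emit one element of each of six rows in turn, for every k in [k0, kmax).
    inline void interleave6_ref(uint32_t *out, const uint32_t *in, int ldin,
                                int y0, int ymax, int k0, int kmax) {
        for (int k = k0; k < kmax; k++) {
            for (int r = 0; r < 6; r++) {
                const int y = y0 + r;
                *out++ = (y < ymax) ? in[y * ldin + k] : 0;  // ragged rows read as zero
            }
        }
    }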
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
index ea32c9665c..a7e17fa074 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
@@ -31,86 +31,97 @@
 template <>
 template <typename T>
 inline void TransformImpl<8, 1, true, 4, 4>::Transform(
-    T *out, const T *const in, const int stride,
-    const int x0, const int xmax, const int k0, const int kmax)
-{
-    // Redirect to a 16x uint16_t specialisation
-    TransformImpl<16, 1, true, 2, 2>::Transform(
-        reinterpret_cast<uint16_t *>(out),
-        reinterpret_cast<const uint16_t *const>(in),
-        stride * 2, x0 * 2, xmax * 2, k0, kmax);
+    T* out, const T* const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax
+) {
+    // Redirect to a 16x uint16_t specialisation
+    TransformImpl<16, 1, true, 2, 2>::Transform(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t * const>(in),
+        stride*2, x0*2, xmax*2, k0, kmax
+    );
 }
 
 // Generic 12x16-bit sized specialisation
 template <>
 template <typename T>
 inline void TransformImpl<16, 1, true, 2, 2>::Transform(
-    T *out, const T *const in, const int stride,
-    const int x0, const int xmax, const int k0, const int kmax)
-{
-    // Redirect to a uint16_t specialisation
-    Transform(
-        reinterpret_cast<uint16_t *>(out),
-        reinterpret_cast<const uint16_t *const>(in),
-        stride, x0, xmax, k0, kmax);
+    T* out, const T* const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax
+) {
+    // Redirect to a uint16_t specialisation
+    Transform(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t * const>(in),
+        stride, x0, xmax, k0, kmax
+    );
 }
 
 // Specialised 16 x uint16_t version
 template <>
-inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out)
-{
-    __asm volatile(
-        "VLD1.32 {d0-d3}, [%[in0]]!\n"
-        "VST1.32 {d0-d3}, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
-        : [in0] "+r"(in0),
-        [out] "+r"(out)
-        :
-        : "q0", "q1", "memory");
+inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) {
+    __asm volatile (
+        "VLD1.32 {d0-d3}, [%[in0]]!\n"
+        "VST1.32 {d0-d3}, [%[out]]\n"
+        ASM_PREFETCH("[%[in0], #192]")
+        : [in0] "+r" (in0),
+          [out] "+r" (out)
+        :
+        : "q0", "q1", "memory"
+    );
 }
 
 template <>
-inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out)
-{
-    __asm volatile(
-        "VLD1.32 {d0-d3}, [%[in0]]!\n"
-        "VST1.32 {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in0], #192]")
-        "VLD1.32 {d0-d3}, [%[in1]]!\n"
-        "VST1.32 {d0-d3}, [%[out]]\n" ASM_PREFETCH("[%[in1], #192]") "SUB %[out], %[out], #32\n"
-        : [in0] "+r"(in0),
-        [in1] "+r"(in1),
-        [out] "+r"(out)
-        :
-        : "q0", "q1", "memory");
+inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out) {
+    __asm volatile (
+        "VLD1.32 {d0-d3}, [%[in0]]!\n"
+        "VST1.32 {d0-d3}, [%[out]]!\n"
+        ASM_PREFETCH("[%[in0], #192]")
+        "VLD1.32 {d0-d3}, [%[in1]]!\n"
+        "VST1.32 {d0-d3}, [%[out]]\n"
+        ASM_PREFETCH("[%[in1], #192]")
+        "SUB %[out], %[out], #32\n"
+        : [in0] "+r" (in0),
+          [in1] "+r" (in1),
+          [out] "+r" (out)
+        :
+        : "q0", "q1", "memory"
+    );
 }
 
 template <>
-inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out)
-{
-    __asm __volatile(
-        "VLD1.32 {d0-d3}, [%[in0]]!\n"
-        "VST1.32 {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in0], #192]")
-        "VLD1.32 {d0-d3}, [%[in1]]!\n"
-        "VST1.32 {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in1], #192]")
-        "VLD1.32 {d0-d3}, [%[in2]]!\n"
-        "VST1.32 {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in2], #192]")
-        "VLD1.32 {d0-d3}, [%[in3]]!\n"
-        "VST1.32 {d0-d3}, [%[out]]\n" ASM_PREFETCH("[%[in3], #192]") "SUB %[out], %[out], #96\n"
-        : [in0] "+r"(in0),
-        [in1] "+r"(in1),
-        [in2] "+r"(in2),
-        [in3] "+r"(in3),
-        [out] "+r"(out)
-        :
-        : "q0", "q1", "memory");
+inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) {
+    __asm __volatile (
+        "VLD1.32 {d0-d3}, [%[in0]]!\n"
+        "VST1.32 {d0-d3}, [%[out]]!\n"
+        ASM_PREFETCH("[%[in0], #192]")
+        "VLD1.32 {d0-d3}, [%[in1]]!\n"
+        "VST1.32 {d0-d3}, [%[out]]!\n"
+        ASM_PREFETCH("[%[in1], #192]")
+        "VLD1.32 {d0-d3}, [%[in2]]!\n"
+        "VST1.32 {d0-d3}, [%[out]]!\n"
+        ASM_PREFETCH("[%[in2], #192]")
+        "VLD1.32 {d0-d3}, [%[in3]]!\n"
+        "VST1.32 {d0-d3}, [%[out]]\n"
+        ASM_PREFETCH("[%[in3], #192]")
+        "SUB %[out], %[out], #96\n"
+        : [in0] "+r" (in0),
+          [in1] "+r" (in1),
+          [in2] "+r" (in2),
+          [in3] "+r" (in3),
+          [out] "+r" (out)
+        :
+        : "q0", "q1", "memory"
+    );
 }
 
 template <>
 template <>
 inline void TransformImpl<16, 1, true, 2, 2>::Transform(
-    uint16_t *out, const uint16_t *const in, const int stride,
-    const int x0, const int xmax, const int k0, const int kmax)
-{
-    TransposeInterleaveCommon<16, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
+    uint16_t* out, const uint16_t* const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax
+) {
+    TransposeInterleaveCommon<16, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
 }
 
 #endif // __arm__
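
This file does no arithmetic of its own: the 8-way 32-bit transpose is redirected to the 16-way 16-bit one by doubling stride, x0 and xmax, which is valid because the transform only moves whole elements. A scalar equivalent of the specialised moveblock_1x1 (the VLD1/VST1 pair above); the name is illustrative, not library API:

    #include <cstdint>

    // Copy one block of 16 uint16_t (32 bytes) and advance the input
    // pointer, as the NEON moveblock_1x1 does in a single load/store.
    inline void moveblock16_ref(const uint16_t *&in0, uint16_t *out) {
        for (int i = 0; i < 16; i++) {
            out[i] = *in0++;
        }
    }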
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp
index 8d61f15cec..7e61f425d4 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp
@@ -30,17 +30,15 @@
 #include "../asmlib.hpp"
 #include "../utils.hpp"
 
-template <>
-template <typename T>
-void TransformImpl<4, 16, false, 1, 1>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
-{
-    uint8_t *outptr = (uint8_t *)out;
-    const uint8_t *inptr = (uint8_t *)in;
+template<>
+template<typename T>
+void TransformImpl<4, 16, false, 1, 1>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
+    uint8_t *outptr = (uint8_t *)out;
+    const uint8_t *inptr = (uint8_t *)in;
 
     uint8_t zerobuff[16];
 
-    for(int y = y0; y < ymax; y += 4)
-    {
+    for (int y=y0; y<ymax; y+=4) {
         const uint8_t *inptr0 = inptr + y * ldin + k0;
         const uint8_t *inptr1 = inptr0 + ldin;
         const uint8_t *inptr2 = inptr1 + ldin;
@@ -51,14 +49,11 @@ void TransformImpl<4, 16, false, 1, 1>::Transform(T *out, const T *in, int ldin,
         prefetch_2x(inptr2);
         prefetch_2x(inptr3);
 
-        int x = (kmax - k0);
-        for(; x > 15; x -= 16)
-        {
+        int x=(kmax-k0);
+        for (;x>15;x-=16) {
            /* Cope with ragged cases by copying from a buffer of zeroes instead */
-            if((y + 3) >= ymax)
-            {
-                switch((y + 3) - ymax)
-                {
+            if ((y + 3) >= ymax) {
+                switch ((y + 3) - ymax) {
                     /* Everything falls through in here */
                     case 2:
                         inptr1 = zerobuff;
@@ -73,23 +68,28 @@ void TransformImpl<4, 16, false, 1, 1>::Transform(T *out, const T *in, int ldin,
                 }
             }
 
-            __asm __volatile(
-                "LDR q0, [%[inptr0]], #16\n" ASM_PREFETCH("[%[inptr0], #176]") "LDR q1, [%[inptr1]], #16\n" ASM_PREFETCH("[%[inptr1], #176]")
-                "STP q0, q1, [%[outptr]], #32\n"
-                "LDR q0, [%[inptr2]], #16\n" ASM_PREFETCH("[%[inptr2], #176]") "LDR q1, [%[inptr3]], #16\n" ASM_PREFETCH("[%[inptr3], #176]") "STP q0, q1, [%[outptr]], #32\n"
-                : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
-                [outptr] "+r"(outptr)
+            __asm __volatile (
+                "LDR q0, [%[inptr0]], #16\n"
+                ASM_PREFETCH("[%[inptr0], #176]")
+                "LDR q1, [%[inptr1]], #16\n"
+                ASM_PREFETCH("[%[inptr1], #176]")
+                "STP q0, q1, [%[outptr]], #32\n"
+                "LDR q0, [%[inptr2]], #16\n"
+                ASM_PREFETCH("[%[inptr2], #176]")
+                "LDR q1, [%[inptr3]], #16\n"
+                ASM_PREFETCH("[%[inptr3], #176]")
+                "STP q0, q1, [%[outptr]], #32\n"
+                : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
+                  [outptr] "+r" (outptr)
                 :
-                : "v0", "v1");
+                : "v0", "v1"
+            );
         }
 
-        if(x > 0)
-        {
+        if (x>0) {
             /* Need to duplicate this here, in case we didn't run the main loop. */
-            if((y + 3) >= ymax)
-            {
-                switch((y + 3) - ymax)
-                {
+            if ((y + 3) >= ymax) {
+                switch ((y + 3) - ymax) {
                     /* Everything falls through in here */
                     case 2:
                         inptr1 = zerobuff;
@@ -105,16 +105,11 @@ void TransformImpl<4, 16, false, 1, 1>::Transform(T *out, const T *in, int ldin,
             }
 
             /* We have to write out 16 values, copy as many legal values as there are and pad with 0 */
-            auto f = [&outptr, x](const uint8_t *&p)
-            {
-                for(int i = 0; i < 16; i++)
-                {
-                    if(i < x)
-                    {
+            auto f = [&outptr, x](const uint8_t *&p) {
+                for (int i=0; i<16; i++) {
+                    if (i < x) {
                         *outptr++ = *p++;
-                    }
-                    else
-                    {
+                    } else {
                         *outptr++ = 0;
                     }
                 }
@@ -128,4 +123,4 @@ void TransformImpl<4, 16, false, 1, 1>::Transform(T *out, const T *in, int ldin,
     }
 }
 
-#endif // __aarch64__
\ No newline at end of file
+#endif // __aarch64__
\ No newline at end of file
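
The padding lambda at the end of this kernel is the part worth internalising: output blocks are always 16 bytes per row, and a short final block is zero-filled. A scalar sketch of one four-row step (hypothetical helper, not library code; rows past ymax would read from zerobuff exactly as in the kernel):

    #include <cstdint>

    // One step of the 4 x 16-byte block interleave: 'valid' (<= 16) bytes
    // of each of four rows are emitted, zero-padded out to 16 bytes.
    inline void block16_interleave4_ref(uint8_t *&out,
                                        const uint8_t *rows[4], int valid) {
        for (int r = 0; r < 4; r++) {
            for (int i = 0; i < 16; i++) {
                *out++ = (i < valid) ? rows[r][i] : 0;
            }
            rows[r] += valid;
        }
    }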
+ "LDR q6, [%[inptr6]], #16\n" + "ZIP1 v8.8h, v0.8h, v4.8h\n" // q8=A0E0A1E1A2E2A3E3 + "ZIP2 v16.8h, v0.8h, v4.8h\n" // q16=A4E4A5E5A6E6A7E7 + "ZIP1 v9.8h, v2.8h, v6.8h\n" // q9=C0G0C1G1C2G2C3G3 + "ZIP2 v17.8h, v2.8h, v6.8h\n" // q17=C4G4C5G5C6G6C7G7 + "LDR q1, [%[inptr1]], #16\n" // q1=B0B1B2B3B4B5B6B7 + "LDR q5, [%[inptr5]], #16\n" + "LDR q3, [%[inptr3]], #16\n" // q3=D0D1D2D3.... + "LDR q7, [%[inptr7]], #16\n" + "ZIP1 v10.8h, v1.8h, v5.8h\n" // q18=B0F0B1F1B2F2B3F3 + "ZIP2 v18.8h, v1.8h, v5.8h\n" // q18=B4F4B5F5B6F6B7F7 + "ZIP1 v11.8h, v3.8h, v7.8h\n" // q19=D0H0D1H1D2H2D3H3 + "ZIP2 v19.8h, v3.8h, v7.8h\n" // q19=D4H4D5H5D6H6D7H7 + + "ZIP1 v12.8h, v8.8h, v9.8h\n" // q20=A0C0E0G0A1C1E1G1 + "ZIP2 v20.8h, v8.8h, v9.8h\n" + "ZIP1 v13.8h, v10.8h, v11.8h\n" // q21=B0D0F0H0B1I1F1H1 + "ZIP2 v21.8h, v10.8h, v11.8h\n" + + "CBNZ %w[skippf], 2f\n" + ASM_PREFETCH("[%[inptr4], #112]") ASM_PREFETCH("[%[inptr5], #112]") ASM_PREFETCH("[%[inptr6], #112]") ASM_PREFETCH("[%[inptr7], #112]") "2:\n" - "ZIP1 v22.8h, v16.8h, v17.8h\n" - "ZIP2 v30.8h, v16.8h, v17.8h\n" - "ZIP1 v23.8h, v18.8h, v19.8h\n" - "ZIP2 v31.8h, v18.8h, v19.8h\n" - - "ZIP1 v14.8h, v12.8h, v13.8h\n" // q22=A0B0C0D0E0F0G0H0 - "ZIP2 v15.8h, v12.8h, v13.8h\n" // q23=A1B1C1D1E1F1G1H1 - "STP q14, q15, [%[outptr]], #32\n" // Write back first two elements - - "ZIP1 v0.8h, v20.8h, v21.8h\n" - "ZIP2 v1.8h, v20.8h, v21.8h\n" - "STP q0, q1, [%[outptr]], #32\n" // Write back next two elements - - "ZIP1 v2.8h, v22.8h, v23.8h\n" - "ZIP2 v3.8h, v22.8h, v23.8h\n" - "STP q2, q3, [%[outptr]], #32\n" // Write back next two elements - - "ZIP1 v4.8h, v30.8h, v31.8h\n" - "ZIP2 v5.8h, v30.8h, v31.8h\n" - "STP q4, q5, [%[outptr]], #32\n" // Write back last two elements - : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3), - [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr) - : [skippf] "r"(skippf) + "ZIP1 v22.8h, v16.8h, v17.8h\n" + "ZIP2 v30.8h, v16.8h, v17.8h\n" + "ZIP1 v23.8h, v18.8h, v19.8h\n" + "ZIP2 v31.8h, v18.8h, v19.8h\n" + + "ZIP1 v14.8h, v12.8h, v13.8h\n" // q22=A0B0C0D0E0F0G0H0 + "ZIP2 v15.8h, v12.8h, v13.8h\n" // q23=A1B1C1D1E1F1G1H1 + "STP q14, q15, [%[outptr]], #32\n" // Write back first two elements + + "ZIP1 v0.8h, v20.8h, v21.8h\n" + "ZIP2 v1.8h, v20.8h, v21.8h\n" + "STP q0, q1, [%[outptr]], #32\n" // Write back next two elements + + "ZIP1 v2.8h, v22.8h, v23.8h\n" + "ZIP2 v3.8h, v22.8h, v23.8h\n" + "STP q2, q3, [%[outptr]], #32\n" // Write back next two elements + + "ZIP1 v4.8h, v30.8h, v31.8h\n" + "ZIP2 v5.8h, v30.8h, v31.8h\n" + "STP q4, q5, [%[outptr]], #32\n" // Write back last two elements + : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), + [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr) + : [skippf] "r" (skippf) : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", - "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", - "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); } - for(; x > 0; x--) - { + for (;x>0;x--) { *outptr++ = *inptr0++; *outptr++ = *inptr1++; *outptr++ = *inptr2++; diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp 
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
index 47e4fa2608..83391cc59f 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
@@ -29,17 +29,15 @@
 #include "../asmlib.hpp"
 
-template <>
-template <typename T>
-inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
-{
-    uint32_t *outptr = (uint32_t *)out;
-    const uint32_t *inptr = (uint32_t *)in;
+template<>
+template<typename T>
+inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
+    uint32_t *outptr = (uint32_t *)out;
+    const uint32_t *inptr = (uint32_t *)in;
 
     uint32_t zerobuff[8];
 
-    for(int y = y0; y < ymax; y += 8)
-    {
+    for (int y=y0; y<ymax; y+=8) {
         const uint32_t *inptr0 = inptr + y * ldin + k0;
         const uint32_t *inptr1 = inptr0 + ldin;
         const uint32_t *inptr2 = inptr1 + ldin;
@@ -58,14 +56,11 @@ inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int
         prefetch_2x(inptr6);
         prefetch_2x(inptr7);
 
-        int x = (kmax - k0);
-        for(; x > 7; x -= 8)
-        {
+        int x=(kmax-k0);
+        for (;x>7;x-=8) {
             /* Cope with ragged cases by copying from a buffer of zeroes instead */
-            if((y + 7) >= ymax)
-            {
-                switch((y + 7) - ymax)
-                {
+            if ((y + 7) >= ymax) {
+                switch ((y + 7) - ymax) {
                     /* Everything falls through in here */
                     case 6:
                         inptr1 = zerobuff;
@@ -88,19 +83,20 @@ inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int
                 }
             }
 
-            __asm __volatile(
+            __asm __volatile (
                 // Load up 8 elements (2 vectors) from each of 8 sources.
                 "LDP q0, q1, [%[inptr0]], #32\n" // q0=A0A1A2A3
                 "LDP q2, q3, [%[inptr1]], #32\n" // q2=B0B1B2B3
                 "LDP q4, q5, [%[inptr2]], #32\n" // q4=C0C1C2C3
-                "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1
+                "ZIP1 v16.4s, v0.4s, v4.4s\n"    // q16=A0C0A1C1
                 ASM_PREFETCH("[%[inptr0], #128]")
                 "LDP q6, q7, [%[inptr3]], #32\n" // q6=D0D1D2D3
-                "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1
+                "ZIP1 v17.4s, v2.4s, v6.4s\n"    // q17=B0D0B1D1
                 "LDP q8, q9, [%[inptr4]], #32\n"
                 "LDP q10, q11, [%[inptr5]], #32\n"
                 "LDP q12, q13, [%[inptr6]], #32\n"
-                "ZIP1 v18.4s, v8.4s, v12.4s\n" ASM_PREFETCH("[%[inptr1], #128]")
+                "ZIP1 v18.4s, v8.4s, v12.4s\n"
+                ASM_PREFETCH("[%[inptr1], #128]")
                 "LDP q14, q15, [%[inptr7]], #32\n"
                 "ZIP1 v19.4s, v10.4s, v14.4s\n"
 
@@ -110,7 +106,8 @@ inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int
                 "ZIP2 v22.4s, v16.4s, v17.4s\n"
                 "ZIP2 v23.4s, v18.4s, v19.4s\n"
 
-                "ZIP2 v16.4s, v0.4s, v4.4s\n" ASM_PREFETCH("[%[inptr3], #128]")
+                "ZIP2 v16.4s, v0.4s, v4.4s\n"
+                ASM_PREFETCH("[%[inptr3], #128]")
                 "ZIP2 v17.4s, v2.4s, v6.4s\n"
                 "STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source
 
@@ -118,12 +115,14 @@ inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int
                 "ZIP2 v19.4s, v10.4s, v14.4s\n"
                 "STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source
 
-                "ZIP1 v20.4s, v16.4s, v17.4s\n" ASM_PREFETCH("[%[inptr4], #128]")
+                "ZIP1 v20.4s, v16.4s, v17.4s\n"
+                ASM_PREFETCH("[%[inptr4], #128]")
                 "ZIP1 v21.4s, v18.4s, v19.4s\n"
                 "ZIP2 v22.4s, v16.4s, v17.4s\n"
                 "ZIP2 v23.4s, v18.4s, v19.4s\n"
 
-                "ZIP1 v16.4s, v1.4s, v5.4s\n" ASM_PREFETCH("[%[inptr5], #128]")
+                "ZIP1 v16.4s, v1.4s, v5.4s\n"
+                ASM_PREFETCH("[%[inptr5], #128]")
                 "ZIP1 v17.4s, v3.4s, v7.4s\n"
                 "STP q20, q21, [%[outptr]], #32\n" // Third element
 
@@ -133,14 +132,16 @@ inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int
                 "ZIP1 v20.4s, v16.4s, v17.4s\n"
                 "ZIP1 v21.4s, v18.4s, v19.4s\n"
 
-                "ZIP2 v22.4s, v16.4s, v17.4s\n" ASM_PREFETCH("[%[inptr6], #128]")
+                "ZIP2 v22.4s, v16.4s, v17.4s\n"
+                ASM_PREFETCH("[%[inptr6], #128]")
                 "ZIP2 v23.4s, v18.4s, v19.4s\n"
 
                 "ZIP2 v16.4s, v1.4s, v5.4s\n"
                 "ZIP2 v17.4s, v3.4s, v7.4s\n"
                 "STP q20, q21, [%[outptr]], #32\n" // Fifth element
 
-                "ZIP2 v18.4s, v9.4s, v13.4s\n" ASM_PREFETCH("[%[inptr7], #128]")
+                "ZIP2 v18.4s, v9.4s, v13.4s\n"
+                ASM_PREFETCH("[%[inptr7], #128]")
                 "ZIP2 v19.4s, v11.4s, v15.4s\n"
                 "STP q22, q23, [%[outptr]], #32\n" // Sixth element
 
@@ -151,15 +152,15 @@ inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int
                 "ZIP2 v22.4s, v16.4s, v17.4s\n"
                 "ZIP2 v23.4s, v18.4s, v19.4s\n"
                 "STP q22, q23, [%[outptr]], #32\n" // Eighth element
-                : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
-                [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
+                : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
+                  [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
                 :
                 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
-                "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
+                  "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+            );
         }
 
-        for(; x > 0; x--)
-        {
+        for (;x>0;x--) {
             *outptr++ = *inptr0++;
             *outptr++ = *inptr1++;
             *outptr++ = *inptr2++;
@@ -172,4 +173,4 @@ inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int
     }
 }
 
-#endif // __aarch64__
+#endif // __aarch64__
- "LDR q0, [%[inptr0]], #16\n" - "LDR q2, [%[inptr1]], #16\n" - "FCVTL2 v1.4s, v0.8h\n" - "FCVTL v0.4s, v0.4h\n" - "LDR q4, [%[inptr2]], #16\n" // q4=C0C1C2C3 - "FCVTL2 v3.4s, v2.8h\n" - "FCVTL v2.4s, v2.4h\n" - "FCVTL2 v5.4s, v4.8h\n" - "FCVTL v4.4s, v4.4h\n" - "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1 + "LDR q0, [%[inptr0]], #16\n" + "LDR q2, [%[inptr1]], #16\n" + "FCVTL2 v1.4s, v0.8h\n" + "FCVTL v0.4s, v0.4h\n" + "LDR q4, [%[inptr2]], #16\n" // q4=C0C1C2C3 + "FCVTL2 v3.4s, v2.8h\n" + "FCVTL v2.4s, v2.4h\n" + "FCVTL2 v5.4s, v4.8h\n" + "FCVTL v4.4s, v4.4h\n" + "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1 ASM_PREFETCH("[%[inptr0], #128]") - "LDR q6, [%[inptr3]], #16\n" // q6=D0D1D2D3 - "FCVTL2 v7.4s, v6.8h\n" - "FCVTL v6.4s, v6.4h\n" - "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1 - "LDR q8, [%[inptr4]], #16\n" - "LDR q10, [%[inptr5]], #16\n" - "FCVTL2 v9.4s, v8.8h\n" - "FCVTL v8.4s, v8.4h\n" ASM_PREFETCH("[%[inptr1], #128]") - "LDR q12, [%[inptr6]], #16\n" - "FCVTL2 v11.4s, v10.8h\n" - "FCVTL v10.4s, v10.4h\n" - "FCVTL2 v13.4s, v12.8h\n" - "FCVTL v12.4s, v12.4h\n" - "ZIP1 v18.4s, v8.4s, v12.4s\n" - "LDR q14, [%[inptr7]], #16\n" - "FCVTL2 v15.4s, v14.8h\n" - "FCVTL v14.4s, v14.4h\n" - "ZIP1 v19.4s, v10.4s, v14.4s\n" + "LDR q6, [%[inptr3]], #16\n" // q6=D0D1D2D3 + "FCVTL2 v7.4s, v6.8h\n" + "FCVTL v6.4s, v6.4h\n" + "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1 + "LDR q8, [%[inptr4]], #16\n" + "LDR q10, [%[inptr5]], #16\n" + "FCVTL2 v9.4s, v8.8h\n" + "FCVTL v8.4s, v8.4h\n" + ASM_PREFETCH("[%[inptr1], #128]") + "LDR q12, [%[inptr6]], #16\n" + "FCVTL2 v11.4s, v10.8h\n" + "FCVTL v10.4s, v10.4h\n" + "FCVTL2 v13.4s, v12.8h\n" + "FCVTL v12.4s, v12.4h\n" + "ZIP1 v18.4s, v8.4s, v12.4s\n" + "LDR q14, [%[inptr7]], #16\n" + "FCVTL2 v15.4s, v14.8h\n" + "FCVTL v14.4s, v14.4h\n" + "ZIP1 v19.4s, v10.4s, v14.4s\n" ASM_PREFETCH("[%[inptr2], #128]") - "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0 - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" ASM_PREFETCH("[%[inptr3], #128]") - - "ZIP2 v16.4s, v0.4s, v4.4s\n" - "ZIP2 v17.4s, v2.4s, v6.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source - - "ZIP2 v18.4s, v8.4s, v12.4s\n" ASM_PREFETCH("[%[inptr4], #128]") - "ZIP2 v19.4s, v10.4s, v14.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" ASM_PREFETCH("[%[inptr5], #128]") - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP1 v16.4s, v1.4s, v5.4s\n" - "ZIP1 v17.4s, v3.4s, v7.4s\n" ASM_PREFETCH("[%[inptr6], #128]") - "STP q20, q21, [%[outptr]], #32\n" // Third element - - "ZIP1 v18.4s, v9.4s, v13.4s\n" - "ZIP1 v19.4s, v11.4s, v15.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Fourth element + "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0 + "ZIP1 v21.4s, v18.4s, v19.4s\n" + "ZIP2 v22.4s, v16.4s, v17.4s\n" + "ZIP2 v23.4s, v18.4s, v19.4s\n" + ASM_PREFETCH("[%[inptr3], #128]") + + "ZIP2 v16.4s, v0.4s, v4.4s\n" + "ZIP2 v17.4s, v2.4s, v6.4s\n" + "STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source + + "ZIP2 v18.4s, v8.4s, v12.4s\n" + ASM_PREFETCH("[%[inptr4], #128]") + "ZIP2 v19.4s, v10.4s, v14.4s\n" + "STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source + + "ZIP1 v20.4s, v16.4s, v17.4s\n" + "ZIP1 v21.4s, v18.4s, v19.4s\n" + ASM_PREFETCH("[%[inptr5], #128]") + "ZIP2 v22.4s, v16.4s, v17.4s\n" + "ZIP2 v23.4s, v18.4s, v19.4s\n" + + "ZIP1 
v16.4s, v1.4s, v5.4s\n" + "ZIP1 v17.4s, v3.4s, v7.4s\n" + ASM_PREFETCH("[%[inptr6], #128]") + "STP q20, q21, [%[outptr]], #32\n" // Third element + + "ZIP1 v18.4s, v9.4s, v13.4s\n" + "ZIP1 v19.4s, v11.4s, v15.4s\n" + "STP q22, q23, [%[outptr]], #32\n" // Fourth element ASM_PREFETCH("[%[inptr7], #128]") - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" + "ZIP1 v20.4s, v16.4s, v17.4s\n" + "ZIP1 v21.4s, v18.4s, v19.4s\n" + "ZIP2 v22.4s, v16.4s, v17.4s\n" + "ZIP2 v23.4s, v18.4s, v19.4s\n" - "ZIP2 v16.4s, v1.4s, v5.4s\n" - "ZIP2 v17.4s, v3.4s, v7.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Fifth element + "ZIP2 v16.4s, v1.4s, v5.4s\n" + "ZIP2 v17.4s, v3.4s, v7.4s\n" + "STP q20, q21, [%[outptr]], #32\n" // Fifth element - "ZIP2 v18.4s, v9.4s, v13.4s\n" - "ZIP2 v19.4s, v11.4s, v15.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Sixth element + "ZIP2 v18.4s, v9.4s, v13.4s\n" + "ZIP2 v19.4s, v11.4s, v15.4s\n" + "STP q22, q23, [%[outptr]], #32\n" // Sixth element - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Seventh element + "ZIP1 v20.4s, v16.4s, v17.4s\n" + "ZIP1 v21.4s, v18.4s, v19.4s\n" + "STP q20, q21, [%[outptr]], #32\n" // Seventh element - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Eighth element - : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3), - [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr) + "ZIP2 v22.4s, v16.4s, v17.4s\n" + "ZIP2 v23.4s, v18.4s, v19.4s\n" + "STP q22, q23, [%[outptr]], #32\n" // Eighth element + : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), + [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr) : : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", - "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); + "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + ); } - for(; x > 0; x--) - { + for (;x>0;x--) { *outptr++ = *inptr0++; *outptr++ = *inptr1++; *outptr++ = *inptr2++; diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp index fd6a253c6a..6e07064a0c 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp @@ -31,105 +31,115 @@ template <> template <typename T> inline void TransformImpl<6, 1, true, 4, 4>::Transform( - T *out, const T *const in, const int stride, - const int x0, const int xmax, const int k0, const int kmax) -{ - // Redirect to a 12 x uint16_t specialisation - TransformImpl<12, 1, true, 2, 2>::Transform( - reinterpret_cast<uint16_t *>(out), - reinterpret_cast<const uint16_t *const>(in), - stride * 2, x0 * 2, xmax * 2, k0, kmax); + T* out, const T* const in, const int stride, + const int x0, const int xmax, const int k0, const int kmax +) { + // Redirect to a 12 x uint16_t specialisation + TransformImpl<12, 1, true, 2, 2>::Transform( + reinterpret_cast<uint16_t *>(out), + reinterpret_cast<const uint16_t * const>(in), + stride*2, x0*2, xmax*2, k0, kmax + ); } // Generic 12x16-bit sized 
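
Beyond re-indenting, nothing changes in the conversion logic above: each 8-lane __fp16 vector is widened with an FCVTL/FCVTL2 pair before the same ZIP ladder as the integer kernels. Scalar picture (a sketch; assumes an __fp16-capable AArch64 compiler, and the helper name is invented):

    // FCVTL widens the low four __fp16 lanes to float, FCVTL2 the high four.
    inline void fcvtl_pair_ref(const __fp16 in8[8], float out8[8]) {
        for (int i = 0; i < 8; i++) {
            out8[i] = static_cast<float>(in8[i]);  // i<4 ~ FCVTL, i>=4 ~ FCVTL2
        }
    }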
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp
index fd6a253c6a..6e07064a0c 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp
@@ -31,105 +31,115 @@
 template <>
 template <typename T>
 inline void TransformImpl<6, 1, true, 4, 4>::Transform(
-    T *out, const T *const in, const int stride,
-    const int x0, const int xmax, const int k0, const int kmax)
-{
-    // Redirect to a 12 x uint16_t specialisation
-    TransformImpl<12, 1, true, 2, 2>::Transform(
-        reinterpret_cast<uint16_t *>(out),
-        reinterpret_cast<const uint16_t *const>(in),
-        stride * 2, x0 * 2, xmax * 2, k0, kmax);
+    T* out, const T* const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax
+) {
+    // Redirect to a 12 x uint16_t specialisation
+    TransformImpl<12, 1, true, 2, 2>::Transform(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t * const>(in),
+        stride*2, x0*2, xmax*2, k0, kmax
+    );
 }
 
 // Generic 12x16-bit sized specialisation
 template <>
 template <typename T>
 inline void TransformImpl<12, 1, true, 2, 2>::Transform(
-    T *out, const T *const in, const int stride,
-    const int x0, const int xmax, const int k0, const int kmax)
-{
-    // Redirect to a uint16_t specialisation
-    Transform(
-        reinterpret_cast<uint16_t *>(out),
-        reinterpret_cast<const uint16_t *const>(in),
-        stride, x0, xmax, k0, kmax);
+    T* out, const T* const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax
+) {
+    // Redirect to a uint16_t specialisation
+    Transform(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t * const>(in),
+        stride, x0, xmax, k0, kmax
+    );
 }
 
 // Specialised 12 x uint16_t version
 template <>
-inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out)
-{
-    __asm volatile(
-        "LDR q0, [%[in0]]\n"
-        "STR q0, [%[out]]\n"
-        "LDR d1, [%[in0], #0x10]\n"
-        "STR d1, [%[out], #0x10]\n"
-        "ADD %x[in0], %x[in0], #0x18\n" ASM_PREFETCH("[%[in0], #192]")
-        : [in0] "+r"(in0),
-        [out] "+r"(out)
-        :
-        : "v0", "v1", "memory");
+inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) {
+    __asm volatile (
+        "LDR q0, [%[in0]]\n"
+        "STR q0, [%[out]]\n"
+        "LDR d1, [%[in0], #0x10]\n"
+        "STR d1, [%[out], #0x10]\n"
+        "ADD %x[in0], %x[in0], #0x18\n"
+        ASM_PREFETCH("[%[in0], #192]")
+        : [in0] "+r" (in0),
+          [out] "+r" (out)
+        :
+        : "v0", "v1", "memory"
+    );
 }
 
 template <>
-inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out)
-{
-    __asm volatile(
-        "LDR q0, [%[in0]]\n"
-        "LDR d1, [%[in0], #0x10]\n"
-        "ADD %x[in0], %x[in0], #0x18\n" ASM_PREFETCH("[%[in0], #192]")
+inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out) {
+    __asm volatile (
+        "LDR q0, [%[in0]]\n"
+        "LDR d1, [%[in0], #0x10]\n"
+        "ADD %x[in0], %x[in0], #0x18\n"
+        ASM_PREFETCH("[%[in0], #192]")
 
-        "LDR x21, [%[in1]]\n"
-        "LDR q2, [%[in1], #0x08]\n"
-        "INS v1.d[1], x21\n"
-        "ADD %x[in1], %x[in1], #0x18\n"
-        "STP q0, q1, [%[out]]\n"
-        "STR q2, [%x[out], #0x20]\n" ASM_PREFETCH("[%[in1], #192]")
-        : [in0] "+r"(in0),
-        [in1] "+r"(in1),
-        [out] "+r"(out)
-        :
-        : "x21", "v0", "v1", "v2", "memory");
+        "LDR x21, [%[in1]]\n"
+        "LDR q2, [%[in1], #0x08]\n"
+        "INS v1.d[1], x21\n"
+        "ADD %x[in1], %x[in1], #0x18\n"
+        "STP q0, q1, [%[out]]\n"
+        "STR q2, [%x[out], #0x20]\n"
+        ASM_PREFETCH("[%[in1], #192]")
+        : [in0] "+r" (in0),
+          [in1] "+r" (in1),
+          [out] "+r" (out)
+        :
+        : "x21", "v0", "v1", "v2", "memory"
+    );
 }
 
 template <>
-inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out)
-{
-    __asm __volatile(
-        "LDR q0, [%x[in0]], #0x10\n"
-        "STR q0, [%x[out]]\n"
-        "LDR d1, [%x[in0]], #0x08\n" ASM_PREFETCH("[%[in0], #192]")
-        "STR d1, [%x[out], #0x10]\n"
+inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) {
+    __asm __volatile (
+        "LDR q0, [%x[in0]], #0x10\n"
+        "STR q0, [%x[out]]\n"
+        "LDR d1, [%x[in0]], #0x08\n"
+        ASM_PREFETCH("[%[in0], #192]")
+        "STR d1, [%x[out], #0x10]\n"
 
-        "LDR q0, [%x[in1]], #0x10\n"
-        "STR q0, [%x[out], #0x18]\n"
-        "LDR d1, [%x[in1]], #0x08\n" ASM_PREFETCH("[%[in1], #192]")
-        "STR d1, [%x[out], #0x28]\n"
+        "LDR q0, [%x[in1]], #0x10\n"
+        "STR q0, [%x[out], #0x18]\n"
+        "LDR d1, [%x[in1]], #0x08\n"
+        ASM_PREFETCH("[%[in1], #192]")
+        "STR d1, [%x[out], #0x28]\n"
 
-        "LDR q0, [%x[in2]], #0x10\n"
-        "STR q0, [%x[out], #0x30]\n"
-        "LDR d1, [%x[in2]], #0x08\n" ASM_PREFETCH("[%[in2], #192]")
-        "STR d1, [%x[out], #0x40]\n"
+        "LDR q0, [%x[in2]], #0x10\n"
+        "STR q0, [%x[out], #0x30]\n"
+        "LDR d1, [%x[in2]], #0x08\n"
+        ASM_PREFETCH("[%[in2], #192]")
+        "STR d1, [%x[out], #0x40]\n"
 
-        "LDR q0, [%x[in3]], #0x10\n"
-        "STR q0, [%x[out], #0x48]\n"
-        "LDR d1, [%x[in3]], #0x08\n" ASM_PREFETCH("[%[in3], #192]") "STR d1, [%x[out], #0x58]\n"
-        : [in0] "+r"(in0),
-        [in1] "+r"(in1),
-        [in2] "+r"(in2),
-        [in3] "+r"(in3),
-        [out] "+r"(out)
-        :
-        : "v0", "v1", "memory");
+        "LDR q0, [%x[in3]], #0x10\n"
+        "STR q0, [%x[out], #0x48]\n"
+        "LDR d1, [%x[in3]], #0x08\n"
+        ASM_PREFETCH("[%[in3], #192]")
+        "STR d1, [%x[out], #0x58]\n"
+        : [in0] "+r" (in0),
+          [in1] "+r" (in1),
+          [in2] "+r" (in2),
+          [in3] "+r" (in3),
+          [out] "+r" (out)
+        :
+        : "v0", "v1", "memory"
+    );
 }
 
 template <>
 template <>
 inline void TransformImpl<12, 1, true, 2, 2>::Transform(
-    uint16_t *out, const uint16_t *const in, const int stride,
-    const int x0, const int xmax, const int k0, const int kmax)
-{
-    TransposeInterleaveCommon<12, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
+    uint16_t* out, const uint16_t* const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax
+) {
+    TransposeInterleaveCommon<12, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
 }
 
 #endif // __aarch64__
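
The LDR q / LDR d split above follows from the block size: 12 uint16_t are 24 bytes, one 16-byte q-register plus one 8-byte d-register. Scalar equivalent of the specialised moveblock_1x1 (illustrative name, not library API):

    #include <cstdint>

    // Copy one block of 12 uint16_t (24 bytes) and advance the input.
    inline void moveblock12_ref(const uint16_t *&in0, uint16_t *out) {
        for (int i = 0; i < 12; i++) {
            out[i] = *in0++;
        }
    }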
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
index b79f32fb8b..2f90c18ebd 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
@@ -28,86 +28,93 @@
 #include "transpose_interleave_common.hpp"
 
 template <>
-inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x1(const __fp16 *&in0, float *out)
-{
-    __asm __volatile(
+inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x1(const __fp16 *&in0, float *out) {
+    __asm __volatile (
         "LDR q0, [%[in0]], #16\n"
-        "FCVTL2 v1.4s, v0.8h\n"
-        "FCVTL v0.4s, v0.4h\n"
-        "STP q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
+        "FCVTL2 v1.4s, v0.8h\n"
+        "FCVTL v0.4s, v0.4h\n"
+        "STP q0, q1, [%[out]]\n"
+        ASM_PREFETCH("[%[in0], #192]")
         "LDR d2, [%[in0]], #8\n"
-        "FCVTL v2.4s, v2.4h\n"
+        "FCVTL v2.4s, v2.4h\n"
         "STR q2, [%[out], #32]\n"
-        : [in0] "+r"(in0), [out] "+r"(out)
-        :
-        : "v0", "v1", "v2", "memory");
+        : [in0] "+r" (in0), [out] "+r" (out)
+        :
+        : "v0", "v1", "v2", "memory"
+    );
 }
 
 template <>
-inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x2(const __fp16 *&in0, const __fp16 *&in1, float *out)
-{
-    __asm __volatile(
+inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x2(const __fp16 *&in0, const __fp16 *&in1, float *out) {
+    __asm __volatile (
         "LDR q0, [%[in0]], #16\n"
-        "FCVTL2 v1.4s, v0.8h\n"
-        "FCVTL v0.4s, v0.4h\n"
-        "STP q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
+        "FCVTL2 v1.4s, v0.8h\n"
+        "FCVTL v0.4s, v0.4h\n"
+        "STP q0, q1, [%[out]]\n"
+        ASM_PREFETCH("[%[in0], #192]")
         "LDR d2, [%[in0]], #8\n"
-        "FCVTL v2.4s, v2.4h\n"
-        "LDR q3, [%[in1]], #16\n"
-        "FCVTL2 v4.4s, v3.8h\n"
-        "FCVTL v3.4s, v3.4h\n"
-        "STP q2, q3, [%[out], #32]\n" ASM_PREFETCH("[%[in1], #192]")
-        "LDR d5, [%[in1]], #16\n"
-        "FCVTL v5.4s, v5.4h\n"
+        "FCVTL v2.4s, v2.4h\n"
+        "LDR q3, [%[in1]], #16\n"
+        "FCVTL2 v4.4s, v3.8h\n"
+        "FCVTL v3.4s, v3.4h\n"
+        "STP q2, q3, [%[out], #32]\n"
+        ASM_PREFETCH("[%[in1], #192]")
+        "LDR d5, [%[in1]], #16\n"
+        "FCVTL v5.4s, v5.4h\n"
         "STP q4, q5, [%[out], #64]\n"
-        : [in0] "+r"(in0), [in1] "+r"(in1), [out] "+r"(out)
-        :
-        : "v0", "v1", "v2", "v3", "v4", "v5", "memory");
+        : [in0] "+r" (in0), [in1] "+r" (in1), [out] "+r" (out)
+        :
+        : "v0", "v1", "v2", "v3", "v4", "v5", "memory"
+    );
 }
 
 template <>
-inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x4(const __fp16 *&in0, const __fp16 *&in1, const __fp16 *&in2, const __fp16 *&in3, float *out)
-{
-    __asm __volatile(
+inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x4(const __fp16 *&in0, const __fp16 *&in1, const __fp16 *&in2, const __fp16 *&in3, float *out) {
+    __asm __volatile (
         "LDR q0, [%[in0]], #16\n"
-        "FCVTL2 v1.4s, v0.8h\n"
-        "FCVTL v0.4s, v0.4h\n"
+        "FCVTL2 v1.4s, v0.8h\n"
+        "FCVTL v0.4s, v0.4h\n"
         "STP q0, q1, [%[out]]\n"
-        "LDR d2, [%[in0]], #8\n" ASM_PREFETCH("[%[in0], #192]")
-        "FCVTL v2.4s, v2.4h\n"
-        "LDR q3, [%[in1]], #16\n"
-        "FCVTL2 v4.4s, v3.8h\n"
-        "FCVTL v3.4s, v3.4h\n"
+        "LDR d2, [%[in0]], #8\n"
+        ASM_PREFETCH("[%[in0], #192]")
+        "FCVTL v2.4s, v2.4h\n"
+        "LDR q3, [%[in1]], #16\n"
+        "FCVTL2 v4.4s, v3.8h\n"
+        "FCVTL v3.4s, v3.4h\n"
        "STP q2, q3, [%[out], #32]\n"
-        "LDR d5, [%[in1]], #8\n"
-        "FCVTL v5.4s, v5.4h\n" ASM_PREFETCH("[%[in1], #192]")
+        "LDR d5, [%[in1]], #8\n"
+        "FCVTL v5.4s, v5.4h\n"
+        ASM_PREFETCH("[%[in1], #192]")
         "STP q4, q5, [%[out], #64]\n"
-        "LDR q6, [%[in2]], #16\n"
-        "FCVTL2 v7.4s, v6.8h\n"
-        "FCVTL v6.4s, v6.4h\n"
+        "LDR q6, [%[in2]], #16\n"
+        "FCVTL2 v7.4s, v6.8h\n"
+        "FCVTL v6.4s, v6.4h\n"
         "STP q6, q7, [%[out], #96]\n"
-        "LDR d8, [%[in2]], #8\n"
-        "FCVTL v8.4s, v8.4h\n" ASM_PREFETCH("[%[in2], #192]")
-        "LDR q9, [%[in3]], #16\n"
-        "FCVTL2 v10.4s, v9.8h\n"
-        "FCVTL v9.4s, v9.4h\n"
+        "LDR d8, [%[in2]], #8\n"
+        "FCVTL v8.4s, v8.4h\n"
+        ASM_PREFETCH("[%[in2], #192]")
+        "LDR q9, [%[in3]], #16\n"
+        "FCVTL2 v10.4s, v9.8h\n"
+        "FCVTL v9.4s, v9.4h\n"
         "STP q8, q9, [%[out], #128]\n"
-        "LDR d11, [%[in3]], #8\n"
-        "FCVTL v11.4s, v11.4h\n"
-        "STP q10, q11, [%[out], #160]\n" ASM_PREFETCH("[%[in3], #192]")
+        "LDR d11, [%[in3]], #8\n"
+        "FCVTL v11.4s, v11.4h\n"
+        "STP q10, q11, [%[out], #160]\n"
+        ASM_PREFETCH("[%[in3], #192]")
 
-        : [in0] "+r"(in0), [in1] "+r"(in1), [in2] "+r"(in2), [in3] "+r"(in3), [out] "+r"(out)
-        :
-        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory");
+        : [in0] "+r" (in0), [in1] "+r" (in1), [in2] "+r" (in2), [in3] "+r" (in3), [out] "+r" (out)
+        :
+        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory"
+    );
 }
 
 template <>
 template <>
 inline void TransformImpl<12, 1, true, 4, 2>::Transform(
-    float *out, const __fp16 *const in, const int stride,
-    const int x0, const int xmax, const int k0, const int kmax)
-{
-    TransposeInterleaveCommon<12, __fp16, float>::Transform(out, in, stride, x0, xmax, k0, kmax);
+    float* out, const __fp16* const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax
+) {
+    TransposeInterleaveCommon<12, __fp16, float>::Transform(out, in, stride, x0, xmax, k0, kmax);
 }
 
 #endif // __aarch64__ && __ARM_FP16_ARGS
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
index 5434599f03..b6565baa23 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
@@ -31,91 +31,100 @@
 template <>
 template <typename T>
 inline void TransformImpl<12, 1, true, 4, 4>::Transform(
-    T *out, const T *const in, const int stride,
-    const int x0, const int xmax, const int k0, const int kmax)
-{
-    // Redirect to a 24 x uint16_t specialisation
-    TransformImpl<24, 1, true, 2, 2>::Transform(
-        reinterpret_cast<uint16_t *>(out),
-        reinterpret_cast<const uint16_t *const>(in),
-        stride * 2, x0 * 2, xmax * 2, k0, kmax);
+    T* out, const T* const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax
+) {
+    // Redirect to a 24 x uint16_t specialisation
+    TransformImpl<24, 1, true, 2, 2>::Transform(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t * const>(in),
+        stride*2, x0*2, xmax*2, k0, kmax
+    );
 }
 
 // Generic 24x16-bit sized specialisation
 template <>
 template <typename T>
 inline void TransformImpl<24, 1, true, 2, 2>::Transform(
-    T *out, const T *const in, const int stride,
-    const int x0, const int xmax, const int k0, const int kmax)
-{
-    // Redirect to a uint16_t specialisation
-    Transform(
-        reinterpret_cast<uint16_t *>(out),
-        reinterpret_cast<const uint16_t *const>(in),
-        stride, x0, xmax, k0, kmax);
+    T* out, const T* const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax
+) {
+    // Redirect to a uint16_t specialisation
+    Transform(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t * const>(in),
+        stride, x0, xmax, k0, kmax
+    );
 }
 
 // Specialised 24 x uint16_t version
 template <>
-inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out)
-{
-    __asm __volatile(
+inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) {
+    __asm __volatile (
         "LDP q0, q1, [%[in0]], #32\n"
-        "STP q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
+        "STP q0, q1, [%[out]]\n"
+        ASM_PREFETCH("[%[in0], #192]")
         "LDR q2, [%[in0]], #16\n"
         "STR q2, [%[out], #32]\n"
-        : [in0] "+r"(in0), [out] "+r"(out)
-        :
-        : "v0", "v1", "v2", "memory");
+        : [in0] "+r" (in0), [out] "+r" (out)
+        :
+        : "v0", "v1", "v2", "memory"
+    );
 }
 
 template <>
-inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out)
-{
-    __asm __volatile(
+inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1,uint16_t *out) {
+    __asm __volatile (
         "LDP q0, q1, [%[in0]], #32\n"
-        "STP q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
+        "STP q0, q1, [%[out]]\n"
+        ASM_PREFETCH("[%[in0], #192]")
         "LDR q2, [%[in0]], #16\n"
-        "LDP q3, q4, [%[in1]], #32\n"
-        "STP q2, q3, [%[out], #32]\n" ASM_PREFETCH("[%[in1], #192]")
-        "LDR q5, [%[in1]], #16\n"
+        "LDP q3, q4, [%[in1]], #32\n"
+        "STP q2, q3, [%[out], #32]\n"
+        ASM_PREFETCH("[%[in1], #192]")
+        "LDR q5, [%[in1]], #16\n"
         "STP q4, q5, [%[out], #64]\n"
-        : [in0] "+r"(in0), [in1] "+r"(in1), [out] "+r"(out)
-        :
-        : "v0", "v1", "v2", "v3", "v4", "v5", "memory");
+        : [in0] "+r" (in0), [in1] "+r" (in1), [out] "+r" (out)
+        :
+        : "v0", "v1", "v2", "v3", "v4", "v5", "memory"
+    );
}
 
 template <>
-inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out)
-{
-    __asm __volatile(
+inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) {
+    __asm __volatile (
         "LDP q0, q1, [%[in0]], #32\n"
         "STP q0, q1, [%[out]]\n"
-        "LDR q2, [%[in0]], #16\n" ASM_PREFETCH("[%[in0], #192]")
-        "LDP q3, q4, [%[in1]], #32\n"
+        "LDR q2, [%[in0]], #16\n"
+        ASM_PREFETCH("[%[in0], #192]")
+        "LDP q3, q4, [%[in1]], #32\n"
         "STP q2, q3, [%[out], #32]\n"
-        "LDR q5, [%[in1]], #16\n" ASM_PREFETCH("[%[in1], #192]")
+        "LDR q5, [%[in1]], #16\n"
+        ASM_PREFETCH("[%[in1], #192]")
         "STP q4, q5, [%[out], #64]\n"
-        "LDP q6, q7, [%[in2]], #32\n"
+        "LDP q6, q7, [%[in2]], #32\n"
         "STP q6, q7, [%[out], #96]\n"
-        "LDR q8, [%[in2]], #16\n" ASM_PREFETCH("[%[in2], #192]")
-        "LDP q9, q10, [%[in3]], #32\n"
+        "LDR q8, [%[in2]], #16\n"
+        ASM_PREFETCH("[%[in2], #192]")
+        "LDP q9, q10, [%[in3]], #32\n"
         "STP q8, q9, [%[out], #128]\n"
-        "LDR q11, [%[in3]], #16\n"
-        "STP q10, q11, [%[out], #160]\n" ASM_PREFETCH("[%[in3], #192]")
+        "LDR q11, [%[in3]], #16\n"
+        "STP q10, q11, [%[out], #160]\n"
+        ASM_PREFETCH("[%[in3], #192]")
 
-        : [in0] "+r"(in0), [in1] "+r"(in1), [in2] "+r"(in2), [in3] "+r"(in3), [out] "+r"(out)
-        :
-        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory");
+        : [in0] "+r" (in0), [in1] "+r" (in1), [in2] "+r" (in2), [in3] "+r" (in3), [out] "+r" (out)
+        :
+        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory"
+    );
 }
 
 template <>
 template <>
 inline void TransformImpl<24, 1, true, 2, 2>::Transform(
-    uint16_t *out, const uint16_t *const in, const int stride,
-    const int x0, const int xmax, const int k0, const int kmax)
-{
-    TransposeInterleaveCommon<24, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
+    uint16_t* out, const uint16_t* const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax
+) {
+    TransposeInterleaveCommon<24, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
 }
 
-#endif // __arch64__
+#endif // __arch64__
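
Here each block is 24 uint16_t (48 bytes), hence the LDP-plus-LDR pattern per row; moveblock_1x2 writes row 0's 24 elements and then row 1's, 96 bytes per call, with loads and stores interleaved to hide latency. A scalar sketch (names invented):

    #include <cstdint>

    // Two-row block move for the 24-way transpose: row0's 24 elements,
    // then row1's, written back-to-back at out[0..47].
    inline void moveblock24_1x2_ref(const uint16_t *&in0, const uint16_t *&in1,
                                    uint16_t *out) {
        for (int i = 0; i < 24; i++) out[i]      = *in0++;
        for (int i = 0; i < 24; i++) out[24 + i] = *in1++;
    }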
i = 0; i < IntBy; i++) { + *out++ = static_cast<TOut>(*in0++); + } + for (unsigned int i = 0; i < IntBy; i++) { + *out++ = static_cast<TOut>(*in1++); } + } - static inline void moveblock_1x4(const TIn *&in0, const TIn *&in1, const TIn *&in2, const TIn *&in3, TOut *out) - { - for(unsigned int i = 0; i < IntBy; i++) - { - *out++ = static_cast<TOut>(*in0++); - } - for(unsigned int i = 0; i < IntBy; i++) - { - *out++ = static_cast<TOut>(*in1++); - } - for(unsigned int i = 0; i < IntBy; i++) - { - *out++ = static_cast<TOut>(*in2++); - } - for(unsigned int i = 0; i < IntBy; i++) - { - *out++ = static_cast<TOut>(*in3++); + static inline void moveblock_1x4(const TIn *&in0, const TIn *&in1, const TIn *&in2, const TIn *&in3, TOut *out) { + for (unsigned int i = 0; i < IntBy; i++) { + *out++ = static_cast<TOut>(*in0++); + } + for (unsigned int i = 0; i < IntBy; i++) { + *out++ = static_cast<TOut>(*in1++); + } + for (unsigned int i = 0; i < IntBy; i++) { + *out++ = static_cast<TOut>(*in2++); + } + for (unsigned int i = 0; i < IntBy; i++) { + *out++ = static_cast<TOut>(*in3++); + } + } + + static inline void Transform(TOut *out, const TIn *in, const int stride, const int x0, const int xmax, const int k0, const int kmax) { + const auto ldin = stride; + + TOut *outarray = out; + const TIn *inarray = in; + TOut *outptr_base = outarray; + const TIn *inptr_base = inarray + x0 + (k0 * ldin); + int ldout = (kmax - k0) * IntBy; + + int k=(kmax-k0); + for ( ; k>3; k-=4) { + TOut *outptr = outptr_base; + const TIn *inptr = inptr_base; + const TIn *inptr1 = inptr + ldin; + const TIn *inptr2 = inptr1 + ldin; + const TIn *inptr3 = inptr2 + ldin; + + prefetch_3x(inptr); + prefetch_3x(inptr1); + prefetch_3x(inptr2); + prefetch_3x(inptr3); + + outptr_base += IntBy * 4; + inptr_base += ldin * 4; + + for (int x = (xmax-x0) / IntBy; x > 0 ; x--) { + moveblock_1x4(inptr, inptr1, inptr2, inptr3, outptr); + outptr += ldout; } } - static inline void Transform(TOut *out, const TIn *in, const int stride, const int x0, const int xmax, const int k0, const int kmax) - { - const auto ldin = stride; - - TOut *outarray = out; - const TIn *inarray = in; - TOut *outptr_base = outarray; - const TIn *inptr_base = inarray + x0 + (k0 * ldin); - int ldout = (kmax - k0) * IntBy; - - int k = (kmax - k0); - for(; k > 3; k -= 4) - { - TOut *outptr = outptr_base; - const TIn *inptr = inptr_base; - const TIn *inptr1 = inptr + ldin; - const TIn *inptr2 = inptr1 + ldin; - const TIn *inptr3 = inptr2 + ldin; - - prefetch_3x(inptr); - prefetch_3x(inptr1); - prefetch_3x(inptr2); - prefetch_3x(inptr3); - - outptr_base += IntBy * 4; - inptr_base += ldin * 4; - - for(int x = (xmax - x0) / IntBy; x > 0; x--) - { - moveblock_1x4(inptr, inptr1, inptr2, inptr3, outptr); - outptr += ldout; + if (k) { + TOut *outptr = outptr_base; + const TIn *inptr = inptr_base; + const TIn *inptr1 = inptr + ldin; + const TIn *inptr2 = inptr1 + ldin; + + prefetch_3x(inptr); + prefetch_3x(inptr1); + prefetch_3x(inptr2); + + for (int x = (xmax-x0) / IntBy; x > 0 ; x--) { + switch(k) { + case 3: + moveblock_1x2(inptr, inptr1, outptr); + moveblock_1x1(inptr2, outptr + IntBy * 2); + break; + + case 2: + moveblock_1x2(inptr, inptr1, outptr); + break; + + case 1: + moveblock_1x1(inptr, outptr); + break; + + default: + UNREACHABLE("Impossible."); } - } - if(k) - { - TOut *outptr = outptr_base; - const TIn *inptr = inptr_base; - const TIn *inptr1 = inptr + ldin; - const TIn *inptr2 = inptr1 + ldin; - - prefetch_3x(inptr); - prefetch_3x(inptr1); - prefetch_3x(inptr2); - - for(int x = 
(xmax - x0) / IntBy; x > 0; x--) - { - switch(k) - { - case 3: - moveblock_1x2(inptr, inptr1, outptr); - moveblock_1x1(inptr2, outptr + IntBy * 2); - break; - - case 2: - moveblock_1x2(inptr, inptr1, outptr); - break; - - case 1: - moveblock_1x1(inptr, outptr); - break; - - default: - UNREACHABLE("Impossible."); - } - - outptr += ldout; - } + outptr += ldout; } + } + + // Cope with ragged X cases + const unsigned int overflow = (xmax - x0) % IntBy; + if (overflow) { + const TIn *inptr_base = inarray + (xmax - overflow) + (k0 * ldin); + TOut *outptr = outarray + ((xmax - x0) / IntBy) * ldout; + + for (int k=(kmax-k0); k>0; k--) { + const TIn *inptr = inptr_base; + inptr_base += ldin; - // Cope with ragged X cases - const unsigned int overflow = (xmax - x0) % IntBy; - if(overflow) - { - const TIn *inptr_base = inarray + (xmax - overflow) + (k0 * ldin); - TOut *outptr = outarray + ((xmax - x0) / IntBy) * ldout; - - for(int k = (kmax - k0); k > 0; k--) - { - const TIn *inptr = inptr_base; - inptr_base += ldin; - - for(unsigned int x = 0; x < IntBy; x++) - { - TOut val = (x < overflow) ? static_cast<TOut>(*inptr++) : static_cast<TOut>(0); - *outptr++ = val; - } + for (unsigned int x=0; x < IntBy; x++) { + TOut val = (x < overflow) ? static_cast<TOut>(*inptr++) : static_cast<TOut>(0); + *outptr++ = val; } } } +} }; |
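
Taken together, the generic Transform above produces the layout out[b*ldout + (k-k0)*IntBy + i] = in[k*stride + x0 + b*IntBy + i], with the ragged final x-block zero-padded. A scalar model of that contract, useful for checking the specialised kernels against; this is a sketch derived from the code above, not the library entry point:

    #include <cstdint>

    // Reference for TransposeInterleaveCommon<IntBy, TIn, TOut>::Transform:
    // b indexes IntBy-wide x-blocks, ldout is the output row pitch per block.
    template <unsigned int IntBy, typename TIn, typename TOut>
    void transpose_interleave_ref(TOut *out, const TIn *in, int stride,
                                  int x0, int xmax, int k0, int kmax) {
        const int W       = static_cast<int>(IntBy);
        const int ldout   = (kmax - k0) * W;
        const int nblocks = (xmax - x0 + W - 1) / W;  // ragged tail included
        for (int b = 0; b < nblocks; b++) {
            for (int k = k0; k < kmax; k++) {
                for (int i = 0; i < W; i++) {
                    const int x = x0 + b * W + i;
                    out[b * ldout + (k - k0) * W + i] =
                        (x < xmax) ? static_cast<TOut>(in[k * stride + x])
                                   : static_cast<TOut>(0);  // zero-pad overflow
                }
            }
        }
    }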