diff options
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/asmlib.hpp')
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/asmlib.hpp | 85 |
1 files changed, 43 insertions, 42 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/asmlib.hpp b/src/core/NEON/kernels/arm_gemm/asmlib.hpp index b3fcb33bfb..38f51ae72c 100644 --- a/src/core/NEON/kernels/arm_gemm/asmlib.hpp +++ b/src/core/NEON/kernels/arm_gemm/asmlib.hpp @@ -31,21 +31,21 @@ // used by the workaround. // "Correct" version -#define ASM_PREFETCH(address) "PRFM PLDL1KEEP, " address "\n" -#define ASM_PREFETCHL2(address) "PRFM PLDL2KEEP, " address "\n" -#define ASM_PREFETCHW(address) "PRFM PSTL1KEEP, " address "\n" +#define ASM_PREFETCH(address) "PRFM PLDL1KEEP, " address "\n" +#define ASM_PREFETCHL2(address) "PRFM PLDL2KEEP, " address "\n" +#define ASM_PREFETCHW(address) "PRFM PSTL1KEEP, " address "\n" #define ASM_PREFETCHWL2(address) "PRFM PSTL2KEEP, " address "\n" // Lee's uarchsim hack -//#define ASM_PREFETCH(address) "LDNP x20, x21, " address "\n" +//#define ASM_PREFETCH(address) "LDNP x20, x21, " address "\n" // No preload at all //#define ASM_PREFETCH(address) "" #else // "Correct" versions for AArch32 -#define ASM_PREFETCH(address) "PLD " address "\n" -#define ASM_PREFETCHW(address) "PLDW " address "\n" +#define ASM_PREFETCH(address) "PLD " address "\n" +#define ASM_PREFETCHW(address) "PLDW " address "\n" #endif @@ -53,76 +53,77 @@ * Do some prefetches. */ template <typename T> -static inline void prefetch_6x(const T *pfp) -{ - __asm __volatile( +static inline void prefetch_6x(const T *pfp) { + __asm __volatile ( ASM_PREFETCH("[%[pfp]]") ASM_PREFETCH("[%[pfp], #64]") ASM_PREFETCH("[%[pfp], #128]") ASM_PREFETCH("[%[pfp], #192]") ASM_PREFETCH("[%[pfp], #256]") ASM_PREFETCH("[%[pfp], #320]") - : - : [pfp] "r"(pfp) - : "memory"); + : + : [pfp] "r" (pfp) + : "memory" + ); } template <typename T> -static inline void prefetch_5x(const T *pfp) -{ - __asm __volatile( +static inline void prefetch_5x(const T *pfp) { + __asm __volatile ( ASM_PREFETCH("[%[pfp]]") ASM_PREFETCH("[%[pfp], #64]") ASM_PREFETCH("[%[pfp], #128]") ASM_PREFETCH("[%[pfp], #192]") ASM_PREFETCH("[%[pfp], #256]") - : - : [pfp] "r"(pfp) - : "memory"); + : + : [pfp] "r" (pfp) + : "memory" + ); } template <typename T> -static inline void prefetch_4x(const T *pfp) -{ - __asm __volatile( +static inline void prefetch_4x(const T *pfp) { + __asm __volatile ( ASM_PREFETCH("[%[pfp]]") ASM_PREFETCH("[%[pfp], #64]") ASM_PREFETCH("[%[pfp], #128]") ASM_PREFETCH("[%[pfp], #192]") - : - : [pfp] "r"(pfp) - : "memory"); + : + : [pfp] "r" (pfp) + : "memory" + ); } template <typename T> -static inline void prefetch_3x(const T *pfp) -{ - __asm __volatile( +static inline void prefetch_3x(const T *pfp) { + __asm __volatile ( ASM_PREFETCH("[%[pfp]]") ASM_PREFETCH("[%[pfp], #64]") ASM_PREFETCH("[%[pfp], #128]") - : - : [pfp] "r"(pfp) - : "memory"); + : + : [pfp] "r" (pfp) + : "memory" + ); } template <typename T> -static inline void prefetch_2x(const T *pfp) -{ - __asm __volatile( +static inline void prefetch_2x(const T *pfp) { + __asm __volatile ( ASM_PREFETCH("[%[pfp]]") ASM_PREFETCH("[%[pfp], #64]") - : - : [pfp] "r"(pfp) - : "memory"); + : + : [pfp] "r" (pfp) + : "memory" + ); } template <typename T> -static inline void prefetch_1x(const T *pfp) -{ - __asm __volatile( +static inline void prefetch_1x(const T *pfp) { + __asm __volatile ( ASM_PREFETCH("[%[pfp]]") - : - : [pfp] "r"(pfp) - : "memory"); + : + : [pfp] "r" (pfp) + : "memory" + ); } + |