aboutsummaryrefslogtreecommitdiff
path: root/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp
diff options
context:
space:
mode:
Diffstat (limited to 'arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp')
-rw-r--r--arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp21
1 files changed, 10 insertions, 11 deletions
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp
index 6317424598..bd5125afab 100644
--- a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,13 +25,12 @@
#ifdef __aarch64__
-#include "../asmlib.hpp"
-
#include <arm_neon.h>
+#include "asmlib.hpp"
template<>
template<typename T>
-void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
+inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
uint32_t *outptr = (uint32_t *)out;
const uint32_t *inptr = (uint32_t *)in;
@@ -92,47 +91,46 @@ void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin,
"ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1
"LDP q8, q9, [%[inptr4]], #32\n"
"LDP q10, q11, [%[inptr5]], #32\n"
- ASM_PREFETCH("[%[inptr1], #128]")
"LDP q12, q13, [%[inptr6]], #32\n"
"ZIP1 v18.4s, v8.4s, v12.4s\n"
+ ASM_PREFETCH("[%[inptr1], #128]")
"LDP q14, q15, [%[inptr7]], #32\n"
"ZIP1 v19.4s, v10.4s, v14.4s\n"
- ASM_PREFETCH("[%[inptr2], #128]")
"ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0
+ ASM_PREFETCH("[%[inptr2], #128]")
"ZIP1 v21.4s, v18.4s, v19.4s\n"
"ZIP2 v22.4s, v16.4s, v17.4s\n"
"ZIP2 v23.4s, v18.4s, v19.4s\n"
- ASM_PREFETCH("[%[inptr3], #128]")
"ZIP2 v16.4s, v0.4s, v4.4s\n"
+ ASM_PREFETCH("[%[inptr3], #128]")
"ZIP2 v17.4s, v2.4s, v6.4s\n"
"STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source
"ZIP2 v18.4s, v8.4s, v12.4s\n"
- ASM_PREFETCH("[%[inptr4], #128]")
"ZIP2 v19.4s, v10.4s, v14.4s\n"
"STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source
"ZIP1 v20.4s, v16.4s, v17.4s\n"
+ ASM_PREFETCH("[%[inptr4], #128]")
"ZIP1 v21.4s, v18.4s, v19.4s\n"
- ASM_PREFETCH("[%[inptr5], #128]")
"ZIP2 v22.4s, v16.4s, v17.4s\n"
"ZIP2 v23.4s, v18.4s, v19.4s\n"
"ZIP1 v16.4s, v1.4s, v5.4s\n"
+ ASM_PREFETCH("[%[inptr5], #128]")
"ZIP1 v17.4s, v3.4s, v7.4s\n"
- ASM_PREFETCH("[%[inptr6], #128]")
"STP q20, q21, [%[outptr]], #32\n" // Third element
"ZIP1 v18.4s, v9.4s, v13.4s\n"
"ZIP1 v19.4s, v11.4s, v15.4s\n"
"STP q22, q23, [%[outptr]], #32\n" // Fourth element
- ASM_PREFETCH("[%[inptr7], #128]")
"ZIP1 v20.4s, v16.4s, v17.4s\n"
"ZIP1 v21.4s, v18.4s, v19.4s\n"
"ZIP2 v22.4s, v16.4s, v17.4s\n"
+ ASM_PREFETCH("[%[inptr6], #128]")
"ZIP2 v23.4s, v18.4s, v19.4s\n"
"ZIP2 v16.4s, v1.4s, v5.4s\n"
@@ -140,6 +138,7 @@ void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin,
"STP q20, q21, [%[outptr]], #32\n" // Fifth element
"ZIP2 v18.4s, v9.4s, v13.4s\n"
+ ASM_PREFETCH("[%[inptr7], #128]")
"ZIP2 v19.4s, v11.4s, v15.4s\n"
"STP q22, q23, [%[outptr]], #32\n" // Sixth element