about | summary | refs | log | tree | commit | diff
path: root/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp
diff options
context:
space:
mode:
author: Pablo Tello <pablo.tello@arm.com> 2018-01-25 15:05:13 +0000
committer: Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:45:00 +0000
commit: 11c3b33215225f5baf34c045a68982e0058af74a (patch)
tree: 91c50c74f9abe6576d95b0044112479548b08226 /arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp
parent: fc9bda3d99cab79ffeab8ae329e0648a257472c2 (diff)
download: ComputeLibrary-11c3b33215225f5baf34c045a68982e0058af74a.tar.gz
COMPMID-861: updated RSH Gemm's transforms.
Change-Id: Ic1f215c1ae85ad5c516cc3600447a50bba77ebc1
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/117668
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp')
-rw-r--r-- arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp 21
1 files changed, 10 insertions, 11 deletions
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp
index 6317424598..bd5125afab 100644
--- a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,13 +25,12 @@
#ifdef __aarch64__
-#include "../asmlib.hpp"
-
#include <arm_neon.h>
+#include "asmlib.hpp"
template<>
template<typename T>
-void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
+inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
uint32_t *outptr = (uint32_t *)out;
const uint32_t *inptr = (uint32_t *)in;
@@ -92,47 +91,46 @@ void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin,
"ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1
"LDP q8, q9, [%[inptr4]], #32\n"
"LDP q10, q11, [%[inptr5]], #32\n"
- ASM_PREFETCH("[%[inptr1], #128]")
"LDP q12, q13, [%[inptr6]], #32\n"
"ZIP1 v18.4s, v8.4s, v12.4s\n"
+ ASM_PREFETCH("[%[inptr1], #128]")
"LDP q14, q15, [%[inptr7]], #32\n"
"ZIP1 v19.4s, v10.4s, v14.4s\n"
- ASM_PREFETCH("[%[inptr2], #128]")
"ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0
+ ASM_PREFETCH("[%[inptr2], #128]")
"ZIP1 v21.4s, v18.4s, v19.4s\n"
"ZIP2 v22.4s, v16.4s, v17.4s\n"
"ZIP2 v23.4s, v18.4s, v19.4s\n"
- ASM_PREFETCH("[%[inptr3], #128]")
"ZIP2 v16.4s, v0.4s, v4.4s\n"
+ ASM_PREFETCH("[%[inptr3], #128]")
"ZIP2 v17.4s, v2.4s, v6.4s\n"
"STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source
"ZIP2 v18.4s, v8.4s, v12.4s\n"
- ASM_PREFETCH("[%[inptr4], #128]")
"ZIP2 v19.4s, v10.4s, v14.4s\n"
"STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source
"ZIP1 v20.4s, v16.4s, v17.4s\n"
+ ASM_PREFETCH("[%[inptr4], #128]")
"ZIP1 v21.4s, v18.4s, v19.4s\n"
- ASM_PREFETCH("[%[inptr5], #128]")
"ZIP2 v22.4s, v16.4s, v17.4s\n"
"ZIP2 v23.4s, v18.4s, v19.4s\n"
"ZIP1 v16.4s, v1.4s, v5.4s\n"
+ ASM_PREFETCH("[%[inptr5], #128]")
"ZIP1 v17.4s, v3.4s, v7.4s\n"
- ASM_PREFETCH("[%[inptr6], #128]")
"STP q20, q21, [%[outptr]], #32\n" // Third element
"ZIP1 v18.4s, v9.4s, v13.4s\n"
"ZIP1 v19.4s, v11.4s, v15.4s\n"
"STP q22, q23, [%[outptr]], #32\n" // Fourth element
- ASM_PREFETCH("[%[inptr7], #128]")
"ZIP1 v20.4s, v16.4s, v17.4s\n"
"ZIP1 v21.4s, v18.4s, v19.4s\n"
"ZIP2 v22.4s, v16.4s, v17.4s\n"
+ ASM_PREFETCH("[%[inptr6], #128]")
"ZIP2 v23.4s, v18.4s, v19.4s\n"
"ZIP2 v16.4s, v1.4s, v5.4s\n"
@@ -140,6 +138,7 @@ void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin,
"STP q20, q21, [%[outptr]], #32\n" // Fifth element
"ZIP2 v18.4s, v9.4s, v13.4s\n"
+ ASM_PREFETCH("[%[inptr7], #128]")
"ZIP2 v19.4s, v11.4s, v15.4s\n"
"STP q22, q23, [%[outptr]], #32\n" // Sixth element